aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile21
-rw-r--r--kernel/acct.c46
-rw-r--r--kernel/async.c8
-rw-r--r--kernel/audit.c7
-rw-r--r--kernel/auditsc.c22
-rw-r--r--kernel/capability.c2
-rw-r--r--kernel/cgroup.c467
-rw-r--r--kernel/cgroup_freezer.c88
-rw-r--r--kernel/compat.c1
-rw-r--r--kernel/cpu.c82
-rw-r--r--kernel/cpu_pm.c233
-rw-r--r--kernel/cpuset.c133
-rw-r--r--kernel/crash_dump.c13
-rw-r--r--kernel/cred.c20
-rw-r--r--kernel/debug/gdbstub.c12
-rw-r--r--kernel/debug/kdb/kdb_debugger.c1
-rw-r--r--kernel/debug/kdb/kdb_main.c2
-rw-r--r--kernel/debug/kdb/kdb_support.c2
-rw-r--r--kernel/dma.c2
-rw-r--r--kernel/events/Makefile2
-rw-r--r--kernel/events/callchain.c191
-rw-r--r--kernel/events/core.c419
-rw-r--r--kernel/events/internal.h42
-rw-r--r--kernel/events/ring_buffer.c5
-rw-r--r--kernel/exit.c45
-rw-r--r--kernel/fork.c48
-rw-r--r--kernel/freezer.c205
-rw-r--r--kernel/futex.c40
-rw-r--r--kernel/groups.c2
-rw-r--r--kernel/hrtimer.c8
-rw-r--r--kernel/hung_task.c16
-rw-r--r--kernel/irq/chip.c64
-rw-r--r--kernel/irq/generic-chip.c5
-rw-r--r--kernel/irq/internals.h21
-rw-r--r--kernel/irq/irqdesc.c34
-rw-r--r--kernel/irq/irqdomain.c27
-rw-r--r--kernel/irq/manage.c225
-rw-r--r--kernel/irq/pm.c48
-rw-r--r--kernel/irq/settings.h7
-rw-r--r--kernel/irq/spurious.c8
-rw-r--r--kernel/irq_work.c95
-rw-r--r--kernel/itimer.c15
-rw-r--r--kernel/jump_label.c87
-rw-r--r--kernel/kexec.c70
-rw-r--r--kernel/kfifo.c2
-rw-r--r--kernel/kmod.c31
-rw-r--r--kernel/kprobes.c38
-rw-r--r--kernel/ksysfs.c3
-rw-r--r--kernel/kthread.c29
-rw-r--r--kernel/latencytop.c16
-rw-r--r--kernel/lockdep.c331
-rw-r--r--kernel/lockdep_proc.c2
-rw-r--r--kernel/module.c261
-rw-r--r--kernel/mutex-debug.c2
-rw-r--r--kernel/mutex.c2
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/nsproxy.c2
-rw-r--r--kernel/padata.c2
-rw-r--r--kernel/panic.c45
-rw-r--r--kernel/params.c61
-rw-r--r--kernel/pid.c10
-rw-r--r--kernel/pid_namespace.c31
-rw-r--r--kernel/posix-cpu-timers.c146
-rw-r--r--kernel/posix-timers.c2
-rw-r--r--kernel/power/Kconfig8
-rw-r--r--kernel/power/Makefile4
-rw-r--r--kernel/power/console.c4
-rw-r--r--kernel/power/hibernate.c169
-rw-r--r--kernel/power/main.c116
-rw-r--r--kernel/power/power.h6
-rw-r--r--kernel/power/process.c105
-rw-r--r--kernel/power/qos.c (renamed from kernel/pm_qos_params.c)278
-rw-r--r--kernel/power/snapshot.c24
-rw-r--r--kernel/power/suspend.c21
-rw-r--r--kernel/power/swap.c819
-rw-r--r--kernel/power/user.c185
-rw-r--r--kernel/printk.c76
-rw-r--r--kernel/profile.c2
-rw-r--r--kernel/ptrace.c15
-rw-r--r--kernel/range.c2
-rw-r--r--kernel/rcu.h92
-rw-r--r--kernel/rcupdate.c40
-rw-r--r--kernel/rcutiny.c265
-rw-r--r--kernel/rcutiny_plugin.h164
-rw-r--r--kernel/rcutorture.c302
-rw-r--r--kernel/rcutree.c572
-rw-r--r--kernel/rcutree.h43
-rw-r--r--kernel/rcutree_plugin.h419
-rw-r--r--kernel/rcutree_trace.c25
-rw-r--r--kernel/relay.c4
-rw-r--r--kernel/res_counter.c3
-rw-r--r--kernel/resource.c2
-rw-r--r--kernel/rtmutex-debug.c80
-rw-r--r--kernel/rtmutex-tester.c39
-rw-r--r--kernel/rtmutex.c2
-rw-r--r--kernel/rwsem.c2
-rw-r--r--kernel/sched/Makefile20
-rw-r--r--kernel/sched/auto_group.c (renamed from kernel/sched_autogroup.c)33
-rw-r--r--kernel/sched/auto_group.h (renamed from kernel/sched_autogroup.h)26
-rw-r--r--kernel/sched/clock.c (renamed from kernel/sched_clock.c)2
-rw-r--r--kernel/sched/core.c (renamed from kernel/sched.c)2541
-rw-r--r--kernel/sched/cpupri.c (renamed from kernel/sched_cpupri.c)93
-rw-r--r--kernel/sched/cpupri.h (renamed from kernel/sched_cpupri.h)7
-rw-r--r--kernel/sched/debug.c (renamed from kernel/sched_debug.c)6
-rw-r--r--kernel/sched/fair.c (renamed from kernel/sched_fair.c)1862
-rw-r--r--kernel/sched/features.h (renamed from kernel/sched_features.h)34
-rw-r--r--kernel/sched/idle_task.c (renamed from kernel/sched_idletask.c)4
-rw-r--r--kernel/sched/rt.c (renamed from kernel/sched_rt.c)320
-rw-r--r--kernel/sched/sched.h1166
-rw-r--r--kernel/sched/stats.c111
-rw-r--r--kernel/sched/stats.h (renamed from kernel/sched_stats.h)121
-rw-r--r--kernel/sched/stop_task.c (renamed from kernel/sched_stoptask.c)6
-rw-r--r--kernel/semaphore.c30
-rw-r--r--kernel/signal.c108
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/softirq.c6
-rw-r--r--kernel/spinlock.c2
-rw-r--r--kernel/srcu.c2
-rw-r--r--kernel/stacktrace.c2
-rw-r--r--kernel/stop_machine.c24
-rw-r--r--kernel/sys.c133
-rw-r--r--kernel/sys_ni.c4
-rw-r--r--kernel/sysctl.c36
-rw-r--r--kernel/sysctl_binary.c4
-rw-r--r--kernel/time.c4
-rw-r--r--kernel/time/Kconfig4
-rw-r--r--kernel/time/alarmtimer.c268
-rw-r--r--kernel/time/clockevents.c130
-rw-r--r--kernel/time/clocksource.c149
-rw-r--r--kernel/time/posix-clock.c1
-rw-r--r--kernel/time/tick-broadcast.c6
-rw-r--r--kernel/time/tick-common.c4
-rw-r--r--kernel/time/tick-internal.h2
-rw-r--r--kernel/time/tick-oneshot.c77
-rw-r--r--kernel/time/tick-sched.c166
-rw-r--r--kernel/time/timekeeping.c94
-rw-r--r--kernel/time/timer_stats.c6
-rw-r--r--kernel/timer.c66
-rw-r--r--kernel/trace/Makefile5
-rw-r--r--kernel/trace/blktrace.c3
-rw-r--r--kernel/trace/ftrace.c14
-rw-r--r--kernel/trace/ring_buffer.c122
-rw-r--r--kernel/trace/rpm-traces.c20
-rw-r--r--kernel/trace/trace.c299
-rw-r--r--kernel/trace/trace.h20
-rw-r--r--kernel/trace/trace_clock.c12
-rw-r--r--kernel/trace/trace_events.c1
-rw-r--r--kernel/trace/trace_events_filter.c834
-rw-r--r--kernel/trace/trace_events_filter_test.h50
-rw-r--r--kernel/trace/trace_irqsoff.c23
-rw-r--r--kernel/trace/trace_kprobe.c58
-rw-r--r--kernel/trace/trace_output.c16
-rw-r--r--kernel/trace/trace_printk.c19
-rw-r--r--kernel/trace/trace_sched_wakeup.c13
-rw-r--r--kernel/trace/trace_syscalls.c1
-rw-r--r--kernel/tracepoint.c169
-rw-r--r--kernel/tsacct.c2
-rw-r--r--kernel/up.c2
-rw-r--r--kernel/user-return-notifier.c2
-rw-r--r--kernel/user.c2
-rw-r--r--kernel/user_namespace.c2
-rw-r--r--kernel/utsname.c2
-rw-r--r--kernel/utsname_sysctl.c25
-rw-r--r--kernel/wait.c6
-rw-r--r--kernel/watchdog.c11
-rw-r--r--kernel/workqueue.c34
166 files changed, 11304 insertions, 5942 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index eca595e2fd52..f70396e5a24b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,16 +2,15 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ 5obj-y = fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o cred.o \
13 async.o range.o 13 async.o range.o groups.o
14obj-y += groups.o
15 14
16ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
17# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
@@ -20,10 +19,11 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg
20CFLAGS_REMOVE_mutex-debug.o = -pg 19CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 20CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 21CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_irq_work.o = -pg 22CFLAGS_REMOVE_irq_work.o = -pg
25endif 23endif
26 24
25obj-y += sched/
26
27obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
28obj-$(CONFIG_PROFILING) += profile.o 28obj-$(CONFIG_PROFILING) += profile.o
29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
@@ -99,8 +99,8 @@ obj-$(CONFIG_TRACING) += trace/
99obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
100obj-$(CONFIG_RING_BUFFER) += trace/ 100obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_TRACEPOINTS) += trace/ 101obj-$(CONFIG_TRACEPOINTS) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o
103obj-$(CONFIG_IRQ_WORK) += irq_work.o 102obj-$(CONFIG_IRQ_WORK) += irq_work.o
103obj-$(CONFIG_CPU_PM) += cpu_pm.o
104 104
105obj-$(CONFIG_PERF_EVENTS) += events/ 105obj-$(CONFIG_PERF_EVENTS) += events/
106 106
@@ -109,15 +109,6 @@ obj-$(CONFIG_PADATA) += padata.o
109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
110obj-$(CONFIG_JUMP_LABEL) += jump_label.o 110obj-$(CONFIG_JUMP_LABEL) += jump_label.o
111 111
112ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
113# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
114# needed for x86 only. Why this used to be enabled for all architectures is beyond
115# me. I suspect most platforms don't need this, but until we know that for sure
116# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
117# to get a correct value for the wait-channel (WCHAN in ps). --davidm
118CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
119endif
120
121$(obj)/configs.o: $(obj)/config_data.h 112$(obj)/configs.o: $(obj)/config_data.h
122 113
123# config_data.h contains the same information as ikconfig.h but gzipped. 114# config_data.h contains the same information as ikconfig.h but gzipped.
diff --git a/kernel/acct.c b/kernel/acct.c
index fa7eb3de2ddc..02e6167a53b0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -84,11 +84,10 @@ static void do_acct_process(struct bsd_acct_struct *acct,
84 * the cache line to have the data after getting the lock. 84 * the cache line to have the data after getting the lock.
85 */ 85 */
86struct bsd_acct_struct { 86struct bsd_acct_struct {
87 volatile int active; 87 int active;
88 volatile int needcheck; 88 unsigned long needcheck;
89 struct file *file; 89 struct file *file;
90 struct pid_namespace *ns; 90 struct pid_namespace *ns;
91 struct timer_list timer;
92 struct list_head list; 91 struct list_head list;
93}; 92};
94 93
@@ -96,15 +95,6 @@ static DEFINE_SPINLOCK(acct_lock);
96static LIST_HEAD(acct_list); 95static LIST_HEAD(acct_list);
97 96
98/* 97/*
99 * Called whenever the timer says to check the free space.
100 */
101static void acct_timeout(unsigned long x)
102{
103 struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
104 acct->needcheck = 1;
105}
106
107/*
108 * Check the amount of free space and suspend/resume accordingly. 98 * Check the amount of free space and suspend/resume accordingly.
109 */ 99 */
110static int check_free_space(struct bsd_acct_struct *acct, struct file *file) 100static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
@@ -112,12 +102,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
112 struct kstatfs sbuf; 102 struct kstatfs sbuf;
113 int res; 103 int res;
114 int act; 104 int act;
115 sector_t resume; 105 u64 resume;
116 sector_t suspend; 106 u64 suspend;
117 107
118 spin_lock(&acct_lock); 108 spin_lock(&acct_lock);
119 res = acct->active; 109 res = acct->active;
120 if (!file || !acct->needcheck) 110 if (!file || time_is_before_jiffies(acct->needcheck))
121 goto out; 111 goto out;
122 spin_unlock(&acct_lock); 112 spin_unlock(&acct_lock);
123 113
@@ -127,8 +117,8 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
127 suspend = sbuf.f_blocks * SUSPEND; 117 suspend = sbuf.f_blocks * SUSPEND;
128 resume = sbuf.f_blocks * RESUME; 118 resume = sbuf.f_blocks * RESUME;
129 119
130 sector_div(suspend, 100); 120 do_div(suspend, 100);
131 sector_div(resume, 100); 121 do_div(resume, 100);
132 122
133 if (sbuf.f_bavail <= suspend) 123 if (sbuf.f_bavail <= suspend)
134 act = -1; 124 act = -1;
@@ -160,10 +150,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
160 } 150 }
161 } 151 }
162 152
163 del_timer(&acct->timer); 153 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
164 acct->needcheck = 0;
165 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
166 add_timer(&acct->timer);
167 res = acct->active; 154 res = acct->active;
168out: 155out:
169 spin_unlock(&acct_lock); 156 spin_unlock(&acct_lock);
@@ -185,9 +172,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
185 if (acct->file) { 172 if (acct->file) {
186 old_acct = acct->file; 173 old_acct = acct->file;
187 old_ns = acct->ns; 174 old_ns = acct->ns;
188 del_timer(&acct->timer);
189 acct->active = 0; 175 acct->active = 0;
190 acct->needcheck = 0;
191 acct->file = NULL; 176 acct->file = NULL;
192 acct->ns = NULL; 177 acct->ns = NULL;
193 list_del(&acct->list); 178 list_del(&acct->list);
@@ -195,13 +180,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
195 if (file) { 180 if (file) {
196 acct->file = file; 181 acct->file = file;
197 acct->ns = ns; 182 acct->ns = ns;
198 acct->needcheck = 0; 183 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
199 acct->active = 1; 184 acct->active = 1;
200 list_add(&acct->list, &acct_list); 185 list_add(&acct->list, &acct_list);
201 /* It's been deleted if it was used before so this is safe */
202 setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
203 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
204 add_timer(&acct->timer);
205 } 186 }
206 if (old_acct) { 187 if (old_acct) {
207 mnt_unpin(old_acct->f_path.mnt); 188 mnt_unpin(old_acct->f_path.mnt);
@@ -334,7 +315,7 @@ void acct_auto_close(struct super_block *sb)
334 spin_lock(&acct_lock); 315 spin_lock(&acct_lock);
335restart: 316restart:
336 list_for_each_entry(acct, &acct_list, list) 317 list_for_each_entry(acct, &acct_list, list)
337 if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { 318 if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
338 acct_file_reopen(acct, NULL, NULL); 319 acct_file_reopen(acct, NULL, NULL);
339 goto restart; 320 goto restart;
340 } 321 }
@@ -348,7 +329,6 @@ void acct_exit_ns(struct pid_namespace *ns)
348 if (acct == NULL) 329 if (acct == NULL)
349 return; 330 return;
350 331
351 del_timer_sync(&acct->timer);
352 spin_lock(&acct_lock); 332 spin_lock(&acct_lock);
353 if (acct->file != NULL) 333 if (acct->file != NULL)
354 acct_file_reopen(acct, NULL, NULL); 334 acct_file_reopen(acct, NULL, NULL);
@@ -498,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
498 * Fill the accounting struct with the needed info as recorded 478 * Fill the accounting struct with the needed info as recorded
499 * by the different kernel functions. 479 * by the different kernel functions.
500 */ 480 */
501 memset((caddr_t)&ac, 0, sizeof(acct_t)); 481 memset(&ac, 0, sizeof(acct_t));
502 482
503 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; 483 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
504 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); 484 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
@@ -613,8 +593,8 @@ void acct_collect(long exitcode, int group_dead)
613 pacct->ac_flag |= ACORE; 593 pacct->ac_flag |= ACORE;
614 if (current->flags & PF_SIGNALED) 594 if (current->flags & PF_SIGNALED)
615 pacct->ac_flag |= AXSIG; 595 pacct->ac_flag |= AXSIG;
616 pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); 596 pacct->ac_utime += current->utime;
617 pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); 597 pacct->ac_stime += current->stime;
618 pacct->ac_minflt += current->min_flt; 598 pacct->ac_minflt += current->min_flt;
619 pacct->ac_majflt += current->maj_flt; 599 pacct->ac_majflt += current->maj_flt;
620 spin_unlock_irq(&current->sighand->siglock); 600 spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index d5fe7af0de2e..bd0c168a3bbe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -51,7 +51,7 @@ asynchronous and synchronous parts of the kernel.
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/atomic.h> 52#include <linux/atomic.h>
53#include <linux/ktime.h> 53#include <linux/ktime.h>
54#include <linux/module.h> 54#include <linux/export.h>
55#include <linux/wait.h> 55#include <linux/wait.h>
56#include <linux/sched.h> 56#include <linux/sched.h>
57#include <linux/slab.h> 57#include <linux/slab.h>
@@ -78,8 +78,6 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done);
78 78
79static atomic_t entry_count; 79static atomic_t entry_count;
80 80
81extern int initcall_debug;
82
83 81
84/* 82/*
85 * MUST be called with the lock held! 83 * MUST be called with the lock held!
@@ -120,7 +118,7 @@ static void async_run_entry_fn(struct work_struct *work)
120 struct async_entry *entry = 118 struct async_entry *entry =
121 container_of(work, struct async_entry, work); 119 container_of(work, struct async_entry, work);
122 unsigned long flags; 120 unsigned long flags;
123 ktime_t calltime, delta, rettime; 121 ktime_t uninitialized_var(calltime), delta, rettime;
124 122
125 /* 1) move self to the running queue */ 123 /* 1) move self to the running queue */
126 spin_lock_irqsave(&async_lock, flags); 124 spin_lock_irqsave(&async_lock, flags);
@@ -269,7 +267,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
269void async_synchronize_cookie_domain(async_cookie_t cookie, 267void async_synchronize_cookie_domain(async_cookie_t cookie,
270 struct list_head *running) 268 struct list_head *running)
271{ 269{
272 ktime_t starttime, delta, endtime; 270 ktime_t uninitialized_var(starttime), delta, endtime;
273 271
274 if (initcall_debug && system_state == SYSTEM_BOOTING) { 272 if (initcall_debug && system_state == SYSTEM_BOOTING) {
275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 273 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
diff --git a/kernel/audit.c b/kernel/audit.c
index f3ba55fa0b70..57e3f5107937 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -45,7 +45,7 @@
45#include <asm/types.h> 45#include <asm/types.h>
46#include <linux/atomic.h> 46#include <linux/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/export.h>
49#include <linux/slab.h> 49#include <linux/slab.h>
50#include <linux/err.h> 50#include <linux/err.h>
51#include <linux/kthread.h> 51#include <linux/kthread.h>
@@ -1260,12 +1260,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
1260 avail = audit_expand(ab, 1260 avail = audit_expand(ab,
1261 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); 1261 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
1262 if (!avail) 1262 if (!avail)
1263 goto out; 1263 goto out_va_end;
1264 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); 1264 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
1265 } 1265 }
1266 va_end(args2);
1267 if (len > 0) 1266 if (len > 0)
1268 skb_put(skb, len); 1267 skb_put(skb, len);
1268out_va_end:
1269 va_end(args2);
1269out: 1270out:
1270 return; 1271 return;
1271} 1272}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ce4b054acee5..e7fe2b0d29b3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -48,7 +48,7 @@
48#include <linux/fs.h> 48#include <linux/fs.h>
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
51#include <linux/module.h> 51#include <linux/export.h>
52#include <linux/slab.h> 52#include <linux/slab.h>
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/socket.h> 54#include <linux/socket.h>
@@ -210,12 +210,12 @@ struct audit_context {
210 struct { 210 struct {
211 uid_t uid; 211 uid_t uid;
212 gid_t gid; 212 gid_t gid;
213 mode_t mode; 213 umode_t mode;
214 u32 osid; 214 u32 osid;
215 int has_perm; 215 int has_perm;
216 uid_t perm_uid; 216 uid_t perm_uid;
217 gid_t perm_gid; 217 gid_t perm_gid;
218 mode_t perm_mode; 218 umode_t perm_mode;
219 unsigned long qbytes; 219 unsigned long qbytes;
220 } ipc; 220 } ipc;
221 struct { 221 struct {
@@ -234,7 +234,7 @@ struct audit_context {
234 } mq_sendrecv; 234 } mq_sendrecv;
235 struct { 235 struct {
236 int oflag; 236 int oflag;
237 mode_t mode; 237 umode_t mode;
238 struct mq_attr attr; 238 struct mq_attr attr;
239 } mq_open; 239 } mq_open;
240 struct { 240 struct {
@@ -308,7 +308,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
308static int audit_match_filetype(struct audit_context *ctx, int which) 308static int audit_match_filetype(struct audit_context *ctx, int which)
309{ 309{
310 unsigned index = which & ~S_IFMT; 310 unsigned index = which & ~S_IFMT;
311 mode_t mode = which & S_IFMT; 311 umode_t mode = which & S_IFMT;
312 312
313 if (unlikely(!ctx)) 313 if (unlikely(!ctx))
314 return 0; 314 return 0;
@@ -1249,7 +1249,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1249 case AUDIT_IPC: { 1249 case AUDIT_IPC: {
1250 u32 osid = context->ipc.osid; 1250 u32 osid = context->ipc.osid;
1251 1251
1252 audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", 1252 audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
1253 context->ipc.uid, context->ipc.gid, context->ipc.mode); 1253 context->ipc.uid, context->ipc.gid, context->ipc.mode);
1254 if (osid) { 1254 if (osid) {
1255 char *ctx = NULL; 1255 char *ctx = NULL;
@@ -1267,7 +1267,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1267 ab = audit_log_start(context, GFP_KERNEL, 1267 ab = audit_log_start(context, GFP_KERNEL,
1268 AUDIT_IPC_SET_PERM); 1268 AUDIT_IPC_SET_PERM);
1269 audit_log_format(ab, 1269 audit_log_format(ab,
1270 "qbytes=%lx ouid=%u ogid=%u mode=%#o", 1270 "qbytes=%lx ouid=%u ogid=%u mode=%#ho",
1271 context->ipc.qbytes, 1271 context->ipc.qbytes,
1272 context->ipc.perm_uid, 1272 context->ipc.perm_uid,
1273 context->ipc.perm_gid, 1273 context->ipc.perm_gid,
@@ -1278,7 +1278,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1278 break; } 1278 break; }
1279 case AUDIT_MQ_OPEN: { 1279 case AUDIT_MQ_OPEN: {
1280 audit_log_format(ab, 1280 audit_log_format(ab,
1281 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " 1281 "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
1282 "mq_msgsize=%ld mq_curmsgs=%ld", 1282 "mq_msgsize=%ld mq_curmsgs=%ld",
1283 context->mq_open.oflag, context->mq_open.mode, 1283 context->mq_open.oflag, context->mq_open.mode,
1284 context->mq_open.attr.mq_flags, 1284 context->mq_open.attr.mq_flags,
@@ -1502,7 +1502,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1502 1502
1503 if (n->ino != (unsigned long)-1) { 1503 if (n->ino != (unsigned long)-1) {
1504 audit_log_format(ab, " inode=%lu" 1504 audit_log_format(ab, " inode=%lu"
1505 " dev=%02x:%02x mode=%#o" 1505 " dev=%02x:%02x mode=%#ho"
1506 " ouid=%u ogid=%u rdev=%02x:%02x", 1506 " ouid=%u ogid=%u rdev=%02x:%02x",
1507 n->ino, 1507 n->ino,
1508 MAJOR(n->dev), 1508 MAJOR(n->dev),
@@ -2160,7 +2160,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
2160 * @attr: queue attributes 2160 * @attr: queue attributes
2161 * 2161 *
2162 */ 2162 */
2163void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) 2163void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
2164{ 2164{
2165 struct audit_context *context = current->audit_context; 2165 struct audit_context *context = current->audit_context;
2166 2166
@@ -2260,7 +2260,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2260 * 2260 *
2261 * Called only after audit_ipc_obj(). 2261 * Called only after audit_ipc_obj().
2262 */ 2262 */
2263void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) 2263void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
2264{ 2264{
2265 struct audit_context *context = current->audit_context; 2265 struct audit_context *context = current->audit_context;
2266 2266
diff --git a/kernel/capability.c b/kernel/capability.c
index 74fb3b603045..0fcf1c14a297 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -10,7 +10,7 @@
10#include <linux/audit.h> 10#include <linux/audit.h>
11#include <linux/capability.h> 11#include <linux/capability.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/module.h> 13#include <linux/export.h>
14#include <linux/security.h> 14#include <linux/security.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1d2b6ceea95d..a5d3b5325f77 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,7 +63,24 @@
63 63
64#include <linux/atomic.h> 64#include <linux/atomic.h>
65 65
66/*
67 * cgroup_mutex is the master lock. Any modification to cgroup or its
68 * hierarchy must be performed while holding it.
69 *
70 * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
71 * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
72 * release_agent_path and so on. Modifying requires both cgroup_mutex and
73 * cgroup_root_mutex. Readers can acquire either of the two. This is to
74 * break the following locking order cycle.
75 *
76 * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
77 * B. namespace_sem -> cgroup_mutex
78 *
79 * B happens only through cgroup_show_options() and using cgroup_root_mutex
80 * breaks it.
81 */
66static DEFINE_MUTEX(cgroup_mutex); 82static DEFINE_MUTEX(cgroup_mutex);
83static DEFINE_MUTEX(cgroup_root_mutex);
67 84
68/* 85/*
69 * Generate an array of cgroup subsystem pointers. At boot time, this is 86 * Generate an array of cgroup subsystem pointers. At boot time, this is
@@ -265,7 +282,7 @@ list_for_each_entry(_root, &roots, root_list)
265/* the list of cgroups eligible for automatic release. Protected by 282/* the list of cgroups eligible for automatic release. Protected by
266 * release_list_lock */ 283 * release_list_lock */
267static LIST_HEAD(release_list); 284static LIST_HEAD(release_list);
268static DEFINE_SPINLOCK(release_list_lock); 285static DEFINE_RAW_SPINLOCK(release_list_lock);
269static void cgroup_release_agent(struct work_struct *work); 286static void cgroup_release_agent(struct work_struct *work);
270static DECLARE_WORK(release_agent_work, cgroup_release_agent); 287static DECLARE_WORK(release_agent_work, cgroup_release_agent);
271static void check_for_release(struct cgroup *cgrp); 288static void check_for_release(struct cgroup *cgrp);
@@ -760,7 +777,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
760 * -> cgroup_mkdir. 777 * -> cgroup_mkdir.
761 */ 778 */
762 779
763static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 780static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
764static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); 781static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
765static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 782static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
766static int cgroup_populate_dir(struct cgroup *cgrp); 783static int cgroup_populate_dir(struct cgroup *cgrp);
@@ -775,7 +792,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
775static int alloc_css_id(struct cgroup_subsys *ss, 792static int alloc_css_id(struct cgroup_subsys *ss,
776 struct cgroup *parent, struct cgroup *child); 793 struct cgroup *parent, struct cgroup *child);
777 794
778static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 795static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
779{ 796{
780 struct inode *inode = new_inode(sb); 797 struct inode *inode = new_inode(sb);
781 798
@@ -921,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
921 * 938 *
922 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; 939 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
923 */ 940 */
924DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 941static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
925 942
926static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) 943static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
927{ 944{
@@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
953 int i; 970 int i;
954 971
955 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 972 BUG_ON(!mutex_is_locked(&cgroup_mutex));
973 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
956 974
957 removed_bits = root->actual_subsys_bits & ~final_bits; 975 removed_bits = root->actual_subsys_bits & ~final_bits;
958 added_bits = final_bits & ~root->actual_subsys_bits; 976 added_bits = final_bits & ~root->actual_subsys_bits;
@@ -1038,12 +1056,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1038 return 0; 1056 return 0;
1039} 1057}
1040 1058
1041static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) 1059static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1042{ 1060{
1043 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; 1061 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1044 struct cgroup_subsys *ss; 1062 struct cgroup_subsys *ss;
1045 1063
1046 mutex_lock(&cgroup_mutex); 1064 mutex_lock(&cgroup_root_mutex);
1047 for_each_subsys(root, ss) 1065 for_each_subsys(root, ss)
1048 seq_printf(seq, ",%s", ss->name); 1066 seq_printf(seq, ",%s", ss->name);
1049 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1067 if (test_bit(ROOT_NOPREFIX, &root->flags))
@@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1054 seq_puts(seq, ",clone_children"); 1072 seq_puts(seq, ",clone_children");
1055 if (strlen(root->name)) 1073 if (strlen(root->name))
1056 seq_printf(seq, ",name=%s", root->name); 1074 seq_printf(seq, ",name=%s", root->name);
1057 mutex_unlock(&cgroup_mutex); 1075 mutex_unlock(&cgroup_root_mutex);
1058 return 0; 1076 return 0;
1059} 1077}
1060 1078
@@ -1175,10 +1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1175 1193
1176 /* 1194 /*
1177 * If the 'all' option was specified select all the subsystems, 1195 * If the 'all' option was specified select all the subsystems,
1178 * otherwise 'all, 'none' and a subsystem name options were not 1196 * otherwise if 'none', 'name=' and a subsystem name options
1179 * specified, let's default to 'all' 1197 * were not specified, let's default to 'all'
1180 */ 1198 */
1181 if (all_ss || (!all_ss && !one_ss && !opts->none)) { 1199 if (all_ss || (!one_ss && !opts->none && !opts->name)) {
1182 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1200 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1183 struct cgroup_subsys *ss = subsys[i]; 1201 struct cgroup_subsys *ss = subsys[i];
1184 if (ss == NULL) 1202 if (ss == NULL)
@@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1269 1287
1270 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1288 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1271 mutex_lock(&cgroup_mutex); 1289 mutex_lock(&cgroup_mutex);
1290 mutex_lock(&cgroup_root_mutex);
1272 1291
1273 /* See what subsystems are wanted */ 1292 /* See what subsystems are wanted */
1274 ret = parse_cgroupfs_options(data, &opts); 1293 ret = parse_cgroupfs_options(data, &opts);
@@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1297 out_unlock: 1316 out_unlock:
1298 kfree(opts.release_agent); 1317 kfree(opts.release_agent);
1299 kfree(opts.name); 1318 kfree(opts.name);
1319 mutex_unlock(&cgroup_root_mutex);
1300 mutex_unlock(&cgroup_mutex); 1320 mutex_unlock(&cgroup_mutex);
1301 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1321 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1302 return ret; 1322 return ret;
@@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1481 int ret = 0; 1501 int ret = 0;
1482 struct super_block *sb; 1502 struct super_block *sb;
1483 struct cgroupfs_root *new_root; 1503 struct cgroupfs_root *new_root;
1504 struct inode *inode;
1484 1505
1485 /* First find the desired set of subsystems */ 1506 /* First find the desired set of subsystems */
1486 mutex_lock(&cgroup_mutex); 1507 mutex_lock(&cgroup_mutex);
@@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1514 /* We used the new root structure, so this is a new hierarchy */ 1535 /* We used the new root structure, so this is a new hierarchy */
1515 struct list_head tmp_cg_links; 1536 struct list_head tmp_cg_links;
1516 struct cgroup *root_cgrp = &root->top_cgroup; 1537 struct cgroup *root_cgrp = &root->top_cgroup;
1517 struct inode *inode;
1518 struct cgroupfs_root *existing_root; 1538 struct cgroupfs_root *existing_root;
1519 const struct cred *cred; 1539 const struct cred *cred;
1520 int i; 1540 int i;
@@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1528 1548
1529 mutex_lock(&inode->i_mutex); 1549 mutex_lock(&inode->i_mutex);
1530 mutex_lock(&cgroup_mutex); 1550 mutex_lock(&cgroup_mutex);
1551 mutex_lock(&cgroup_root_mutex);
1531 1552
1532 if (strlen(root->name)) { 1553 /* Check for name clashes with existing mounts */
1533 /* Check for name clashes with existing mounts */ 1554 ret = -EBUSY;
1534 for_each_active_root(existing_root) { 1555 if (strlen(root->name))
1535 if (!strcmp(existing_root->name, root->name)) { 1556 for_each_active_root(existing_root)
1536 ret = -EBUSY; 1557 if (!strcmp(existing_root->name, root->name))
1537 mutex_unlock(&cgroup_mutex); 1558 goto unlock_drop;
1538 mutex_unlock(&inode->i_mutex);
1539 goto drop_new_super;
1540 }
1541 }
1542 }
1543 1559
1544 /* 1560 /*
1545 * We're accessing css_set_count without locking 1561 * We're accessing css_set_count without locking
@@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1549 * have some link structures left over 1565 * have some link structures left over
1550 */ 1566 */
1551 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1567 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
1552 if (ret) { 1568 if (ret)
1553 mutex_unlock(&cgroup_mutex); 1569 goto unlock_drop;
1554 mutex_unlock(&inode->i_mutex);
1555 goto drop_new_super;
1556 }
1557 1570
1558 ret = rebind_subsystems(root, root->subsys_bits); 1571 ret = rebind_subsystems(root, root->subsys_bits);
1559 if (ret == -EBUSY) { 1572 if (ret == -EBUSY) {
1560 mutex_unlock(&cgroup_mutex);
1561 mutex_unlock(&inode->i_mutex);
1562 free_cg_links(&tmp_cg_links); 1573 free_cg_links(&tmp_cg_links);
1563 goto drop_new_super; 1574 goto unlock_drop;
1564 } 1575 }
1565 /* 1576 /*
1566 * There must be no failure case after here, since rebinding 1577 * There must be no failure case after here, since rebinding
@@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1599 cred = override_creds(&init_cred); 1610 cred = override_creds(&init_cred);
1600 cgroup_populate_dir(root_cgrp); 1611 cgroup_populate_dir(root_cgrp);
1601 revert_creds(cred); 1612 revert_creds(cred);
1613 mutex_unlock(&cgroup_root_mutex);
1602 mutex_unlock(&cgroup_mutex); 1614 mutex_unlock(&cgroup_mutex);
1603 mutex_unlock(&inode->i_mutex); 1615 mutex_unlock(&inode->i_mutex);
1604 } else { 1616 } else {
@@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1615 kfree(opts.name); 1627 kfree(opts.name);
1616 return dget(sb->s_root); 1628 return dget(sb->s_root);
1617 1629
1630 unlock_drop:
1631 mutex_unlock(&cgroup_root_mutex);
1632 mutex_unlock(&cgroup_mutex);
1633 mutex_unlock(&inode->i_mutex);
1618 drop_new_super: 1634 drop_new_super:
1619 deactivate_locked_super(sb); 1635 deactivate_locked_super(sb);
1620 drop_modules: 1636 drop_modules:
@@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1639 BUG_ON(!list_empty(&cgrp->sibling)); 1655 BUG_ON(!list_empty(&cgrp->sibling));
1640 1656
1641 mutex_lock(&cgroup_mutex); 1657 mutex_lock(&cgroup_mutex);
1658 mutex_lock(&cgroup_root_mutex);
1642 1659
1643 /* Rebind all subsystems back to the default hierarchy */ 1660 /* Rebind all subsystems back to the default hierarchy */
1644 ret = rebind_subsystems(root, 0); 1661 ret = rebind_subsystems(root, 0);
@@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1664 root_count--; 1681 root_count--;
1665 } 1682 }
1666 1683
1684 mutex_unlock(&cgroup_root_mutex);
1667 mutex_unlock(&cgroup_mutex); 1685 mutex_unlock(&cgroup_mutex);
1668 1686
1669 kill_litter_super(sb); 1687 kill_litter_super(sb);
@@ -1740,11 +1758,90 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1740EXPORT_SYMBOL_GPL(cgroup_path); 1758EXPORT_SYMBOL_GPL(cgroup_path);
1741 1759
1742/* 1760/*
1761 * Control Group taskset
1762 */
1763struct task_and_cgroup {
1764 struct task_struct *task;
1765 struct cgroup *cgrp;
1766};
1767
1768struct cgroup_taskset {
1769 struct task_and_cgroup single;
1770 struct flex_array *tc_array;
1771 int tc_array_len;
1772 int idx;
1773 struct cgroup *cur_cgrp;
1774};
1775
1776/**
1777 * cgroup_taskset_first - reset taskset and return the first task
1778 * @tset: taskset of interest
1779 *
1780 * @tset iteration is initialized and the first task is returned.
1781 */
1782struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1783{
1784 if (tset->tc_array) {
1785 tset->idx = 0;
1786 return cgroup_taskset_next(tset);
1787 } else {
1788 tset->cur_cgrp = tset->single.cgrp;
1789 return tset->single.task;
1790 }
1791}
1792EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1793
1794/**
1795 * cgroup_taskset_next - iterate to the next task in taskset
1796 * @tset: taskset of interest
1797 *
1798 * Return the next task in @tset. Iteration must have been initialized
1799 * with cgroup_taskset_first().
1800 */
1801struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1802{
1803 struct task_and_cgroup *tc;
1804
1805 if (!tset->tc_array || tset->idx >= tset->tc_array_len)
1806 return NULL;
1807
1808 tc = flex_array_get(tset->tc_array, tset->idx++);
1809 tset->cur_cgrp = tc->cgrp;
1810 return tc->task;
1811}
1812EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1813
1814/**
1815 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
1816 * @tset: taskset of interest
1817 *
1818 * Return the cgroup for the current (last returned) task of @tset. This
1819 * function must be preceded by either cgroup_taskset_first() or
1820 * cgroup_taskset_next().
1821 */
1822struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
1823{
1824 return tset->cur_cgrp;
1825}
1826EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
1827
1828/**
1829 * cgroup_taskset_size - return the number of tasks in taskset
1830 * @tset: taskset of interest
1831 */
1832int cgroup_taskset_size(struct cgroup_taskset *tset)
1833{
1834 return tset->tc_array ? tset->tc_array_len : 1;
1835}
1836EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1837
1838
1839/*
1743 * cgroup_task_migrate - move a task from one cgroup to another. 1840 * cgroup_task_migrate - move a task from one cgroup to another.
1744 * 1841 *
1745 * 'guarantee' is set if the caller promises that a new css_set for the task 1842 * 'guarantee' is set if the caller promises that a new css_set for the task
1746 * will already exist. If not set, this function might sleep, and can fail with 1843 * will already exist. If not set, this function might sleep, and can fail with
1747 * -ENOMEM. Otherwise, it can only fail with -ESRCH. 1844 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1748 */ 1845 */
1749static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1846static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1750 struct task_struct *tsk, bool guarantee) 1847 struct task_struct *tsk, bool guarantee)
@@ -1753,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1753 struct css_set *newcg; 1850 struct css_set *newcg;
1754 1851
1755 /* 1852 /*
1756 * get old css_set. we need to take task_lock and refcount it, because 1853 * We are synchronized through threadgroup_lock() against PF_EXITING
1757 * an exiting task can change its css_set to init_css_set and drop its 1854 * setting such that we can't race against cgroup_exit() changing the
1758 * old one without taking cgroup_mutex. 1855 * css_set to init_css_set and dropping the old one.
1759 */ 1856 */
1760 task_lock(tsk); 1857 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1761 oldcg = tsk->cgroups; 1858 oldcg = tsk->cgroups;
1762 get_css_set(oldcg);
1763 task_unlock(tsk);
1764 1859
1765 /* locate or allocate a new css_set for this task. */ 1860 /* locate or allocate a new css_set for this task. */
1766 if (guarantee) { 1861 if (guarantee) {
@@ -1775,20 +1870,11 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1775 might_sleep(); 1870 might_sleep();
1776 /* find_css_set will give us newcg already referenced. */ 1871 /* find_css_set will give us newcg already referenced. */
1777 newcg = find_css_set(oldcg, cgrp); 1872 newcg = find_css_set(oldcg, cgrp);
1778 if (!newcg) { 1873 if (!newcg)
1779 put_css_set(oldcg);
1780 return -ENOMEM; 1874 return -ENOMEM;
1781 }
1782 } 1875 }
1783 put_css_set(oldcg);
1784 1876
1785 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1786 task_lock(tsk); 1877 task_lock(tsk);
1787 if (tsk->flags & PF_EXITING) {
1788 task_unlock(tsk);
1789 put_css_set(newcg);
1790 return -ESRCH;
1791 }
1792 rcu_assign_pointer(tsk->cgroups, newcg); 1878 rcu_assign_pointer(tsk->cgroups, newcg);
1793 task_unlock(tsk); 1879 task_unlock(tsk);
1794 1880
@@ -1814,8 +1900,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1814 * @cgrp: the cgroup the task is attaching to 1900 * @cgrp: the cgroup the task is attaching to
1815 * @tsk: the task to be attached 1901 * @tsk: the task to be attached
1816 * 1902 *
1817 * Call holding cgroup_mutex. May take task_lock of 1903 * Call with cgroup_mutex and threadgroup locked. May take task_lock of
1818 * the task 'tsk' during call. 1904 * @tsk during call.
1819 */ 1905 */
1820int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1906int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1821{ 1907{
@@ -1823,15 +1909,23 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1823 struct cgroup_subsys *ss, *failed_ss = NULL; 1909 struct cgroup_subsys *ss, *failed_ss = NULL;
1824 struct cgroup *oldcgrp; 1910 struct cgroup *oldcgrp;
1825 struct cgroupfs_root *root = cgrp->root; 1911 struct cgroupfs_root *root = cgrp->root;
1912 struct cgroup_taskset tset = { };
1913
1914 /* @tsk either already exited or can't exit until the end */
1915 if (tsk->flags & PF_EXITING)
1916 return -ESRCH;
1826 1917
1827 /* Nothing to do if the task is already in that cgroup */ 1918 /* Nothing to do if the task is already in that cgroup */
1828 oldcgrp = task_cgroup_from_root(tsk, root); 1919 oldcgrp = task_cgroup_from_root(tsk, root);
1829 if (cgrp == oldcgrp) 1920 if (cgrp == oldcgrp)
1830 return 0; 1921 return 0;
1831 1922
1923 tset.single.task = tsk;
1924 tset.single.cgrp = oldcgrp;
1925
1832 for_each_subsys(root, ss) { 1926 for_each_subsys(root, ss) {
1833 if (ss->can_attach) { 1927 if (ss->can_attach) {
1834 retval = ss->can_attach(ss, cgrp, tsk); 1928 retval = ss->can_attach(ss, cgrp, &tset);
1835 if (retval) { 1929 if (retval) {
1836 /* 1930 /*
1837 * Remember on which subsystem the can_attach() 1931 * Remember on which subsystem the can_attach()
@@ -1843,13 +1937,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1843 goto out; 1937 goto out;
1844 } 1938 }
1845 } 1939 }
1846 if (ss->can_attach_task) {
1847 retval = ss->can_attach_task(cgrp, tsk);
1848 if (retval) {
1849 failed_ss = ss;
1850 goto out;
1851 }
1852 }
1853 } 1940 }
1854 1941
1855 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); 1942 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
@@ -1857,12 +1944,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1857 goto out; 1944 goto out;
1858 1945
1859 for_each_subsys(root, ss) { 1946 for_each_subsys(root, ss) {
1860 if (ss->pre_attach)
1861 ss->pre_attach(cgrp);
1862 if (ss->attach_task)
1863 ss->attach_task(cgrp, tsk);
1864 if (ss->attach) 1947 if (ss->attach)
1865 ss->attach(ss, cgrp, oldcgrp, tsk); 1948 ss->attach(ss, cgrp, &tset);
1866 } 1949 }
1867 1950
1868 synchronize_rcu(); 1951 synchronize_rcu();
@@ -1884,7 +1967,7 @@ out:
1884 */ 1967 */
1885 break; 1968 break;
1886 if (ss->cancel_attach) 1969 if (ss->cancel_attach)
1887 ss->cancel_attach(ss, cgrp, tsk); 1970 ss->cancel_attach(ss, cgrp, &tset);
1888 } 1971 }
1889 } 1972 }
1890 return retval; 1973 return retval;
@@ -1935,23 +2018,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp,
1935 2018
1936 read_lock(&css_set_lock); 2019 read_lock(&css_set_lock);
1937 newcg = find_existing_css_set(cg, cgrp, template); 2020 newcg = find_existing_css_set(cg, cgrp, template);
1938 if (newcg)
1939 get_css_set(newcg);
1940 read_unlock(&css_set_lock); 2021 read_unlock(&css_set_lock);
1941 2022
1942 /* doesn't exist at all? */ 2023 /* doesn't exist at all? */
1943 if (!newcg) 2024 if (!newcg)
1944 return false; 2025 return false;
1945 /* see if it's already in the list */ 2026 /* see if it's already in the list */
1946 list_for_each_entry(cg_entry, newcg_list, links) { 2027 list_for_each_entry(cg_entry, newcg_list, links)
1947 if (cg_entry->cg == newcg) { 2028 if (cg_entry->cg == newcg)
1948 put_css_set(newcg);
1949 return true; 2029 return true;
1950 }
1951 }
1952 2030
1953 /* not found */ 2031 /* not found */
1954 put_css_set(newcg);
1955 return false; 2032 return false;
1956} 2033}
1957 2034
@@ -1985,21 +2062,21 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1985 * @cgrp: the cgroup to attach to 2062 * @cgrp: the cgroup to attach to
1986 * @leader: the threadgroup leader task_struct of the group to be attached 2063 * @leader: the threadgroup leader task_struct of the group to be attached
1987 * 2064 *
1988 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will 2065 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
1989 * take task_lock of each thread in leader's threadgroup individually in turn. 2066 * task_lock of each thread in leader's threadgroup individually in turn.
1990 */ 2067 */
1991int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 2068static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
1992{ 2069{
1993 int retval, i, group_size; 2070 int retval, i, group_size;
1994 struct cgroup_subsys *ss, *failed_ss = NULL; 2071 struct cgroup_subsys *ss, *failed_ss = NULL;
1995 bool cancel_failed_ss = false;
1996 /* guaranteed to be initialized later, but the compiler needs this */ 2072 /* guaranteed to be initialized later, but the compiler needs this */
1997 struct cgroup *oldcgrp = NULL;
1998 struct css_set *oldcg; 2073 struct css_set *oldcg;
1999 struct cgroupfs_root *root = cgrp->root; 2074 struct cgroupfs_root *root = cgrp->root;
2000 /* threadgroup list cursor and array */ 2075 /* threadgroup list cursor and array */
2001 struct task_struct *tsk; 2076 struct task_struct *tsk;
2077 struct task_and_cgroup *tc;
2002 struct flex_array *group; 2078 struct flex_array *group;
2079 struct cgroup_taskset tset = { };
2003 /* 2080 /*
2004 * we need to make sure we have css_sets for all the tasks we're 2081 * we need to make sure we have css_sets for all the tasks we're
2005 * going to move -before- we actually start moving them, so that in 2082 * going to move -before- we actually start moving them, so that in
@@ -2012,13 +2089,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2012 * step 0: in order to do expensive, possibly blocking operations for 2089 * step 0: in order to do expensive, possibly blocking operations for
2013 * every thread, we cannot iterate the thread group list, since it needs 2090 * every thread, we cannot iterate the thread group list, since it needs
2014 * rcu or tasklist locked. instead, build an array of all threads in the 2091 * rcu or tasklist locked. instead, build an array of all threads in the
2015 * group - threadgroup_fork_lock prevents new threads from appearing, 2092 * group - group_rwsem prevents new threads from appearing, and if
2016 * and if threads exit, this will just be an over-estimate. 2093 * threads exit, this will just be an over-estimate.
2017 */ 2094 */
2018 group_size = get_nr_threads(leader); 2095 group_size = get_nr_threads(leader);
2019 /* flex_array supports very large thread-groups better than kmalloc. */ 2096 /* flex_array supports very large thread-groups better than kmalloc. */
2020 group = flex_array_alloc(sizeof(struct task_struct *), group_size, 2097 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
2021 GFP_KERNEL);
2022 if (!group) 2098 if (!group)
2023 return -ENOMEM; 2099 return -ENOMEM;
2024 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 2100 /* pre-allocate to guarantee space while iterating in rcu read-side. */
@@ -2027,7 +2103,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2027 goto out_free_group_list; 2103 goto out_free_group_list;
2028 2104
2029 /* prevent changes to the threadgroup list while we take a snapshot. */ 2105 /* prevent changes to the threadgroup list while we take a snapshot. */
2030 rcu_read_lock(); 2106 read_lock(&tasklist_lock);
2031 if (!thread_group_leader(leader)) { 2107 if (!thread_group_leader(leader)) {
2032 /* 2108 /*
2033 * a race with de_thread from another thread's exec() may strip 2109 * a race with de_thread from another thread's exec() may strip
@@ -2036,53 +2112,57 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2036 * throw this task away and try again (from cgroup_procs_write); 2112 * throw this task away and try again (from cgroup_procs_write);
2037 * this is "double-double-toil-and-trouble-check locking". 2113 * this is "double-double-toil-and-trouble-check locking".
2038 */ 2114 */
2039 rcu_read_unlock(); 2115 read_unlock(&tasklist_lock);
2040 retval = -EAGAIN; 2116 retval = -EAGAIN;
2041 goto out_free_group_list; 2117 goto out_free_group_list;
2042 } 2118 }
2043 /* take a reference on each task in the group to go in the array. */ 2119
2044 tsk = leader; 2120 tsk = leader;
2045 i = 0; 2121 i = 0;
2046 do { 2122 do {
2123 struct task_and_cgroup ent;
2124
2125 /* @tsk either already exited or can't exit until the end */
2126 if (tsk->flags & PF_EXITING)
2127 continue;
2128
2047 /* as per above, nr_threads may decrease, but not increase. */ 2129 /* as per above, nr_threads may decrease, but not increase. */
2048 BUG_ON(i >= group_size); 2130 BUG_ON(i >= group_size);
2049 get_task_struct(tsk);
2050 /* 2131 /*
2051 * saying GFP_ATOMIC has no effect here because we did prealloc 2132 * saying GFP_ATOMIC has no effect here because we did prealloc
2052 * earlier, but it's good form to communicate our expectations. 2133 * earlier, but it's good form to communicate our expectations.
2053 */ 2134 */
2054 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); 2135 ent.task = tsk;
2136 ent.cgrp = task_cgroup_from_root(tsk, root);
2137 /* nothing to do if this task is already in the cgroup */
2138 if (ent.cgrp == cgrp)
2139 continue;
2140 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2055 BUG_ON(retval != 0); 2141 BUG_ON(retval != 0);
2056 i++; 2142 i++;
2057 } while_each_thread(leader, tsk); 2143 } while_each_thread(leader, tsk);
2058 /* remember the number of threads in the array for later. */ 2144 /* remember the number of threads in the array for later. */
2059 group_size = i; 2145 group_size = i;
2060 rcu_read_unlock(); 2146 tset.tc_array = group;
2147 tset.tc_array_len = group_size;
2148 read_unlock(&tasklist_lock);
2149
2150 /* methods shouldn't be called if no task is actually migrating */
2151 retval = 0;
2152 if (!group_size)
2153 goto out_free_group_list;
2061 2154
2062 /* 2155 /*
2063 * step 1: check that we can legitimately attach to the cgroup. 2156 * step 1: check that we can legitimately attach to the cgroup.
2064 */ 2157 */
2065 for_each_subsys(root, ss) { 2158 for_each_subsys(root, ss) {
2066 if (ss->can_attach) { 2159 if (ss->can_attach) {
2067 retval = ss->can_attach(ss, cgrp, leader); 2160 retval = ss->can_attach(ss, cgrp, &tset);
2068 if (retval) { 2161 if (retval) {
2069 failed_ss = ss; 2162 failed_ss = ss;
2070 goto out_cancel_attach; 2163 goto out_cancel_attach;
2071 } 2164 }
2072 } 2165 }
2073 /* a callback to be run on every thread in the threadgroup. */
2074 if (ss->can_attach_task) {
2075 /* run on each task in the threadgroup. */
2076 for (i = 0; i < group_size; i++) {
2077 tsk = flex_array_get_ptr(group, i);
2078 retval = ss->can_attach_task(cgrp, tsk);
2079 if (retval) {
2080 failed_ss = ss;
2081 cancel_failed_ss = true;
2082 goto out_cancel_attach;
2083 }
2084 }
2085 }
2086 } 2166 }
2087 2167
2088 /* 2168 /*
@@ -2091,69 +2171,36 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2091 */ 2171 */
2092 INIT_LIST_HEAD(&newcg_list); 2172 INIT_LIST_HEAD(&newcg_list);
2093 for (i = 0; i < group_size; i++) { 2173 for (i = 0; i < group_size; i++) {
2094 tsk = flex_array_get_ptr(group, i); 2174 tc = flex_array_get(group, i);
2095 /* nothing to do if this task is already in the cgroup */ 2175 oldcg = tc->task->cgroups;
2096 oldcgrp = task_cgroup_from_root(tsk, root); 2176
2097 if (cgrp == oldcgrp) 2177 /* if we don't already have it in the list get a new one */
2098 continue; 2178 if (!css_set_check_fetched(cgrp, tc->task, oldcg,
2099 /* get old css_set pointer */ 2179 &newcg_list)) {
2100 task_lock(tsk);
2101 if (tsk->flags & PF_EXITING) {
2102 /* ignore this task if it's going away */
2103 task_unlock(tsk);
2104 continue;
2105 }
2106 oldcg = tsk->cgroups;
2107 get_css_set(oldcg);
2108 task_unlock(tsk);
2109 /* see if the new one for us is already in the list? */
2110 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2111 /* was already there, nothing to do. */
2112 put_css_set(oldcg);
2113 } else {
2114 /* we don't already have it. get new one. */
2115 retval = css_set_prefetch(cgrp, oldcg, &newcg_list); 2180 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2116 put_css_set(oldcg);
2117 if (retval) 2181 if (retval)
2118 goto out_list_teardown; 2182 goto out_list_teardown;
2119 } 2183 }
2120 } 2184 }
2121 2185
2122 /* 2186 /*
2123 * step 3: now that we're guaranteed success wrt the css_sets, proceed 2187 * step 3: now that we're guaranteed success wrt the css_sets,
2124 * to move all tasks to the new cgroup, calling ss->attach_task for each 2188 * proceed to move all tasks to the new cgroup. There are no
2125 * one along the way. there are no failure cases after here, so this is 2189 * failure cases after here, so this is the commit point.
2126 * the commit point.
2127 */ 2190 */
2128 for_each_subsys(root, ss) {
2129 if (ss->pre_attach)
2130 ss->pre_attach(cgrp);
2131 }
2132 for (i = 0; i < group_size; i++) { 2191 for (i = 0; i < group_size; i++) {
2133 tsk = flex_array_get_ptr(group, i); 2192 tc = flex_array_get(group, i);
2134 /* leave current thread as it is if it's already there */ 2193 retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
2135 oldcgrp = task_cgroup_from_root(tsk, root); 2194 BUG_ON(retval);
2136 if (cgrp == oldcgrp)
2137 continue;
2138 /* attach each task to each subsystem */
2139 for_each_subsys(root, ss) {
2140 if (ss->attach_task)
2141 ss->attach_task(cgrp, tsk);
2142 }
2143 /* if the thread is PF_EXITING, it can just get skipped. */
2144 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2145 BUG_ON(retval != 0 && retval != -ESRCH);
2146 } 2195 }
2147 /* nothing is sensitive to fork() after this point. */ 2196 /* nothing is sensitive to fork() after this point. */
2148 2197
2149 /* 2198 /*
2150 * step 4: do expensive, non-thread-specific subsystem callbacks. 2199 * step 4: do subsystem attach callbacks.
2151 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2152 * being moved, this call will need to be reworked to communicate that.
2153 */ 2200 */
2154 for_each_subsys(root, ss) { 2201 for_each_subsys(root, ss) {
2155 if (ss->attach) 2202 if (ss->attach)
2156 ss->attach(ss, cgrp, oldcgrp, leader); 2203 ss->attach(ss, cgrp, &tset);
2157 } 2204 }
2158 2205
2159 /* 2206 /*
@@ -2173,20 +2220,12 @@ out_cancel_attach:
2173 /* same deal as in cgroup_attach_task */ 2220 /* same deal as in cgroup_attach_task */
2174 if (retval) { 2221 if (retval) {
2175 for_each_subsys(root, ss) { 2222 for_each_subsys(root, ss) {
2176 if (ss == failed_ss) { 2223 if (ss == failed_ss)
2177 if (cancel_failed_ss && ss->cancel_attach)
2178 ss->cancel_attach(ss, cgrp, leader);
2179 break; 2224 break;
2180 }
2181 if (ss->cancel_attach) 2225 if (ss->cancel_attach)
2182 ss->cancel_attach(ss, cgrp, leader); 2226 ss->cancel_attach(ss, cgrp, &tset);
2183 } 2227 }
2184 } 2228 }
2185 /* clean up the array of referenced threads in the group. */
2186 for (i = 0; i < group_size; i++) {
2187 tsk = flex_array_get_ptr(group, i);
2188 put_task_struct(tsk);
2189 }
2190out_free_group_list: 2229out_free_group_list:
2191 flex_array_free(group); 2230 flex_array_free(group);
2192 return retval; 2231 return retval;
@@ -2194,8 +2233,8 @@ out_free_group_list:
2194 2233
2195/* 2234/*
2196 * Find the task_struct of the task to attach by vpid and pass it along to the 2235 * Find the task_struct of the task to attach by vpid and pass it along to the
2197 * function to attach either it or all tasks in its threadgroup. Will take 2236 * function to attach either it or all tasks in its threadgroup. Will lock
2198 * cgroup_mutex; may take task_lock of task. 2237 * cgroup_mutex and threadgroup; may take task_lock of task.
2199 */ 2238 */
2200static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2239static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2201{ 2240{
@@ -2222,13 +2261,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2222 * detect it later. 2261 * detect it later.
2223 */ 2262 */
2224 tsk = tsk->group_leader; 2263 tsk = tsk->group_leader;
2225 } else if (tsk->flags & PF_EXITING) {
2226 /* optimization for the single-task-only case */
2227 rcu_read_unlock();
2228 cgroup_unlock();
2229 return -ESRCH;
2230 } 2264 }
2231
2232 /* 2265 /*
2233 * even if we're attaching all tasks in the thread group, we 2266 * even if we're attaching all tasks in the thread group, we
2234 * only need to check permissions on one of them. 2267 * only need to check permissions on one of them.
@@ -2251,13 +2284,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2251 get_task_struct(tsk); 2284 get_task_struct(tsk);
2252 } 2285 }
2253 2286
2254 if (threadgroup) { 2287 threadgroup_lock(tsk);
2255 threadgroup_fork_write_lock(tsk); 2288
2289 if (threadgroup)
2256 ret = cgroup_attach_proc(cgrp, tsk); 2290 ret = cgroup_attach_proc(cgrp, tsk);
2257 threadgroup_fork_write_unlock(tsk); 2291 else
2258 } else {
2259 ret = cgroup_attach_task(cgrp, tsk); 2292 ret = cgroup_attach_task(cgrp, tsk);
2260 } 2293
2294 threadgroup_unlock(tsk);
2295
2261 put_task_struct(tsk); 2296 put_task_struct(tsk);
2262 cgroup_unlock(); 2297 cgroup_unlock();
2263 return ret; 2298 return ret;
@@ -2308,7 +2343,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2308 return -EINVAL; 2343 return -EINVAL;
2309 if (!cgroup_lock_live_group(cgrp)) 2344 if (!cgroup_lock_live_group(cgrp))
2310 return -ENODEV; 2345 return -ENODEV;
2346 mutex_lock(&cgroup_root_mutex);
2311 strcpy(cgrp->root->release_agent_path, buffer); 2347 strcpy(cgrp->root->release_agent_path, buffer);
2348 mutex_unlock(&cgroup_root_mutex);
2312 cgroup_unlock(); 2349 cgroup_unlock();
2313 return 0; 2350 return 0;
2314} 2351}
@@ -2587,7 +2624,7 @@ static inline struct cftype *__file_cft(struct file *file)
2587 return __d_cft(file->f_dentry); 2624 return __d_cft(file->f_dentry);
2588} 2625}
2589 2626
2590static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2627static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2591 struct super_block *sb) 2628 struct super_block *sb)
2592{ 2629{
2593 struct inode *inode; 2630 struct inode *inode;
@@ -2628,7 +2665,7 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2628 * @mode: mode to set on new directory. 2665 * @mode: mode to set on new directory.
2629 */ 2666 */
2630static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, 2667static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2631 mode_t mode) 2668 umode_t mode)
2632{ 2669{
2633 struct dentry *parent; 2670 struct dentry *parent;
2634 int error = 0; 2671 int error = 0;
@@ -2655,9 +2692,9 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2655 * returns S_IRUGO if it has only a read handler 2692 * returns S_IRUGO if it has only a read handler
2656 * returns S_IWUSR if it has only a write handler 2693 * returns S_IWUSR if it has only a write handler
2657 */ 2694 */
2658static mode_t cgroup_file_mode(const struct cftype *cft) 2695static umode_t cgroup_file_mode(const struct cftype *cft)
2659{ 2696{
2660 mode_t mode = 0; 2697 umode_t mode = 0;
2661 2698
2662 if (cft->mode) 2699 if (cft->mode)
2663 return cft->mode; 2700 return cft->mode;
@@ -2680,7 +2717,7 @@ int cgroup_add_file(struct cgroup *cgrp,
2680 struct dentry *dir = cgrp->dentry; 2717 struct dentry *dir = cgrp->dentry;
2681 struct dentry *dentry; 2718 struct dentry *dentry;
2682 int error; 2719 int error;
2683 mode_t mode; 2720 umode_t mode;
2684 2721
2685 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2722 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2686 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2723 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -2791,6 +2828,7 @@ static void cgroup_enable_task_cg_lists(void)
2791} 2828}
2792 2829
2793void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 2830void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
2831 __acquires(css_set_lock)
2794{ 2832{
2795 /* 2833 /*
2796 * The first time anyone tries to iterate across a cgroup, 2834 * The first time anyone tries to iterate across a cgroup,
@@ -2830,6 +2868,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
2830} 2868}
2831 2869
2832void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 2870void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
2871 __releases(css_set_lock)
2833{ 2872{
2834 read_unlock(&css_set_lock); 2873 read_unlock(&css_set_lock);
2835} 2874}
@@ -3754,7 +3793,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
3754 * Must be called with the mutex on the parent inode held 3793 * Must be called with the mutex on the parent inode held
3755 */ 3794 */
3756static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 3795static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3757 mode_t mode) 3796 umode_t mode)
3758{ 3797{
3759 struct cgroup *cgrp; 3798 struct cgroup *cgrp;
3760 struct cgroupfs_root *root = parent->root; 3799 struct cgroupfs_root *root = parent->root;
@@ -3848,7 +3887,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3848 return err; 3887 return err;
3849} 3888}
3850 3889
3851static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) 3890static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3852{ 3891{
3853 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 3892 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
3854 3893
@@ -4014,11 +4053,11 @@ again:
4014 finish_wait(&cgroup_rmdir_waitq, &wait); 4053 finish_wait(&cgroup_rmdir_waitq, &wait);
4015 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 4054 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4016 4055
4017 spin_lock(&release_list_lock); 4056 raw_spin_lock(&release_list_lock);
4018 set_bit(CGRP_REMOVED, &cgrp->flags); 4057 set_bit(CGRP_REMOVED, &cgrp->flags);
4019 if (!list_empty(&cgrp->release_list)) 4058 if (!list_empty(&cgrp->release_list))
4020 list_del_init(&cgrp->release_list); 4059 list_del_init(&cgrp->release_list);
4021 spin_unlock(&release_list_lock); 4060 raw_spin_unlock(&release_list_lock);
4022 4061
4023 cgroup_lock_hierarchy(cgrp->root); 4062 cgroup_lock_hierarchy(cgrp->root);
4024 /* delete this cgroup from parent->children */ 4063 /* delete this cgroup from parent->children */
@@ -4493,20 +4532,31 @@ static const struct file_operations proc_cgroupstats_operations = {
4493 * 4532 *
4494 * A pointer to the shared css_set was automatically copied in 4533 * A pointer to the shared css_set was automatically copied in
4495 * fork.c by dup_task_struct(). However, we ignore that copy, since 4534 * fork.c by dup_task_struct(). However, we ignore that copy, since
4496 * it was not made under the protection of RCU or cgroup_mutex, so 4535 * it was not made under the protection of RCU, cgroup_mutex or
4497 * might no longer be a valid cgroup pointer. cgroup_attach_task() might 4536 * threadgroup_change_begin(), so it might no longer be a valid
4498 * have already changed current->cgroups, allowing the previously 4537 * cgroup pointer. cgroup_attach_task() might have already changed
4499 * referenced cgroup group to be removed and freed. 4538 * current->cgroups, allowing the previously referenced cgroup
4539 * group to be removed and freed.
4540 *
 4541 * Besides pointer validity, we also need to handle the css_set
 4542 * inheritance between threadgroup_change_begin() and
 4543 * threadgroup_change_end(), so that a process-wide migration
 4544 * performed by cgroup_attach_proc() cannot miss a thread simply
 4545 * because it is too early or too late in the fork stage.
4500 * 4546 *
4501 * At the point that cgroup_fork() is called, 'current' is the parent 4547 * At the point that cgroup_fork() is called, 'current' is the parent
4502 * task, and the passed argument 'child' points to the child task. 4548 * task, and the passed argument 'child' points to the child task.
4503 */ 4549 */
4504void cgroup_fork(struct task_struct *child) 4550void cgroup_fork(struct task_struct *child)
4505{ 4551{
4506 task_lock(current); 4552 /*
4553 * We don't need to task_lock() current because current->cgroups
4554 * can't be changed concurrently here. The parent obviously hasn't
4555 * exited and called cgroup_exit(), and we are synchronized against
4556 * cgroup migration through threadgroup_change_begin().
4557 */
4507 child->cgroups = current->cgroups; 4558 child->cgroups = current->cgroups;
4508 get_css_set(child->cgroups); 4559 get_css_set(child->cgroups);
4509 task_unlock(current);
4510 INIT_LIST_HEAD(&child->cg_list); 4560 INIT_LIST_HEAD(&child->cg_list);
4511} 4561}
4512 4562
@@ -4548,10 +4598,19 @@ void cgroup_post_fork(struct task_struct *child)
4548{ 4598{
4549 if (use_task_css_set_links) { 4599 if (use_task_css_set_links) {
4550 write_lock(&css_set_lock); 4600 write_lock(&css_set_lock);
4551 task_lock(child); 4601 if (list_empty(&child->cg_list)) {
4552 if (list_empty(&child->cg_list)) 4602 /*
4603 * It's safe to use child->cgroups without task_lock()
4604 * here because we are protected through
4605 * threadgroup_change_begin() against concurrent
4606 * css_set change in cgroup_task_migrate(). Also
4607 * the task can't exit at that point until
4608 * wake_up_new_task() is called, so we are protected
 4609 * against cgroup_exit() setting child->cgroups to
4610 * init_css_set.
4611 */
4553 list_add(&child->cg_list, &child->cgroups->tasks); 4612 list_add(&child->cg_list, &child->cgroups->tasks);
4554 task_unlock(child); 4613 }
4555 write_unlock(&css_set_lock); 4614 write_unlock(&css_set_lock);
4556 } 4615 }
4557} 4616}
@@ -4671,13 +4730,13 @@ static void check_for_release(struct cgroup *cgrp)
4671 * already queued for a userspace notification, queue 4730 * already queued for a userspace notification, queue
4672 * it now */ 4731 * it now */
4673 int need_schedule_work = 0; 4732 int need_schedule_work = 0;
4674 spin_lock(&release_list_lock); 4733 raw_spin_lock(&release_list_lock);
4675 if (!cgroup_is_removed(cgrp) && 4734 if (!cgroup_is_removed(cgrp) &&
4676 list_empty(&cgrp->release_list)) { 4735 list_empty(&cgrp->release_list)) {
4677 list_add(&cgrp->release_list, &release_list); 4736 list_add(&cgrp->release_list, &release_list);
4678 need_schedule_work = 1; 4737 need_schedule_work = 1;
4679 } 4738 }
4680 spin_unlock(&release_list_lock); 4739 raw_spin_unlock(&release_list_lock);
4681 if (need_schedule_work) 4740 if (need_schedule_work)
4682 schedule_work(&release_agent_work); 4741 schedule_work(&release_agent_work);
4683 } 4742 }
@@ -4729,7 +4788,7 @@ static void cgroup_release_agent(struct work_struct *work)
4729{ 4788{
4730 BUG_ON(work != &release_agent_work); 4789 BUG_ON(work != &release_agent_work);
4731 mutex_lock(&cgroup_mutex); 4790 mutex_lock(&cgroup_mutex);
4732 spin_lock(&release_list_lock); 4791 raw_spin_lock(&release_list_lock);
4733 while (!list_empty(&release_list)) { 4792 while (!list_empty(&release_list)) {
4734 char *argv[3], *envp[3]; 4793 char *argv[3], *envp[3];
4735 int i; 4794 int i;
@@ -4738,7 +4797,7 @@ static void cgroup_release_agent(struct work_struct *work)
4738 struct cgroup, 4797 struct cgroup,
4739 release_list); 4798 release_list);
4740 list_del_init(&cgrp->release_list); 4799 list_del_init(&cgrp->release_list);
4741 spin_unlock(&release_list_lock); 4800 raw_spin_unlock(&release_list_lock);
4742 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4801 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
4743 if (!pathbuf) 4802 if (!pathbuf)
4744 goto continue_free; 4803 goto continue_free;
@@ -4768,9 +4827,9 @@ static void cgroup_release_agent(struct work_struct *work)
4768 continue_free: 4827 continue_free:
4769 kfree(pathbuf); 4828 kfree(pathbuf);
4770 kfree(agentbuf); 4829 kfree(agentbuf);
4771 spin_lock(&release_list_lock); 4830 raw_spin_lock(&release_list_lock);
4772 } 4831 }
4773 spin_unlock(&release_list_lock); 4832 raw_spin_unlock(&release_list_lock);
4774 mutex_unlock(&cgroup_mutex); 4833 mutex_unlock(&cgroup_mutex);
4775} 4834}
4776 4835
@@ -4880,9 +4939,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4880 4939
4881 rcu_assign_pointer(id->css, NULL); 4940 rcu_assign_pointer(id->css, NULL);
4882 rcu_assign_pointer(css->id, NULL); 4941 rcu_assign_pointer(css->id, NULL);
4883 spin_lock(&ss->id_lock); 4942 write_lock(&ss->id_lock);
4884 idr_remove(&ss->idr, id->id); 4943 idr_remove(&ss->idr, id->id);
4885 spin_unlock(&ss->id_lock); 4944 write_unlock(&ss->id_lock);
4886 kfree_rcu(id, rcu_head); 4945 kfree_rcu(id, rcu_head);
4887} 4946}
4888EXPORT_SYMBOL_GPL(free_css_id); 4947EXPORT_SYMBOL_GPL(free_css_id);
@@ -4908,10 +4967,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4908 error = -ENOMEM; 4967 error = -ENOMEM;
4909 goto err_out; 4968 goto err_out;
4910 } 4969 }
4911 spin_lock(&ss->id_lock); 4970 write_lock(&ss->id_lock);
4912 /* Don't use 0. allocates an ID of 1-65535 */ 4971 /* Don't use 0. allocates an ID of 1-65535 */
4913 error = idr_get_new_above(&ss->idr, newid, 1, &myid); 4972 error = idr_get_new_above(&ss->idr, newid, 1, &myid);
4914 spin_unlock(&ss->id_lock); 4973 write_unlock(&ss->id_lock);
4915 4974
4916 /* Returns error when there are no free spaces for new ID.*/ 4975 /* Returns error when there are no free spaces for new ID.*/
4917 if (error) { 4976 if (error) {
@@ -4926,9 +4985,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4926 return newid; 4985 return newid;
4927remove_idr: 4986remove_idr:
4928 error = -ENOSPC; 4987 error = -ENOSPC;
4929 spin_lock(&ss->id_lock); 4988 write_lock(&ss->id_lock);
4930 idr_remove(&ss->idr, myid); 4989 idr_remove(&ss->idr, myid);
4931 spin_unlock(&ss->id_lock); 4990 write_unlock(&ss->id_lock);
4932err_out: 4991err_out:
4933 kfree(newid); 4992 kfree(newid);
4934 return ERR_PTR(error); 4993 return ERR_PTR(error);
@@ -4940,7 +4999,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4940{ 4999{
4941 struct css_id *newid; 5000 struct css_id *newid;
4942 5001
4943 spin_lock_init(&ss->id_lock); 5002 rwlock_init(&ss->id_lock);
4944 idr_init(&ss->idr); 5003 idr_init(&ss->idr);
4945 5004
4946 newid = get_new_cssid(ss, 0); 5005 newid = get_new_cssid(ss, 0);
@@ -5035,9 +5094,9 @@ css_get_next(struct cgroup_subsys *ss, int id,
5035 * scan next entry from bitmap(tree), tmpid is updated after 5094 * scan next entry from bitmap(tree), tmpid is updated after
5036 * idr_get_next(). 5095 * idr_get_next().
5037 */ 5096 */
5038 spin_lock(&ss->id_lock); 5097 read_lock(&ss->id_lock);
5039 tmp = idr_get_next(&ss->idr, &tmpid); 5098 tmp = idr_get_next(&ss->idr, &tmpid);
5040 spin_unlock(&ss->id_lock); 5099 read_unlock(&ss->id_lock);
5041 5100
5042 if (!tmp) 5101 if (!tmp)
5043 break; 5102 break;
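
The cgroup.c changes above fold the old per-task callbacks (can_attach_task, pre_attach, attach_task) into can_attach/attach callbacks that receive a struct cgroup_taskset describing every task being moved, and attach_task_by_pid() now wraps the whole migration in threadgroup_lock(). A minimal sketch of a subsystem written against this interface follows; the "example" subsystem and its helpers (example_allow, example_account_move) are invented purely for illustration.

static int example_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			      struct cgroup_taskset *tset)
{
	struct task_struct *task;

	/* validate every task in the set up front; nothing has moved yet */
	cgroup_taskset_for_each(task, cgrp, tset)
		if (!example_allow(task))
			return -EPERM;
	return 0;
}

static void example_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			   struct cgroup_taskset *tset)
{
	/* for a threadgroup move the first task in the set is the leader */
	struct task_struct *leader = cgroup_taskset_first(tset);

	example_account_move(cgrp, leader);
}

Because threadgroup_lock() is held across the migration, the set of threads seen by can_attach is the same one attach later operates on.
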
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e691818d7e45..fc0646b78a64 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -14,7 +14,7 @@
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
15 */ 15 */
16 16
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/cgroup.h> 19#include <linux/cgroup.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
@@ -48,19 +48,17 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 48 struct freezer, css);
49} 49}
50 50
51static inline int __cgroup_freezing_or_frozen(struct task_struct *task) 51bool cgroup_freezing(struct task_struct *task)
52{ 52{
53 enum freezer_state state = task_freezer(task)->state; 53 enum freezer_state state;
54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); 54 bool ret;
55}
56 55
57int cgroup_freezing_or_frozen(struct task_struct *task) 56 rcu_read_lock();
58{ 57 state = task_freezer(task)->state;
59 int result; 58 ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
60 task_lock(task); 59 rcu_read_unlock();
61 result = __cgroup_freezing_or_frozen(task); 60
62 task_unlock(task); 61 return ret;
63 return result;
64} 62}
65 63
66/* 64/*
@@ -102,9 +100,6 @@ struct cgroup_subsys freezer_subsys;
102 * freezer_can_attach(): 100 * freezer_can_attach():
103 * cgroup_mutex (held by caller of can_attach) 101 * cgroup_mutex (held by caller of can_attach)
104 * 102 *
105 * cgroup_freezing_or_frozen():
106 * task->alloc_lock (to get task's cgroup)
107 *
108 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): 103 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
109 * freezer->lock 104 * freezer->lock
110 * sighand->siglock (if the cgroup is freezing) 105 * sighand->siglock (if the cgroup is freezing)
@@ -130,7 +125,7 @@ struct cgroup_subsys freezer_subsys;
130 * write_lock css_set_lock (cgroup iterator start) 125 * write_lock css_set_lock (cgroup iterator start)
131 * task->alloc_lock 126 * task->alloc_lock
132 * read_lock css_set_lock (cgroup iterator start) 127 * read_lock css_set_lock (cgroup iterator start)
133 * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) 128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
134 * sighand->siglock 129 * sighand->siglock
135 */ 130 */
136static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 131static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
@@ -150,7 +145,18 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
150static void freezer_destroy(struct cgroup_subsys *ss, 145static void freezer_destroy(struct cgroup_subsys *ss,
151 struct cgroup *cgroup) 146 struct cgroup *cgroup)
152{ 147{
153 kfree(cgroup_freezer(cgroup)); 148 struct freezer *freezer = cgroup_freezer(cgroup);
149
150 if (freezer->state != CGROUP_THAWED)
151 atomic_dec(&system_freezing_cnt);
152 kfree(freezer);
153}
154
155/* task is frozen or will freeze immediately when next it gets woken */
156static bool is_task_frozen_enough(struct task_struct *task)
157{
158 return frozen(task) ||
159 (task_is_stopped_or_traced(task) && freezing(task));
154} 160}
155 161
156/* 162/*
@@ -160,13 +166,17 @@ static void freezer_destroy(struct cgroup_subsys *ss,
160 */ 166 */
161static int freezer_can_attach(struct cgroup_subsys *ss, 167static int freezer_can_attach(struct cgroup_subsys *ss,
162 struct cgroup *new_cgroup, 168 struct cgroup *new_cgroup,
163 struct task_struct *task) 169 struct cgroup_taskset *tset)
164{ 170{
165 struct freezer *freezer; 171 struct freezer *freezer;
172 struct task_struct *task;
166 173
167 /* 174 /*
168 * Anything frozen can't move or be moved to/from. 175 * Anything frozen can't move or be moved to/from.
169 */ 176 */
177 cgroup_taskset_for_each(task, new_cgroup, tset)
178 if (cgroup_freezing(task))
179 return -EBUSY;
170 180
171 freezer = cgroup_freezer(new_cgroup); 181 freezer = cgroup_freezer(new_cgroup);
172 if (freezer->state != CGROUP_THAWED) 182 if (freezer->state != CGROUP_THAWED)
@@ -175,17 +185,6 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
175 return 0; 185 return 0;
176} 186}
177 187
178static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
179{
180 rcu_read_lock();
181 if (__cgroup_freezing_or_frozen(tsk)) {
182 rcu_read_unlock();
183 return -EBUSY;
184 }
185 rcu_read_unlock();
186 return 0;
187}
188
189static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) 188static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
190{ 189{
191 struct freezer *freezer; 190 struct freezer *freezer;
@@ -213,7 +212,7 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
213 212
214 /* Locking avoids race with FREEZING -> THAWED transitions. */ 213 /* Locking avoids race with FREEZING -> THAWED transitions. */
215 if (freezer->state == CGROUP_FREEZING) 214 if (freezer->state == CGROUP_FREEZING)
216 freeze_task(task, true); 215 freeze_task(task);
217 spin_unlock_irq(&freezer->lock); 216 spin_unlock_irq(&freezer->lock);
218} 217}
219 218
@@ -231,7 +230,7 @@ static void update_if_frozen(struct cgroup *cgroup,
231 cgroup_iter_start(cgroup, &it); 230 cgroup_iter_start(cgroup, &it);
232 while ((task = cgroup_iter_next(cgroup, &it))) { 231 while ((task = cgroup_iter_next(cgroup, &it))) {
233 ntotal++; 232 ntotal++;
234 if (frozen(task)) 233 if (freezing(task) && is_task_frozen_enough(task))
235 nfrozen++; 234 nfrozen++;
236 } 235 }
237 236
@@ -279,12 +278,11 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
279 struct task_struct *task; 278 struct task_struct *task;
280 unsigned int num_cant_freeze_now = 0; 279 unsigned int num_cant_freeze_now = 0;
281 280
282 freezer->state = CGROUP_FREEZING;
283 cgroup_iter_start(cgroup, &it); 281 cgroup_iter_start(cgroup, &it);
284 while ((task = cgroup_iter_next(cgroup, &it))) { 282 while ((task = cgroup_iter_next(cgroup, &it))) {
285 if (!freeze_task(task, true)) 283 if (!freeze_task(task))
286 continue; 284 continue;
287 if (frozen(task)) 285 if (is_task_frozen_enough(task))
288 continue; 286 continue;
289 if (!freezing(task) && !freezer_should_skip(task)) 287 if (!freezing(task) && !freezer_should_skip(task))
290 num_cant_freeze_now++; 288 num_cant_freeze_now++;
@@ -300,12 +298,9 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
300 struct task_struct *task; 298 struct task_struct *task;
301 299
302 cgroup_iter_start(cgroup, &it); 300 cgroup_iter_start(cgroup, &it);
303 while ((task = cgroup_iter_next(cgroup, &it))) { 301 while ((task = cgroup_iter_next(cgroup, &it)))
304 thaw_process(task); 302 __thaw_task(task);
305 }
306 cgroup_iter_end(cgroup, &it); 303 cgroup_iter_end(cgroup, &it);
307
308 freezer->state = CGROUP_THAWED;
309} 304}
310 305
311static int freezer_change_state(struct cgroup *cgroup, 306static int freezer_change_state(struct cgroup *cgroup,
@@ -319,20 +314,24 @@ static int freezer_change_state(struct cgroup *cgroup,
319 spin_lock_irq(&freezer->lock); 314 spin_lock_irq(&freezer->lock);
320 315
321 update_if_frozen(cgroup, freezer); 316 update_if_frozen(cgroup, freezer);
322 if (goal_state == freezer->state)
323 goto out;
324 317
325 switch (goal_state) { 318 switch (goal_state) {
326 case CGROUP_THAWED: 319 case CGROUP_THAWED:
320 if (freezer->state != CGROUP_THAWED)
321 atomic_dec(&system_freezing_cnt);
322 freezer->state = CGROUP_THAWED;
327 unfreeze_cgroup(cgroup, freezer); 323 unfreeze_cgroup(cgroup, freezer);
328 break; 324 break;
329 case CGROUP_FROZEN: 325 case CGROUP_FROZEN:
326 if (freezer->state == CGROUP_THAWED)
327 atomic_inc(&system_freezing_cnt);
328 freezer->state = CGROUP_FREEZING;
330 retval = try_to_freeze_cgroup(cgroup, freezer); 329 retval = try_to_freeze_cgroup(cgroup, freezer);
331 break; 330 break;
332 default: 331 default:
333 BUG(); 332 BUG();
334 } 333 }
335out: 334
336 spin_unlock_irq(&freezer->lock); 335 spin_unlock_irq(&freezer->lock);
337 336
338 return retval; 337 return retval;
@@ -381,10 +380,5 @@ struct cgroup_subsys freezer_subsys = {
381 .populate = freezer_populate, 380 .populate = freezer_populate,
382 .subsys_id = freezer_subsys_id, 381 .subsys_id = freezer_subsys_id,
383 .can_attach = freezer_can_attach, 382 .can_attach = freezer_can_attach,
384 .can_attach_task = freezer_can_attach_task,
385 .pre_attach = NULL,
386 .attach_task = NULL,
387 .attach = NULL,
388 .fork = freezer_fork, 383 .fork = freezer_fork,
389 .exit = NULL,
390}; 384};
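
cgroup_freezing() replaces cgroup_freezing_or_frozen() and only needs rcu_read_lock(), which it takes itself, instead of task_lock(). A minimal sketch of a caller is below; example_should_freeze() is an invented, simplified decision helper, not the real freezer-core logic.

static bool example_should_freeze(struct task_struct *p)
{
	/* system_freezing_cnt is bumped above whenever a freezer cgroup
	 * leaves THAWED, so the common case stays a single atomic read.
	 */
	if (!atomic_read(&system_freezing_cnt))
		return false;

	/* slow path: is p's own freezer cgroup FREEZING or FROZEN? */
	return cgroup_freezing(p);
}
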
diff --git a/kernel/compat.c b/kernel/compat.c
index e2435ee9993a..f346cedfe24d 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,6 +21,7 @@
21#include <linux/unistd.h> 21#include <linux/unistd.h>
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/timex.h> 23#include <linux/timex.h>
24#include <linux/export.h>
24#include <linux/migrate.h> 25#include <linux/migrate.h>
25#include <linux/posix-timers.h> 26#include <linux/posix-timers.h>
26#include <linux/times.h> 27#include <linux/times.h>
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 12b7458f23b1..2060c6e57027 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,11 +10,12 @@
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/unistd.h> 11#include <linux/unistd.h>
12#include <linux/cpu.h> 12#include <linux/cpu.h>
13#include <linux/module.h> 13#include <linux/export.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/gfp.h> 17#include <linux/gfp.h>
18#include <linux/suspend.h>
18 19
19#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
20/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 21/* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -177,8 +178,7 @@ static inline void check_for_tasks(int cpu)
177 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
178 for_each_process(p) { 179 for_each_process(p) {
179 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
180 (!cputime_eq(p->utime, cputime_zero) || 181 (p->utime || p->stime))
181 !cputime_eq(p->stime, cputime_zero)))
182 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 182 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
183 "(state = %ld, flags = %x)\n", 183 "(state = %ld, flags = %x)\n",
184 p->comm, task_pid_nr(p), cpu, 184 p->comm, task_pid_nr(p), cpu,
@@ -379,6 +379,7 @@ out:
379 cpu_maps_update_done(); 379 cpu_maps_update_done();
380 return err; 380 return err;
381} 381}
382EXPORT_SYMBOL_GPL(cpu_up);
382 383
383#ifdef CONFIG_PM_SLEEP_SMP 384#ifdef CONFIG_PM_SLEEP_SMP
384static cpumask_var_t frozen_cpus; 385static cpumask_var_t frozen_cpus;
@@ -469,13 +470,86 @@ out:
469 cpu_maps_update_done(); 470 cpu_maps_update_done();
470} 471}
471 472
472static int alloc_frozen_cpus(void) 473static int __init alloc_frozen_cpus(void)
473{ 474{
474 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) 475 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
475 return -ENOMEM; 476 return -ENOMEM;
476 return 0; 477 return 0;
477} 478}
478core_initcall(alloc_frozen_cpus); 479core_initcall(alloc_frozen_cpus);
480
481/*
482 * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
483 * hotplug when tasks are about to be frozen. Also, don't allow the freezer
484 * to continue until any currently running CPU hotplug operation gets
485 * completed.
486 * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
487 * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
488 * CPU hotplug path and released only after it is complete. Thus, we
489 * (and hence the freezer) will block here until any currently running CPU
490 * hotplug operation gets completed.
491 */
492void cpu_hotplug_disable_before_freeze(void)
493{
494 cpu_maps_update_begin();
495 cpu_hotplug_disabled = 1;
496 cpu_maps_update_done();
497}
498
499
500/*
501 * When tasks have been thawed, re-enable regular CPU hotplug (which had been
502 * disabled while beginning to freeze tasks).
503 */
504void cpu_hotplug_enable_after_thaw(void)
505{
506 cpu_maps_update_begin();
507 cpu_hotplug_disabled = 0;
508 cpu_maps_update_done();
509}
510
511/*
512 * When callbacks for CPU hotplug notifications are being executed, we must
513 * ensure that the state of the system with respect to the tasks being frozen
514 * or not, as reported by the notification, remains unchanged *throughout the
515 * duration* of the execution of the callbacks.
516 * Hence we need to prevent the freezer from racing with regular CPU hotplug.
517 *
518 * This synchronization is implemented by mutually excluding regular CPU
519 * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
520 * Hibernate notifications.
521 */
522static int
523cpu_hotplug_pm_callback(struct notifier_block *nb,
524 unsigned long action, void *ptr)
525{
526 switch (action) {
527
528 case PM_SUSPEND_PREPARE:
529 case PM_HIBERNATION_PREPARE:
530 cpu_hotplug_disable_before_freeze();
531 break;
532
533 case PM_POST_SUSPEND:
534 case PM_POST_HIBERNATION:
535 cpu_hotplug_enable_after_thaw();
536 break;
537
538 default:
539 return NOTIFY_DONE;
540 }
541
542 return NOTIFY_OK;
543}
544
545
546static int __init cpu_hotplug_pm_sync_init(void)
547{
548 pm_notifier(cpu_hotplug_pm_callback, 0);
549 return 0;
550}
551core_initcall(cpu_hotplug_pm_sync_init);
552
479#endif /* CONFIG_PM_SLEEP_SMP */ 553#endif /* CONFIG_PM_SLEEP_SMP */
480 554
481/** 555/**
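
cpu_hotplug_pm_callback() above is an ordinary PM notifier: it sets cpu_hotplug_disabled before tasks are frozen and clears it after they are thawed, serializing against in-flight hotplug through cpu_add_remove_lock. The same hook points are available to any code that must stay quiescent across the freeze window; the sketch below assumes a made-up subsystem with example_quiesce()/example_resume() helpers.

#include <linux/suspend.h>
#include <linux/notifier.h>

static int example_pm_callback(struct notifier_block *nb,
			       unsigned long action, void *ptr)
{
	switch (action) {
	case PM_SUSPEND_PREPARE:
	case PM_HIBERNATION_PREPARE:
		example_quiesce();	/* hypothetical helper */
		return NOTIFY_OK;
	case PM_POST_SUSPEND:
	case PM_POST_HIBERNATION:
		example_resume();	/* hypothetical helper */
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

static int __init example_pm_sync_init(void)
{
	pm_notifier(example_pm_callback, 0);
	return 0;
}
core_initcall(example_pm_sync_init);
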
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
new file mode 100644
index 000000000000..249152e15308
--- /dev/null
+++ b/kernel/cpu_pm.c
@@ -0,0 +1,233 @@
1/*
2 * Copyright (C) 2011 Google, Inc.
3 *
4 * Author:
5 * Colin Cross <ccross@android.com>
6 *
7 * This software is licensed under the terms of the GNU General Public
8 * License version 2, as published by the Free Software Foundation, and
9 * may be copied, distributed, and modified under those terms.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/cpu_pm.h>
20#include <linux/module.h>
21#include <linux/notifier.h>
22#include <linux/spinlock.h>
23#include <linux/syscore_ops.h>
24
25static DEFINE_RWLOCK(cpu_pm_notifier_lock);
26static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
27
28static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
29{
30 int ret;
31
32 ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
33 nr_to_call, nr_calls);
34
35 return notifier_to_errno(ret);
36}
37
38/**
39 * cpu_pm_register_notifier - register a driver with cpu_pm
40 * @nb: notifier block to register
41 *
42 * Add a driver to a list of drivers that are notified about
43 * CPU and CPU cluster low power entry and exit.
44 *
45 * This function may sleep, and has the same return conditions as
46 * raw_notifier_chain_register.
47 */
48int cpu_pm_register_notifier(struct notifier_block *nb)
49{
50 unsigned long flags;
51 int ret;
52
53 write_lock_irqsave(&cpu_pm_notifier_lock, flags);
54 ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
55 write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
56
57 return ret;
58}
59EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
60
61/**
62 * cpu_pm_unregister_notifier - unregister a driver with cpu_pm
63 * @nb: notifier block to be unregistered
64 *
65 * Remove a driver from the CPU PM notifier list.
66 *
67 * This function may sleep, and has the same return conditions as
68 * raw_notifier_chain_unregister.
69 */
70int cpu_pm_unregister_notifier(struct notifier_block *nb)
71{
72 unsigned long flags;
73 int ret;
74
75 write_lock_irqsave(&cpu_pm_notifier_lock, flags);
76 ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
77 write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
78
79 return ret;
80}
81EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
82
83/**
 84 * cpu_pm_enter - CPU low power entry notifier
85 *
86 * Notifies listeners that a single CPU is entering a low power state that may
87 * cause some blocks in the same power domain as the cpu to reset.
88 *
89 * Must be called on the affected CPU with interrupts disabled. Platform is
90 * responsible for ensuring that cpu_pm_enter is not called twice on the same
91 * CPU before cpu_pm_exit is called. Notified drivers can include VFP
 92 * co-processor, interrupt controller and its PM extensions, local CPU
93 * timers context save/restore which shouldn't be interrupted. Hence it
94 * must be called with interrupts disabled.
95 *
96 * Return conditions are same as __raw_notifier_call_chain.
97 */
98int cpu_pm_enter(void)
99{
100 int nr_calls;
101 int ret = 0;
102
103 read_lock(&cpu_pm_notifier_lock);
104 ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
105 if (ret)
106 /*
 107 * Inform the (nr_calls - 1) listeners that were already notified
 108 * that CPU PM entry has failed, so they can undo their preparation.
109 */
110 cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
111 read_unlock(&cpu_pm_notifier_lock);
112
113 return ret;
114}
115EXPORT_SYMBOL_GPL(cpu_pm_enter);
116
117/**
 118 * cpu_pm_exit - CPU low power exit notifier
119 *
120 * Notifies listeners that a single CPU is exiting a low power state that may
121 * have caused some blocks in the same power domain as the cpu to reset.
122 *
123 * Notified drivers can include VFP co-processor, interrupt controller
 124 * and its PM extensions, local CPU timers context save/restore which
125 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
126 *
127 * Return conditions are same as __raw_notifier_call_chain.
128 */
129int cpu_pm_exit(void)
130{
131 int ret;
132
133 read_lock(&cpu_pm_notifier_lock);
134 ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
135 read_unlock(&cpu_pm_notifier_lock);
136
137 return ret;
138}
139EXPORT_SYMBOL_GPL(cpu_pm_exit);
140
141/**
 142 * cpu_cluster_pm_enter - CPU cluster low power entry notifier
143 *
144 * Notifies listeners that all cpus in a power domain are entering a low power
145 * state that may cause some blocks in the same power domain to reset.
146 *
147 * Must be called after cpu_pm_enter has been called on all cpus in the power
148 * domain, and before cpu_pm_exit has been called on any cpu in the power
149 * domain. Notified drivers can include VFP co-processor, interrupt controller
 150 * and its PM extensions, local CPU timers context save/restore which
151 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
152 *
153 * Must be called with interrupts disabled.
154 *
155 * Return conditions are same as __raw_notifier_call_chain.
156 */
157int cpu_cluster_pm_enter(void)
158{
159 int nr_calls;
160 int ret = 0;
161
162 read_lock(&cpu_pm_notifier_lock);
163 ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
164 if (ret)
165 /*
 166 * Inform the (nr_calls - 1) listeners that were already notified
 167 * that CPU cluster PM entry has failed, so they can undo their preparation.
168 */
169 cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
170 read_unlock(&cpu_pm_notifier_lock);
171
172 return ret;
173}
174EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
175
176/**
 177 * cpu_cluster_pm_exit - CPU cluster low power exit notifier
178 *
 179 * Notifies listeners that all cpus in a power domain are exiting from a
180 * low power state that may have caused some blocks in the same power domain
181 * to reset.
182 *
 183 * Must be called after cpu_cluster_pm_enter has been called for the power
 184 * domain, and before cpu_pm_exit has been called on any cpu in the power
185 * domain. Notified drivers can include VFP co-processor, interrupt controller
 186 * and its PM extensions, local CPU timers context save/restore which
187 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
188 *
189 * Return conditions are same as __raw_notifier_call_chain.
190 */
191int cpu_cluster_pm_exit(void)
192{
193 int ret;
194
195 read_lock(&cpu_pm_notifier_lock);
196 ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
197 read_unlock(&cpu_pm_notifier_lock);
198
199 return ret;
200}
201EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
202
203#ifdef CONFIG_PM
204static int cpu_pm_suspend(void)
205{
206 int ret;
207
208 ret = cpu_pm_enter();
209 if (ret)
210 return ret;
211
212 ret = cpu_cluster_pm_enter();
213 return ret;
214}
215
216static void cpu_pm_resume(void)
217{
218 cpu_cluster_pm_exit();
219 cpu_pm_exit();
220}
221
222static struct syscore_ops cpu_pm_syscore_ops = {
223 .suspend = cpu_pm_suspend,
224 .resume = cpu_pm_resume,
225};
226
227static int cpu_pm_init(void)
228{
229 register_syscore_ops(&cpu_pm_syscore_ops);
230 return 0;
231}
232core_initcall(cpu_pm_init);
233#endif
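
From the driver side, the new kernel/cpu_pm.c API is consumed by registering a notifier with cpu_pm_register_notifier() and saving/restoring hardware state on the events; CPU_PM_ENTER_FAILED arrives when a later notifier vetoed the entry, so any save must be undone. The sketch below assumes invented save/restore helpers for a per-CPU and a per-cluster block.

#include <linux/cpu_pm.h>
#include <linux/notifier.h>
#include <linux/init.h>

static int example_cpu_pm_notify(struct notifier_block *nb,
				 unsigned long action, void *data)
{
	switch (action) {
	case CPU_PM_ENTER:
		example_save_cpu_context();		/* hypothetical */
		break;
	case CPU_PM_ENTER_FAILED:			/* entry was aborted */
	case CPU_PM_EXIT:
		example_restore_cpu_context();		/* hypothetical */
		break;
	case CPU_CLUSTER_PM_ENTER:
		example_save_cluster_context();		/* hypothetical */
		break;
	case CPU_CLUSTER_PM_ENTER_FAILED:
	case CPU_CLUSTER_PM_EXIT:
		example_restore_cluster_context();	/* hypothetical */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_cpu_pm_nb = {
	.notifier_call = example_cpu_pm_notify,
};

static int __init example_cpu_pm_init(void)
{
	return cpu_pm_register_notifier(&example_cpu_pm_nb);
}
core_initcall(example_cpu_pm_init);
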
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 10131fdaff70..a09ac2b9a661 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -37,7 +37,7 @@
37#include <linux/mempolicy.h> 37#include <linux/mempolicy.h>
38#include <linux/mm.h> 38#include <linux/mm.h>
39#include <linux/memory.h> 39#include <linux/memory.h>
40#include <linux/module.h> 40#include <linux/export.h>
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/namei.h> 42#include <linux/namei.h>
43#include <linux/pagemap.h> 43#include <linux/pagemap.h>
@@ -123,6 +123,19 @@ static inline struct cpuset *task_cs(struct task_struct *task)
123 struct cpuset, css); 123 struct cpuset, css);
124} 124}
125 125
126#ifdef CONFIG_NUMA
127static inline bool task_has_mempolicy(struct task_struct *task)
128{
129 return task->mempolicy;
130}
131#else
132static inline bool task_has_mempolicy(struct task_struct *task)
133{
134 return false;
135}
136#endif
137
138
126/* bits in struct cpuset flags field */ 139/* bits in struct cpuset flags field */
127typedef enum { 140typedef enum {
128 CS_CPU_EXCLUSIVE, 141 CS_CPU_EXCLUSIVE,
@@ -949,6 +962,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
949static void cpuset_change_task_nodemask(struct task_struct *tsk, 962static void cpuset_change_task_nodemask(struct task_struct *tsk,
950 nodemask_t *newmems) 963 nodemask_t *newmems)
951{ 964{
965 bool need_loop;
966
952repeat: 967repeat:
953 /* 968 /*
954 * Allow tasks that have access to memory reserves because they have 969 * Allow tasks that have access to memory reserves because they have
@@ -960,10 +975,17 @@ repeat:
960 return; 975 return;
961 976
962 task_lock(tsk); 977 task_lock(tsk);
978 /*
979 * Determine if a loop is necessary if another thread is doing
980 * get_mems_allowed(). If at least one node remains unchanged and
981 * tsk does not have a mempolicy, then an empty nodemask will not be
982 * possible when mems_allowed is larger than a word.
983 */
984 need_loop = task_has_mempolicy(tsk) ||
985 !nodes_intersects(*newmems, tsk->mems_allowed);
963 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 986 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
964 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); 987 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
965 988
966
967 /* 989 /*
968 * ensure checking ->mems_allowed_change_disable after setting all new 990 * ensure checking ->mems_allowed_change_disable after setting all new
969 * allowed nodes. 991 * allowed nodes.
@@ -982,7 +1004,7 @@ repeat:
982 * Allocation of memory is very fast, we needn't sleep when waiting 1004 * Allocation of memory is very fast, we needn't sleep when waiting
983 * for the read-side. 1005 * for the read-side.
984 */ 1006 */
985 while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { 1007 while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
986 task_unlock(tsk); 1008 task_unlock(tsk);
987 if (!task_curr(tsk)) 1009 if (!task_curr(tsk))
988 yield(); 1010 yield();
@@ -1367,79 +1389,73 @@ static int fmeter_getrate(struct fmeter *fmp)
1367 return val; 1389 return val;
1368} 1390}
1369 1391
1370/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1371static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1372 struct task_struct *tsk)
1373{
1374 struct cpuset *cs = cgroup_cs(cont);
1375
1376 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1377 return -ENOSPC;
1378
1379 /*
1380 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
1381 * cannot change their cpu affinity and isolating such threads by their
1382 * set of allowed nodes is unnecessary. Thus, cpusets are not
1383 * applicable for such threads. This prevents checking for success of
1384 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
1385 * be changed.
1386 */
1387 if (tsk->flags & PF_THREAD_BOUND)
1388 return -EINVAL;
1389
1390 return 0;
1391}
1392
1393static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
1394{
1395 return security_task_setscheduler(task);
1396}
1397
1398/* 1392/*
1399 * Protected by cgroup_lock. The nodemasks must be stored globally because 1393 * Protected by cgroup_lock. The nodemasks must be stored globally because
1400 * dynamically allocating them is not allowed in pre_attach, and they must 1394 * dynamically allocating them is not allowed in can_attach, and they must
1401 * persist among pre_attach, attach_task, and attach. 1395 * persist until attach.
1402 */ 1396 */
1403static cpumask_var_t cpus_attach; 1397static cpumask_var_t cpus_attach;
1404static nodemask_t cpuset_attach_nodemask_from; 1398static nodemask_t cpuset_attach_nodemask_from;
1405static nodemask_t cpuset_attach_nodemask_to; 1399static nodemask_t cpuset_attach_nodemask_to;
1406 1400
1407/* Set-up work for before attaching each task. */ 1401/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1408static void cpuset_pre_attach(struct cgroup *cont) 1402static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1403 struct cgroup_taskset *tset)
1409{ 1404{
1410 struct cpuset *cs = cgroup_cs(cont); 1405 struct cpuset *cs = cgroup_cs(cgrp);
1406 struct task_struct *task;
1407 int ret;
1408
1409 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1410 return -ENOSPC;
1411
1412 cgroup_taskset_for_each(task, cgrp, tset) {
1413 /*
1414 * Kthreads bound to specific cpus cannot be moved to a new
1415 * cpuset; we cannot change their cpu affinity and
1416 * isolating such threads by their set of allowed nodes is
1417 * unnecessary. Thus, cpusets are not applicable for such
1418 * threads. This prevents checking for success of
1419 * set_cpus_allowed_ptr() on all attached tasks before
1420 * cpus_allowed may be changed.
1421 */
1422 if (task->flags & PF_THREAD_BOUND)
1423 return -EINVAL;
1424 if ((ret = security_task_setscheduler(task)))
1425 return ret;
1426 }
1411 1427
1428 /* prepare for attach */
1412 if (cs == &top_cpuset) 1429 if (cs == &top_cpuset)
1413 cpumask_copy(cpus_attach, cpu_possible_mask); 1430 cpumask_copy(cpus_attach, cpu_possible_mask);
1414 else 1431 else
1415 guarantee_online_cpus(cs, cpus_attach); 1432 guarantee_online_cpus(cs, cpus_attach);
1416 1433
1417 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1434 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1418}
1419 1435
1420/* Per-thread attachment work. */ 1436 return 0;
1421static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
1422{
1423 int err;
1424 struct cpuset *cs = cgroup_cs(cont);
1425
1426 /*
1427 * can_attach beforehand should guarantee that this doesn't fail.
1428 * TODO: have a better way to handle failure here
1429 */
1430 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1431 WARN_ON_ONCE(err);
1432
1433 cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
1434 cpuset_update_task_spread_flag(cs, tsk);
1435} 1437}
1436 1438
1437static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1439static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1438 struct cgroup *oldcont, struct task_struct *tsk) 1440 struct cgroup_taskset *tset)
1439{ 1441{
1440 struct mm_struct *mm; 1442 struct mm_struct *mm;
1441 struct cpuset *cs = cgroup_cs(cont); 1443 struct task_struct *task;
1442 struct cpuset *oldcs = cgroup_cs(oldcont); 1444 struct task_struct *leader = cgroup_taskset_first(tset);
1445 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
1446 struct cpuset *cs = cgroup_cs(cgrp);
1447 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1448
1449 cgroup_taskset_for_each(task, cgrp, tset) {
1450 /*
1451 * can_attach beforehand should guarantee that this doesn't
1452 * fail. TODO: have a better way to handle failure here
1453 */
1454 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
1455
1456 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
1457 cpuset_update_task_spread_flag(cs, task);
1458 }
1443 1459
1444 /* 1460 /*
1445 * Change mm, possibly for multiple threads in a threadgroup. This is 1461 * Change mm, possibly for multiple threads in a threadgroup. This is
@@ -1447,7 +1463,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1447 */ 1463 */
1448 cpuset_attach_nodemask_from = oldcs->mems_allowed; 1464 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1449 cpuset_attach_nodemask_to = cs->mems_allowed; 1465 cpuset_attach_nodemask_to = cs->mems_allowed;
1450 mm = get_task_mm(tsk); 1466 mm = get_task_mm(leader);
1451 if (mm) { 1467 if (mm) {
1452 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1468 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1453 if (is_memory_migrate(cs)) 1469 if (is_memory_migrate(cs))
@@ -1903,9 +1919,6 @@ struct cgroup_subsys cpuset_subsys = {
1903 .create = cpuset_create, 1919 .create = cpuset_create,
1904 .destroy = cpuset_destroy, 1920 .destroy = cpuset_destroy,
1905 .can_attach = cpuset_can_attach, 1921 .can_attach = cpuset_can_attach,
1906 .can_attach_task = cpuset_can_attach_task,
1907 .pre_attach = cpuset_pre_attach,
1908 .attach_task = cpuset_attach_task,
1909 .attach = cpuset_attach, 1922 .attach = cpuset_attach,
1910 .populate = cpuset_populate, 1923 .populate = cpuset_populate,
1911 .post_clone = cpuset_post_clone, 1924 .post_clone = cpuset_post_clone,
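
The need_loop test added to cpuset_change_task_nodemask() skips the yield loop when the old and new nodemasks share at least one node and the task has no mempolicy, since a concurrent reader then never sees an empty mask during the two-step update. Below is a user-space approximation of the predicate with invented mask values; nodemasks are reduced to plain bitmasks so the example stays runnable.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for
 *   need_loop = task_has_mempolicy(tsk) ||
 *               !nodes_intersects(*newmems, tsk->mems_allowed);
 */
static bool need_loop(unsigned long old_mems, unsigned long new_mems,
		      bool has_mempolicy)
{
	return has_mempolicy || !(old_mems & new_mems);
}

int main(void)
{
	/* {0,1} -> {1,2}: node 1 stays set through the nodes_or()/store
	 * sequence, so no retry loop is needed -- prints 0.
	 */
	printf("%d\n", need_loop(0x3, 0x6, false));

	/* {0} -> {3}: disjoint masks, a reader could observe an empty
	 * mask, so the loop is still required -- prints 1.
	 */
	printf("%d\n", need_loop(0x1, 0x8, false));
	return 0;
}
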
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index 5f85690285d4..c766ee54c0b1 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -2,7 +2,7 @@
2#include <linux/crash_dump.h> 2#include <linux/crash_dump.h>
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/errno.h> 4#include <linux/errno.h>
5#include <linux/module.h> 5#include <linux/export.h>
6 6
7/* 7/*
8 * If we have booted due to a crash, max_pfn will be a very low value. We need 8 * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -20,8 +20,15 @@ unsigned long saved_max_pfn;
20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; 20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
21 21
22/* 22/*
 23 * stores the size of the ELF header of the crash image
24 */
25unsigned long long elfcorehdr_size;
26
27/*
23 * elfcorehdr= specifies the location of elf core header stored by the crashed 28 * elfcorehdr= specifies the location of elf core header stored by the crashed
24 * kernel. This option will be passed by kexec loader to the capture kernel. 29 * kernel. This option will be passed by kexec loader to the capture kernel.
30 *
31 * Syntax: elfcorehdr=[size[KMG]@]offset[KMG]
25 */ 32 */
26static int __init setup_elfcorehdr(char *arg) 33static int __init setup_elfcorehdr(char *arg)
27{ 34{
@@ -29,6 +36,10 @@ static int __init setup_elfcorehdr(char *arg)
29 if (!arg) 36 if (!arg)
30 return -EINVAL; 37 return -EINVAL;
31 elfcorehdr_addr = memparse(arg, &end); 38 elfcorehdr_addr = memparse(arg, &end);
39 if (*end == '@') {
40 elfcorehdr_size = elfcorehdr_addr;
41 elfcorehdr_addr = memparse(end + 1, &end);
42 }
32 return end > arg ? 0 : -EINVAL; 43 return end > arg ? 0 : -EINVAL;
33} 44}
34early_param("elfcorehdr", setup_elfcorehdr); 45early_param("elfcorehdr", setup_elfcorehdr);
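
With the extended syntax, booting the capture kernel with e.g. elfcorehdr=0x20000@0x44000000 (values invented for illustration) sets elfcorehdr_size to 0x20000 and elfcorehdr_addr to 0x44000000, while the old single-value form still sets only the address. A user-space approximation of the two-step parse is below; the kernel uses memparse(), which additionally understands the K/M/G suffixes, and strtoull merely stands in for it here.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *arg = "0x20000@0x44000000";	/* made-up values */
	char *end;
	unsigned long long size = 0;
	unsigned long long addr = strtoull(arg, &end, 0);

	if (*end == '@') {	/* the first number was actually the size */
		size = addr;
		addr = strtoull(end + 1, &end, 0);
	}
	printf("elfcorehdr: size=%#llx addr=%#llx\n", size, addr);
	return 0;
}
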
diff --git a/kernel/cred.c b/kernel/cred.c
index 8ef31f53c44c..5791612a4045 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -8,7 +8,7 @@
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11#include <linux/module.h> 11#include <linux/export.h>
12#include <linux/cred.h> 12#include <linux/cred.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
@@ -644,6 +644,9 @@ void __init cred_init(void)
644 */ 644 */
645struct cred *prepare_kernel_cred(struct task_struct *daemon) 645struct cred *prepare_kernel_cred(struct task_struct *daemon)
646{ 646{
647#ifdef CONFIG_KEYS
648 struct thread_group_cred *tgcred;
649#endif
647 const struct cred *old; 650 const struct cred *old;
648 struct cred *new; 651 struct cred *new;
649 652
@@ -651,6 +654,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
651 if (!new) 654 if (!new)
652 return NULL; 655 return NULL;
653 656
657#ifdef CONFIG_KEYS
658 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
659 if (!tgcred) {
660 kmem_cache_free(cred_jar, new);
661 return NULL;
662 }
663#endif
664
654 kdebug("prepare_kernel_cred() alloc %p", new); 665 kdebug("prepare_kernel_cred() alloc %p", new);
655 666
656 if (daemon) 667 if (daemon)
@@ -667,8 +678,11 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
667 get_group_info(new->group_info); 678 get_group_info(new->group_info);
668 679
669#ifdef CONFIG_KEYS 680#ifdef CONFIG_KEYS
670 atomic_inc(&init_tgcred.usage); 681 atomic_set(&tgcred->usage, 1);
671 new->tgcred = &init_tgcred; 682 spin_lock_init(&tgcred->lock);
683 tgcred->process_keyring = NULL;
684 tgcred->session_keyring = NULL;
685 new->tgcred = tgcred;
672 new->request_key_auth = NULL; 686 new->request_key_auth = NULL;
673 new->thread_keyring = NULL; 687 new->thread_keyring = NULL;
674 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; 688 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 34872482315e..c22d8c28ad84 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -217,7 +217,7 @@ void gdbstub_msg_write(const char *s, int len)
217 217
218 /* Pack in hex chars */ 218 /* Pack in hex chars */
219 for (i = 0; i < wcount; i++) 219 for (i = 0; i < wcount; i++)
220 bufptr = pack_hex_byte(bufptr, s[i]); 220 bufptr = hex_byte_pack(bufptr, s[i]);
221 *bufptr = '\0'; 221 *bufptr = '\0';
222 222
223 /* Move up */ 223 /* Move up */
@@ -249,7 +249,7 @@ char *kgdb_mem2hex(char *mem, char *buf, int count)
249 if (err) 249 if (err)
250 return NULL; 250 return NULL;
251 while (count > 0) { 251 while (count > 0) {
252 buf = pack_hex_byte(buf, *tmp); 252 buf = hex_byte_pack(buf, *tmp);
253 tmp++; 253 tmp++;
254 count--; 254 count--;
255 } 255 }
@@ -411,14 +411,14 @@ static char *pack_threadid(char *pkt, unsigned char *id)
411 limit = id + (BUF_THREAD_ID_SIZE / 2); 411 limit = id + (BUF_THREAD_ID_SIZE / 2);
412 while (id < limit) { 412 while (id < limit) {
413 if (!lzero || *id != 0) { 413 if (!lzero || *id != 0) {
414 pkt = pack_hex_byte(pkt, *id); 414 pkt = hex_byte_pack(pkt, *id);
415 lzero = 0; 415 lzero = 0;
416 } 416 }
417 id++; 417 id++;
418 } 418 }
419 419
420 if (lzero) 420 if (lzero)
421 pkt = pack_hex_byte(pkt, 0); 421 pkt = hex_byte_pack(pkt, 0);
422 422
423 return pkt; 423 return pkt;
424} 424}
@@ -486,7 +486,7 @@ static void gdb_cmd_status(struct kgdb_state *ks)
486 dbg_remove_all_break(); 486 dbg_remove_all_break();
487 487
488 remcom_out_buffer[0] = 'S'; 488 remcom_out_buffer[0] = 'S';
489 pack_hex_byte(&remcom_out_buffer[1], ks->signo); 489 hex_byte_pack(&remcom_out_buffer[1], ks->signo);
490} 490}
491 491
492static void gdb_get_regs_helper(struct kgdb_state *ks) 492static void gdb_get_regs_helper(struct kgdb_state *ks)
@@ -954,7 +954,7 @@ int gdb_serial_stub(struct kgdb_state *ks)
954 /* Reply to host that an exception has occurred */ 954 /* Reply to host that an exception has occurred */
955 ptr = remcom_out_buffer; 955 ptr = remcom_out_buffer;
956 *ptr++ = 'T'; 956 *ptr++ = 'T';
957 ptr = pack_hex_byte(ptr, ks->signo); 957 ptr = hex_byte_pack(ptr, ks->signo);
958 ptr += strlen(strcpy(ptr, "thread:")); 958 ptr += strlen(strcpy(ptr, "thread:"));
959 int_to_threadref(thref, shadow_pid(current->pid)); 959 int_to_threadref(thref, shadow_pid(current->pid));
960 ptr = pack_threadid(ptr, thref); 960 ptr = pack_threadid(ptr, thref);
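
The gdbstub changes are a mechanical rename of pack_hex_byte() to hex_byte_pack(); the helper still just appends the two hex digits of a byte and returns the advanced buffer pointer. A stand-alone sketch of the equivalent behaviour follows (the demo function name is invented).

#include <stdio.h>

static char *hex_byte_pack_demo(char *buf, unsigned char byte)
{
	static const char hex_asc[] = "0123456789abcdef";

	*buf++ = hex_asc[byte >> 4];	/* high nibble first */
	*buf++ = hex_asc[byte & 0x0f];
	return buf;
}

int main(void)
{
	char out[8];
	char *p = hex_byte_pack_demo(out, 0x4a);

	*p = '\0';
	printf("%s\n", out);	/* prints "4a" */
	return 0;
}
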
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index d9ca9aa481ec..8b68ce78ff17 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -11,6 +11,7 @@
11#include <linux/kgdb.h> 11#include <linux/kgdb.h>
12#include <linux/kdb.h> 12#include <linux/kdb.h>
13#include <linux/kdebug.h> 13#include <linux/kdebug.h>
14#include <linux/export.h>
14#include "kdb_private.h" 15#include "kdb_private.h"
15#include "../debug_core.h" 16#include "../debug_core.h"
16 17
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 63786e71a3cd..e2ae7349437f 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1982,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv)
1982 kdb_printf("%-20s%8u 0x%p ", mod->name, 1982 kdb_printf("%-20s%8u 0x%p ", mod->name,
1983 mod->core_size, (void *)mod); 1983 mod->core_size, (void *)mod);
1984#ifdef CONFIG_MODULE_UNLOAD 1984#ifdef CONFIG_MODULE_UNLOAD
1985 kdb_printf("%4d ", module_refcount(mod)); 1985 kdb_printf("%4ld ", module_refcount(mod));
1986#endif 1986#endif
1987 if (mod->state == MODULE_STATE_GOING) 1987 if (mod->state == MODULE_STATE_GOING)
1988 kdb_printf(" (Unloading)"); 1988 kdb_printf(" (Unloading)");
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 5532dd37aa86..7d6fb40d2188 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p)
636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' : 636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
637 (p->exit_state & EXIT_DEAD) ? 'E' : 637 (p->exit_state & EXIT_DEAD) ? 'E' :
638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; 638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
639 if (p->pid == 0) { 639 if (is_idle_task(p)) {
640 /* Idle task. Is it really idle, apart from the kdb 640 /* Idle task. Is it really idle, apart from the kdb
641 * interrupt? */ 641 * interrupt? */
642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { 642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
diff --git a/kernel/dma.c b/kernel/dma.c
index f903189c5304..68a2306522c8 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -9,7 +9,7 @@
9 * [It also happened to remove the sizeof(char *) == sizeof(int) 9 * [It also happened to remove the sizeof(char *) == sizeof(int)
10 * assumption introduced because of those /proc/dma patches. -- Hennus] 10 * assumption introduced because of those /proc/dma patches. -- Hennus]
11 */ 11 */
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 89e5e8aa4c36..22d901f9caf4 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = -pg
3endif 3endif
4 4
5obj-y := core.o ring_buffer.o 5obj-y := core.o ring_buffer.o callchain.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
new file mode 100644
index 000000000000..057e24b665cf
--- /dev/null
+++ b/kernel/events/callchain.c
@@ -0,0 +1,191 @@
1/*
2 * Performance events callchain code, extracted from core.c:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/perf_event.h>
13#include <linux/slab.h>
14#include "internal.h"
15
16struct callchain_cpus_entries {
17 struct rcu_head rcu_head;
18 struct perf_callchain_entry *cpu_entries[0];
19};
20
21static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
22static atomic_t nr_callchain_events;
23static DEFINE_MUTEX(callchain_mutex);
24static struct callchain_cpus_entries *callchain_cpus_entries;
25
26
27__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
28 struct pt_regs *regs)
29{
30}
31
32__weak void perf_callchain_user(struct perf_callchain_entry *entry,
33 struct pt_regs *regs)
34{
35}
36
37static void release_callchain_buffers_rcu(struct rcu_head *head)
38{
39 struct callchain_cpus_entries *entries;
40 int cpu;
41
42 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
43
44 for_each_possible_cpu(cpu)
45 kfree(entries->cpu_entries[cpu]);
46
47 kfree(entries);
48}
49
50static void release_callchain_buffers(void)
51{
52 struct callchain_cpus_entries *entries;
53
54 entries = callchain_cpus_entries;
55 rcu_assign_pointer(callchain_cpus_entries, NULL);
56 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
57}
58
59static int alloc_callchain_buffers(void)
60{
61 int cpu;
62 int size;
63 struct callchain_cpus_entries *entries;
64
65 /*
66 * We can't use the percpu allocation API for data that can be
67 * accessed from NMI. Use a temporary manual per cpu allocation
68 * until that gets sorted out.
69 */
70 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
71
72 entries = kzalloc(size, GFP_KERNEL);
73 if (!entries)
74 return -ENOMEM;
75
76 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
77
78 for_each_possible_cpu(cpu) {
79 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
80 cpu_to_node(cpu));
81 if (!entries->cpu_entries[cpu])
82 goto fail;
83 }
84
85 rcu_assign_pointer(callchain_cpus_entries, entries);
86
87 return 0;
88
89fail:
90 for_each_possible_cpu(cpu)
91 kfree(entries->cpu_entries[cpu]);
92 kfree(entries);
93
94 return -ENOMEM;
95}
96
97int get_callchain_buffers(void)
98{
99 int err = 0;
100 int count;
101
102 mutex_lock(&callchain_mutex);
103
104 count = atomic_inc_return(&nr_callchain_events);
105 if (WARN_ON_ONCE(count < 1)) {
106 err = -EINVAL;
107 goto exit;
108 }
109
110 if (count > 1) {
111 /* If the allocation failed, give up */
112 if (!callchain_cpus_entries)
113 err = -ENOMEM;
114 goto exit;
115 }
116
117 err = alloc_callchain_buffers();
118 if (err)
119 release_callchain_buffers();
120exit:
121 mutex_unlock(&callchain_mutex);
122
123 return err;
124}
125
126void put_callchain_buffers(void)
127{
128 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
129 release_callchain_buffers();
130 mutex_unlock(&callchain_mutex);
131 }
132}
133
134static struct perf_callchain_entry *get_callchain_entry(int *rctx)
135{
136 int cpu;
137 struct callchain_cpus_entries *entries;
138
139 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
140 if (*rctx == -1)
141 return NULL;
142
143 entries = rcu_dereference(callchain_cpus_entries);
144 if (!entries)
145 return NULL;
146
147 cpu = smp_processor_id();
148
149 return &entries->cpu_entries[cpu][*rctx];
150}
151
152static void
153put_callchain_entry(int rctx)
154{
155 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
156}
157
158struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
159{
160 int rctx;
161 struct perf_callchain_entry *entry;
162
163
164 entry = get_callchain_entry(&rctx);
165 if (rctx == -1)
166 return NULL;
167
168 if (!entry)
169 goto exit_put;
170
171 entry->nr = 0;
172
173 if (!user_mode(regs)) {
174 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
175 perf_callchain_kernel(entry, regs);
176 if (current->mm)
177 regs = task_pt_regs(current);
178 else
179 regs = NULL;
180 }
181
182 if (regs) {
183 perf_callchain_store(entry, PERF_CONTEXT_USER);
184 perf_callchain_user(entry, regs);
185 }
186
187exit_put:
188 put_callchain_entry(rctx);
189
190 return entry;
191}
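The new callchain.c keeps a simple ownership scheme for the per-CPU sample buffers: the first event that needs callchains allocates one entry set per possible CPU under callchain_mutex, later events only bump nr_callchain_events, and the last put_callchain_buffers() hands the whole set to an RCU callback. The userspace sketch below models just that first-get-allocates / last-put-frees pattern; a pthread mutex stands in for the kernel mutex, plain free() for call_rcu(), and every name is illustrative rather than kernel API.

/*
 * Userspace analogue of get/put_callchain_buffers(): the first user
 * allocates the shared area under a mutex, later users only take a
 * reference, the last put frees everything.  Not kernel code.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_FAKE_CPUS 4

static pthread_mutex_t buf_mutex = PTHREAD_MUTEX_INITIALIZER;
static int nr_users;
static int *cpu_buffers[NR_FAKE_CPUS];

static int get_buffers(void)
{
	int cpu, err = 0;

	pthread_mutex_lock(&buf_mutex);
	if (nr_users++ > 0)		/* already allocated by a prior user */
		goto out;

	for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++) {
		cpu_buffers[cpu] = calloc(128, sizeof(int));
		if (!cpu_buffers[cpu]) {
			err = -1;
			break;
		}
	}
	if (err) {			/* roll back on partial failure */
		for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++) {
			free(cpu_buffers[cpu]);
			cpu_buffers[cpu] = NULL;
		}
		nr_users--;
	}
out:
	pthread_mutex_unlock(&buf_mutex);
	return err;
}

static void put_buffers(void)
{
	int cpu;

	pthread_mutex_lock(&buf_mutex);
	if (--nr_users == 0) {		/* last user tears everything down */
		for (cpu = 0; cpu < NR_FAKE_CPUS; cpu++) {
			free(cpu_buffers[cpu]);
			cpu_buffers[cpu] = NULL;
		}
	}
	pthread_mutex_unlock(&buf_mutex);
}

int main(void)
{
	if (get_buffers() == 0) {
		cpu_buffers[0][0] = 42;
		printf("slot 0 on cpu 0 = %d\n", cpu_buffers[0][0]);
		put_buffers();
	}
	return 0;
}

(The kernel version additionally remembers a previously failed allocation via the NULL callchain_cpus_entries check; the sketch only shows the refcounted lifetime.)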
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0f857782d06f..a8f4ac001a00 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
10 */ 10 */
@@ -25,6 +25,7 @@
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <linux/vmstat.h> 26#include <linux/vmstat.h>
27#include <linux/device.h> 27#include <linux/device.h>
28#include <linux/export.h>
28#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
29#include <linux/hardirq.h> 30#include <linux/hardirq.h>
30#include <linux/rculist.h> 31#include <linux/rculist.h>
@@ -127,7 +128,7 @@ enum event_type_t {
127 * perf_sched_events : >0 events exist 128 * perf_sched_events : >0 events exist
128 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 129 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
129 */ 130 */
130struct jump_label_key perf_sched_events __read_mostly; 131struct jump_label_key_deferred perf_sched_events __read_mostly;
131static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 132static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
132 133
133static atomic_t nr_mmap_events __read_mostly; 134static atomic_t nr_mmap_events __read_mostly;
@@ -184,6 +185,9 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
184static void update_context_time(struct perf_event_context *ctx); 185static void update_context_time(struct perf_event_context *ctx);
185static u64 perf_event_time(struct perf_event *event); 186static u64 perf_event_time(struct perf_event *event);
186 187
188static void ring_buffer_attach(struct perf_event *event,
189 struct ring_buffer *rb);
190
187void __weak perf_event_print_debug(void) { } 191void __weak perf_event_print_debug(void) { }
188 192
189extern __weak const char *perf_pmu_name(void) 193extern __weak const char *perf_pmu_name(void)
@@ -1126,6 +1130,8 @@ event_sched_out(struct perf_event *event,
1126 if (!is_software_event(event)) 1130 if (!is_software_event(event))
1127 cpuctx->active_oncpu--; 1131 cpuctx->active_oncpu--;
1128 ctx->nr_active--; 1132 ctx->nr_active--;
1133 if (event->attr.freq && event->attr.sample_freq)
1134 ctx->nr_freq--;
1129 if (event->attr.exclusive || !cpuctx->active_oncpu) 1135 if (event->attr.exclusive || !cpuctx->active_oncpu)
1130 cpuctx->exclusive = 0; 1136 cpuctx->exclusive = 0;
1131} 1137}
@@ -1321,6 +1327,7 @@ retry:
1321 } 1327 }
1322 raw_spin_unlock_irq(&ctx->lock); 1328 raw_spin_unlock_irq(&ctx->lock);
1323} 1329}
1330EXPORT_SYMBOL_GPL(perf_event_disable);
1324 1331
1325static void perf_set_shadow_time(struct perf_event *event, 1332static void perf_set_shadow_time(struct perf_event *event,
1326 struct perf_event_context *ctx, 1333 struct perf_event_context *ctx,
@@ -1402,6 +1409,8 @@ event_sched_in(struct perf_event *event,
1402 if (!is_software_event(event)) 1409 if (!is_software_event(event))
1403 cpuctx->active_oncpu++; 1410 cpuctx->active_oncpu++;
1404 ctx->nr_active++; 1411 ctx->nr_active++;
1412 if (event->attr.freq && event->attr.sample_freq)
1413 ctx->nr_freq++;
1405 1414
1406 if (event->attr.exclusive) 1415 if (event->attr.exclusive)
1407 cpuctx->exclusive = 1; 1416 cpuctx->exclusive = 1;
@@ -1658,8 +1667,7 @@ retry:
1658 * Note: this works for group members as well as group leaders 1667 * Note: this works for group members as well as group leaders
1659 * since the non-leader members' sibling_lists will be empty. 1668 * since the non-leader members' sibling_lists will be empty.
1660 */ 1669 */
1661static void __perf_event_mark_enabled(struct perf_event *event, 1670static void __perf_event_mark_enabled(struct perf_event *event)
1662 struct perf_event_context *ctx)
1663{ 1671{
1664 struct perf_event *sub; 1672 struct perf_event *sub;
1665 u64 tstamp = perf_event_time(event); 1673 u64 tstamp = perf_event_time(event);
@@ -1697,7 +1705,7 @@ static int __perf_event_enable(void *info)
1697 */ 1705 */
1698 perf_cgroup_set_timestamp(current, ctx); 1706 perf_cgroup_set_timestamp(current, ctx);
1699 1707
1700 __perf_event_mark_enabled(event, ctx); 1708 __perf_event_mark_enabled(event);
1701 1709
1702 if (!event_filter_match(event)) { 1710 if (!event_filter_match(event)) {
1703 if (is_cgroup_event(event)) 1711 if (is_cgroup_event(event))
@@ -1778,7 +1786,7 @@ void perf_event_enable(struct perf_event *event)
1778 1786
1779retry: 1787retry:
1780 if (!ctx->is_active) { 1788 if (!ctx->is_active) {
1781 __perf_event_mark_enabled(event, ctx); 1789 __perf_event_mark_enabled(event);
1782 goto out; 1790 goto out;
1783 } 1791 }
1784 1792
@@ -1805,6 +1813,7 @@ retry:
1805out: 1813out:
1806 raw_spin_unlock_irq(&ctx->lock); 1814 raw_spin_unlock_irq(&ctx->lock);
1807} 1815}
1816EXPORT_SYMBOL_GPL(perf_event_enable);
1808 1817
1809int perf_event_refresh(struct perf_event *event, int refresh) 1818int perf_event_refresh(struct perf_event *event, int refresh)
1810{ 1819{
@@ -2170,9 +2179,10 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2170 */ 2179 */
2171 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2180 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2172 2181
2173 perf_event_sched_in(cpuctx, ctx, task); 2182 if (ctx->nr_events)
2183 cpuctx->task_ctx = ctx;
2174 2184
2175 cpuctx->task_ctx = ctx; 2185 perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2176 2186
2177 perf_pmu_enable(ctx->pmu); 2187 perf_pmu_enable(ctx->pmu);
2178 perf_ctx_unlock(cpuctx, ctx); 2188 perf_ctx_unlock(cpuctx, ctx);
@@ -2322,6 +2332,9 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2322 u64 interrupts, now; 2332 u64 interrupts, now;
2323 s64 delta; 2333 s64 delta;
2324 2334
2335 if (!ctx->nr_freq)
2336 return;
2337
2325 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2338 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2326 if (event->state != PERF_EVENT_STATE_ACTIVE) 2339 if (event->state != PERF_EVENT_STATE_ACTIVE)
2327 continue; 2340 continue;
@@ -2377,12 +2390,14 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2377{ 2390{
2378 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; 2391 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
2379 struct perf_event_context *ctx = NULL; 2392 struct perf_event_context *ctx = NULL;
2380 int rotate = 0, remove = 1; 2393 int rotate = 0, remove = 1, freq = 0;
2381 2394
2382 if (cpuctx->ctx.nr_events) { 2395 if (cpuctx->ctx.nr_events) {
2383 remove = 0; 2396 remove = 0;
2384 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 2397 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2385 rotate = 1; 2398 rotate = 1;
2399 if (cpuctx->ctx.nr_freq)
2400 freq = 1;
2386 } 2401 }
2387 2402
2388 ctx = cpuctx->task_ctx; 2403 ctx = cpuctx->task_ctx;
@@ -2390,33 +2405,40 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2390 remove = 0; 2405 remove = 0;
2391 if (ctx->nr_events != ctx->nr_active) 2406 if (ctx->nr_events != ctx->nr_active)
2392 rotate = 1; 2407 rotate = 1;
2408 if (ctx->nr_freq)
2409 freq = 1;
2393 } 2410 }
2394 2411
2412 if (!rotate && !freq)
2413 goto done;
2414
2395 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 2415 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2396 perf_pmu_disable(cpuctx->ctx.pmu); 2416 perf_pmu_disable(cpuctx->ctx.pmu);
2397 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2398 if (ctx)
2399 perf_ctx_adjust_freq(ctx, interval);
2400 2417
2401 if (!rotate) 2418 if (freq) {
2402 goto done; 2419 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2420 if (ctx)
2421 perf_ctx_adjust_freq(ctx, interval);
2422 }
2403 2423
2404 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2424 if (rotate) {
2405 if (ctx) 2425 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2406 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); 2426 if (ctx)
2427 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2407 2428
2408 rotate_ctx(&cpuctx->ctx); 2429 rotate_ctx(&cpuctx->ctx);
2409 if (ctx) 2430 if (ctx)
2410 rotate_ctx(ctx); 2431 rotate_ctx(ctx);
2432
2433 perf_event_sched_in(cpuctx, ctx, current);
2434 }
2411 2435
2412 perf_event_sched_in(cpuctx, ctx, current); 2436 perf_pmu_enable(cpuctx->ctx.pmu);
2437 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2413 2438
2414done: 2439done:
2415 if (remove) 2440 if (remove)
2416 list_del_init(&cpuctx->rotation_list); 2441 list_del_init(&cpuctx->rotation_list);
2417
2418 perf_pmu_enable(cpuctx->ctx.pmu);
2419 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2420} 2442}
2421 2443
2422void perf_event_task_tick(void) 2444void perf_event_task_tick(void)
@@ -2443,7 +2465,7 @@ static int event_enable_on_exec(struct perf_event *event,
2443 if (event->state >= PERF_EVENT_STATE_INACTIVE) 2465 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2444 return 0; 2466 return 0;
2445 2467
2446 __perf_event_mark_enabled(event, ctx); 2468 __perf_event_mark_enabled(event);
2447 2469
2448 return 1; 2470 return 1;
2449} 2471}
@@ -2475,13 +2497,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2475 raw_spin_lock(&ctx->lock); 2497 raw_spin_lock(&ctx->lock);
2476 task_ctx_sched_out(ctx); 2498 task_ctx_sched_out(ctx);
2477 2499
2478 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 2500 list_for_each_entry(event, &ctx->event_list, event_entry) {
2479 ret = event_enable_on_exec(event, ctx);
2480 if (ret)
2481 enabled = 1;
2482 }
2483
2484 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2485 ret = event_enable_on_exec(event, ctx); 2501 ret = event_enable_on_exec(event, ctx);
2486 if (ret) 2502 if (ret)
2487 enabled = 1; 2503 enabled = 1;
@@ -2569,215 +2585,6 @@ static u64 perf_event_read(struct perf_event *event)
2569} 2585}
2570 2586
2571/* 2587/*
2572 * Callchain support
2573 */
2574
2575struct callchain_cpus_entries {
2576 struct rcu_head rcu_head;
2577 struct perf_callchain_entry *cpu_entries[0];
2578};
2579
2580static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
2581static atomic_t nr_callchain_events;
2582static DEFINE_MUTEX(callchain_mutex);
2583struct callchain_cpus_entries *callchain_cpus_entries;
2584
2585
2586__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
2587 struct pt_regs *regs)
2588{
2589}
2590
2591__weak void perf_callchain_user(struct perf_callchain_entry *entry,
2592 struct pt_regs *regs)
2593{
2594}
2595
2596static void release_callchain_buffers_rcu(struct rcu_head *head)
2597{
2598 struct callchain_cpus_entries *entries;
2599 int cpu;
2600
2601 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
2602
2603 for_each_possible_cpu(cpu)
2604 kfree(entries->cpu_entries[cpu]);
2605
2606 kfree(entries);
2607}
2608
2609static void release_callchain_buffers(void)
2610{
2611 struct callchain_cpus_entries *entries;
2612
2613 entries = callchain_cpus_entries;
2614 rcu_assign_pointer(callchain_cpus_entries, NULL);
2615 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
2616}
2617
2618static int alloc_callchain_buffers(void)
2619{
2620 int cpu;
2621 int size;
2622 struct callchain_cpus_entries *entries;
2623
2624 /*
2625 * We can't use the percpu allocation API for data that can be
2626 * accessed from NMI. Use a temporary manual per cpu allocation
2627 * until that gets sorted out.
2628 */
2629 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
2630
2631 entries = kzalloc(size, GFP_KERNEL);
2632 if (!entries)
2633 return -ENOMEM;
2634
2635 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
2636
2637 for_each_possible_cpu(cpu) {
2638 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
2639 cpu_to_node(cpu));
2640 if (!entries->cpu_entries[cpu])
2641 goto fail;
2642 }
2643
2644 rcu_assign_pointer(callchain_cpus_entries, entries);
2645
2646 return 0;
2647
2648fail:
2649 for_each_possible_cpu(cpu)
2650 kfree(entries->cpu_entries[cpu]);
2651 kfree(entries);
2652
2653 return -ENOMEM;
2654}
2655
2656static int get_callchain_buffers(void)
2657{
2658 int err = 0;
2659 int count;
2660
2661 mutex_lock(&callchain_mutex);
2662
2663 count = atomic_inc_return(&nr_callchain_events);
2664 if (WARN_ON_ONCE(count < 1)) {
2665 err = -EINVAL;
2666 goto exit;
2667 }
2668
2669 if (count > 1) {
2670 /* If the allocation failed, give up */
2671 if (!callchain_cpus_entries)
2672 err = -ENOMEM;
2673 goto exit;
2674 }
2675
2676 err = alloc_callchain_buffers();
2677 if (err)
2678 release_callchain_buffers();
2679exit:
2680 mutex_unlock(&callchain_mutex);
2681
2682 return err;
2683}
2684
2685static void put_callchain_buffers(void)
2686{
2687 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
2688 release_callchain_buffers();
2689 mutex_unlock(&callchain_mutex);
2690 }
2691}
2692
2693static int get_recursion_context(int *recursion)
2694{
2695 int rctx;
2696
2697 if (in_nmi())
2698 rctx = 3;
2699 else if (in_irq())
2700 rctx = 2;
2701 else if (in_softirq())
2702 rctx = 1;
2703 else
2704 rctx = 0;
2705
2706 if (recursion[rctx])
2707 return -1;
2708
2709 recursion[rctx]++;
2710 barrier();
2711
2712 return rctx;
2713}
2714
2715static inline void put_recursion_context(int *recursion, int rctx)
2716{
2717 barrier();
2718 recursion[rctx]--;
2719}
2720
2721static struct perf_callchain_entry *get_callchain_entry(int *rctx)
2722{
2723 int cpu;
2724 struct callchain_cpus_entries *entries;
2725
2726 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2727 if (*rctx == -1)
2728 return NULL;
2729
2730 entries = rcu_dereference(callchain_cpus_entries);
2731 if (!entries)
2732 return NULL;
2733
2734 cpu = smp_processor_id();
2735
2736 return &entries->cpu_entries[cpu][*rctx];
2737}
2738
2739static void
2740put_callchain_entry(int rctx)
2741{
2742 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2743}
2744
2745static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2746{
2747 int rctx;
2748 struct perf_callchain_entry *entry;
2749
2750
2751 entry = get_callchain_entry(&rctx);
2752 if (rctx == -1)
2753 return NULL;
2754
2755 if (!entry)
2756 goto exit_put;
2757
2758 entry->nr = 0;
2759
2760 if (!user_mode(regs)) {
2761 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2762 perf_callchain_kernel(entry, regs);
2763 if (current->mm)
2764 regs = task_pt_regs(current);
2765 else
2766 regs = NULL;
2767 }
2768
2769 if (regs) {
2770 perf_callchain_store(entry, PERF_CONTEXT_USER);
2771 perf_callchain_user(entry, regs);
2772 }
2773
2774exit_put:
2775 put_callchain_entry(rctx);
2776
2777 return entry;
2778}
2779
2780/*
2781 * Initialize the perf_event context in a task_struct: 2588 * Initialize the perf_event context in a task_struct:
2782 */ 2589 */
2783static void __perf_event_init_context(struct perf_event_context *ctx) 2590static void __perf_event_init_context(struct perf_event_context *ctx)
@@ -2941,7 +2748,7 @@ static void free_event(struct perf_event *event)
2941 2748
2942 if (!event->parent) { 2749 if (!event->parent) {
2943 if (event->attach_state & PERF_ATTACH_TASK) 2750 if (event->attach_state & PERF_ATTACH_TASK)
2944 jump_label_dec(&perf_sched_events); 2751 jump_label_dec_deferred(&perf_sched_events);
2945 if (event->attr.mmap || event->attr.mmap_data) 2752 if (event->attr.mmap || event->attr.mmap_data)
2946 atomic_dec(&nr_mmap_events); 2753 atomic_dec(&nr_mmap_events);
2947 if (event->attr.comm) 2754 if (event->attr.comm)
@@ -2952,7 +2759,7 @@ static void free_event(struct perf_event *event)
2952 put_callchain_buffers(); 2759 put_callchain_buffers();
2953 if (is_cgroup_event(event)) { 2760 if (is_cgroup_event(event)) {
2954 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); 2761 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2955 jump_label_dec(&perf_sched_events); 2762 jump_label_dec_deferred(&perf_sched_events);
2956 } 2763 }
2957 } 2764 }
2958 2765
@@ -3189,12 +2996,33 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3189 struct ring_buffer *rb; 2996 struct ring_buffer *rb;
3190 unsigned int events = POLL_HUP; 2997 unsigned int events = POLL_HUP;
3191 2998
2999 /*
3000 * Race between perf_event_set_output() and perf_poll(): perf_poll()
3001 * grabs the rb reference but perf_event_set_output() overrides it.
3002 * Here is the timeline for two threads T1, T2:
3003 * t0: T1, rb = rcu_dereference(event->rb)
3004 * t1: T2, old_rb = event->rb
3005 * t2: T2, event->rb = new rb
3006 * t3: T2, ring_buffer_detach(old_rb)
3007 * t4: T1, ring_buffer_attach(rb1)
3008 * t5: T1, poll_wait(event->waitq)
3009 *
3010 * To avoid this problem, we grab mmap_mutex in perf_poll()
3011 * thereby ensuring that the assignment of the new ring buffer
3012 * and the detachment of the old buffer appear atomic to perf_poll()
3013 */
3014 mutex_lock(&event->mmap_mutex);
3015
3192 rcu_read_lock(); 3016 rcu_read_lock();
3193 rb = rcu_dereference(event->rb); 3017 rb = rcu_dereference(event->rb);
3194 if (rb) 3018 if (rb) {
3019 ring_buffer_attach(event, rb);
3195 events = atomic_xchg(&rb->poll, 0); 3020 events = atomic_xchg(&rb->poll, 0);
3021 }
3196 rcu_read_unlock(); 3022 rcu_read_unlock();
3197 3023
3024 mutex_unlock(&event->mmap_mutex);
3025
3198 poll_wait(file, &event->waitq, wait); 3026 poll_wait(file, &event->waitq, wait);
3199 3027
3200 return events; 3028 return events;
@@ -3495,6 +3323,53 @@ unlock:
3495 return ret; 3323 return ret;
3496} 3324}
3497 3325
3326static void ring_buffer_attach(struct perf_event *event,
3327 struct ring_buffer *rb)
3328{
3329 unsigned long flags;
3330
3331 if (!list_empty(&event->rb_entry))
3332 return;
3333
3334 spin_lock_irqsave(&rb->event_lock, flags);
3335 if (!list_empty(&event->rb_entry))
3336 goto unlock;
3337
3338 list_add(&event->rb_entry, &rb->event_list);
3339unlock:
3340 spin_unlock_irqrestore(&rb->event_lock, flags);
3341}
3342
3343static void ring_buffer_detach(struct perf_event *event,
3344 struct ring_buffer *rb)
3345{
3346 unsigned long flags;
3347
3348 if (list_empty(&event->rb_entry))
3349 return;
3350
3351 spin_lock_irqsave(&rb->event_lock, flags);
3352 list_del_init(&event->rb_entry);
3353 wake_up_all(&event->waitq);
3354 spin_unlock_irqrestore(&rb->event_lock, flags);
3355}
3356
3357static void ring_buffer_wakeup(struct perf_event *event)
3358{
3359 struct ring_buffer *rb;
3360
3361 rcu_read_lock();
3362 rb = rcu_dereference(event->rb);
3363 if (!rb)
3364 goto unlock;
3365
3366 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
3367 wake_up_all(&event->waitq);
3368
3369unlock:
3370 rcu_read_unlock();
3371}
3372
3498static void rb_free_rcu(struct rcu_head *rcu_head) 3373static void rb_free_rcu(struct rcu_head *rcu_head)
3499{ 3374{
3500 struct ring_buffer *rb; 3375 struct ring_buffer *rb;
@@ -3520,9 +3395,19 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3520 3395
3521static void ring_buffer_put(struct ring_buffer *rb) 3396static void ring_buffer_put(struct ring_buffer *rb)
3522{ 3397{
3398 struct perf_event *event, *n;
3399 unsigned long flags;
3400
3523 if (!atomic_dec_and_test(&rb->refcount)) 3401 if (!atomic_dec_and_test(&rb->refcount))
3524 return; 3402 return;
3525 3403
3404 spin_lock_irqsave(&rb->event_lock, flags);
3405 list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
3406 list_del_init(&event->rb_entry);
3407 wake_up_all(&event->waitq);
3408 }
3409 spin_unlock_irqrestore(&rb->event_lock, flags);
3410
3526 call_rcu(&rb->rcu_head, rb_free_rcu); 3411 call_rcu(&rb->rcu_head, rb_free_rcu);
3527} 3412}
3528 3413
@@ -3543,8 +3428,9 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3543 struct ring_buffer *rb = event->rb; 3428 struct ring_buffer *rb = event->rb;
3544 3429
3545 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3430 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3546 vma->vm_mm->locked_vm -= event->mmap_locked; 3431 vma->vm_mm->pinned_vm -= event->mmap_locked;
3547 rcu_assign_pointer(event->rb, NULL); 3432 rcu_assign_pointer(event->rb, NULL);
3433 ring_buffer_detach(event, rb);
3548 mutex_unlock(&event->mmap_mutex); 3434 mutex_unlock(&event->mmap_mutex);
3549 3435
3550 ring_buffer_put(rb); 3436 ring_buffer_put(rb);
@@ -3624,7 +3510,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3624 3510
3625 lock_limit = rlimit(RLIMIT_MEMLOCK); 3511 lock_limit = rlimit(RLIMIT_MEMLOCK);
3626 lock_limit >>= PAGE_SHIFT; 3512 lock_limit >>= PAGE_SHIFT;
3627 locked = vma->vm_mm->locked_vm + extra; 3513 locked = vma->vm_mm->pinned_vm + extra;
3628 3514
3629 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && 3515 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
3630 !capable(CAP_IPC_LOCK)) { 3516 !capable(CAP_IPC_LOCK)) {
@@ -3650,7 +3536,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3650 atomic_long_add(user_extra, &user->locked_vm); 3536 atomic_long_add(user_extra, &user->locked_vm);
3651 event->mmap_locked = extra; 3537 event->mmap_locked = extra;
3652 event->mmap_user = get_current_user(); 3538 event->mmap_user = get_current_user();
3653 vma->vm_mm->locked_vm += event->mmap_locked; 3539 vma->vm_mm->pinned_vm += event->mmap_locked;
3654 3540
3655unlock: 3541unlock:
3656 if (!ret) 3542 if (!ret)
@@ -3699,7 +3585,7 @@ static const struct file_operations perf_fops = {
3699 3585
3700void perf_event_wakeup(struct perf_event *event) 3586void perf_event_wakeup(struct perf_event *event)
3701{ 3587{
3702 wake_up_all(&event->waitq); 3588 ring_buffer_wakeup(event);
3703 3589
3704 if (event->pending_kill) { 3590 if (event->pending_kill) {
3705 kill_fasync(&event->fasync, SIGIO, event->pending_kill); 3591 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
@@ -4736,7 +4622,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4736 struct hw_perf_event *hwc = &event->hw; 4622 struct hw_perf_event *hwc = &event->hw;
4737 int throttle = 0; 4623 int throttle = 0;
4738 4624
4739 data->period = event->hw.last_period;
4740 if (!overflow) 4625 if (!overflow)
4741 overflow = perf_swevent_set_period(event); 4626 overflow = perf_swevent_set_period(event);
4742 4627
@@ -4770,6 +4655,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
4770 if (!is_sampling_event(event)) 4655 if (!is_sampling_event(event))
4771 return; 4656 return;
4772 4657
4658 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
4659 data->period = nr;
4660 return perf_swevent_overflow(event, 1, data, regs);
4661 } else
4662 data->period = event->hw.last_period;
4663
4773 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4664 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4774 return perf_swevent_overflow(event, 1, data, regs); 4665 return perf_swevent_overflow(event, 1, data, regs);
4775 4666
@@ -5282,7 +5173,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5282 regs = get_irq_regs(); 5173 regs = get_irq_regs();
5283 5174
5284 if (regs && !perf_exclude_event(event, regs)) { 5175 if (regs && !perf_exclude_event(event, regs)) {
5285 if (!(event->attr.exclude_idle && current->pid == 0)) 5176 if (!(event->attr.exclude_idle && is_idle_task(current)))
5286 if (perf_event_overflow(event, &data, regs)) 5177 if (perf_event_overflow(event, &data, regs))
5287 ret = HRTIMER_NORESTART; 5178 ret = HRTIMER_NORESTART;
5288 } 5179 }
@@ -5758,6 +5649,7 @@ struct pmu *perf_init_event(struct perf_event *event)
5758 pmu = idr_find(&pmu_idr, event->attr.type); 5649 pmu = idr_find(&pmu_idr, event->attr.type);
5759 rcu_read_unlock(); 5650 rcu_read_unlock();
5760 if (pmu) { 5651 if (pmu) {
5652 event->pmu = pmu;
5761 ret = pmu->event_init(event); 5653 ret = pmu->event_init(event);
5762 if (ret) 5654 if (ret)
5763 pmu = ERR_PTR(ret); 5655 pmu = ERR_PTR(ret);
@@ -5765,6 +5657,7 @@ struct pmu *perf_init_event(struct perf_event *event)
5765 } 5657 }
5766 5658
5767 list_for_each_entry_rcu(pmu, &pmus, entry) { 5659 list_for_each_entry_rcu(pmu, &pmus, entry) {
5660 event->pmu = pmu;
5768 ret = pmu->event_init(event); 5661 ret = pmu->event_init(event);
5769 if (!ret) 5662 if (!ret)
5770 goto unlock; 5663 goto unlock;
@@ -5819,6 +5712,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5819 INIT_LIST_HEAD(&event->group_entry); 5712 INIT_LIST_HEAD(&event->group_entry);
5820 INIT_LIST_HEAD(&event->event_entry); 5713 INIT_LIST_HEAD(&event->event_entry);
5821 INIT_LIST_HEAD(&event->sibling_list); 5714 INIT_LIST_HEAD(&event->sibling_list);
5715 INIT_LIST_HEAD(&event->rb_entry);
5716
5822 init_waitqueue_head(&event->waitq); 5717 init_waitqueue_head(&event->waitq);
5823 init_irq_work(&event->pending, perf_pending_event); 5718 init_irq_work(&event->pending, perf_pending_event);
5824 5719
@@ -5891,11 +5786,9 @@ done:
5891 return ERR_PTR(err); 5786 return ERR_PTR(err);
5892 } 5787 }
5893 5788
5894 event->pmu = pmu;
5895
5896 if (!event->parent) { 5789 if (!event->parent) {
5897 if (event->attach_state & PERF_ATTACH_TASK) 5790 if (event->attach_state & PERF_ATTACH_TASK)
5898 jump_label_inc(&perf_sched_events); 5791 jump_label_inc(&perf_sched_events.key);
5899 if (event->attr.mmap || event->attr.mmap_data) 5792 if (event->attr.mmap || event->attr.mmap_data)
5900 atomic_inc(&nr_mmap_events); 5793 atomic_inc(&nr_mmap_events);
5901 if (event->attr.comm) 5794 if (event->attr.comm)
@@ -6027,6 +5920,8 @@ set:
6027 5920
6028 old_rb = event->rb; 5921 old_rb = event->rb;
6029 rcu_assign_pointer(event->rb, rb); 5922 rcu_assign_pointer(event->rb, rb);
5923 if (old_rb)
5924 ring_buffer_detach(event, old_rb);
6030 ret = 0; 5925 ret = 0;
6031unlock: 5926unlock:
6032 mutex_unlock(&event->mmap_mutex); 5927 mutex_unlock(&event->mmap_mutex);
@@ -6131,7 +6026,7 @@ SYSCALL_DEFINE5(perf_event_open,
6131 * - that may need work on context switch 6026 * - that may need work on context switch
6132 */ 6027 */
6133 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); 6028 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6134 jump_label_inc(&perf_sched_events); 6029 jump_label_inc(&perf_sched_events.key);
6135 } 6030 }
6136 6031
6137 /* 6032 /*
@@ -6977,6 +6872,9 @@ void __init perf_event_init(void)
6977 6872
6978 ret = init_hw_breakpoint(); 6873 ret = init_hw_breakpoint();
6979 WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 6874 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6875
6876 /* do not patch jump label more than once per second */
6877 jump_label_rate_limit(&perf_sched_events, HZ);
6980} 6878}
6981 6879
6982static int __init perf_event_sysfs_init(void) 6880static int __init perf_event_sysfs_init(void)
@@ -7043,10 +6941,13 @@ static int __perf_cgroup_move(void *info)
7043 return 0; 6941 return 0;
7044} 6942}
7045 6943
7046static void 6944static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7047perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) 6945 struct cgroup_taskset *tset)
7048{ 6946{
7049 task_function_call(task, __perf_cgroup_move, task); 6947 struct task_struct *task;
6948
6949 cgroup_taskset_for_each(task, cgrp, tset)
6950 task_function_call(task, __perf_cgroup_move, task);
7050} 6951}
7051 6952
7052static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 6953static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
@@ -7060,7 +6961,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7060 if (!(task->flags & PF_EXITING)) 6961 if (!(task->flags & PF_EXITING))
7061 return; 6962 return;
7062 6963
7063 perf_cgroup_attach_task(cgrp, task); 6964 task_function_call(task, __perf_cgroup_move, task);
7064} 6965}
7065 6966
7066struct cgroup_subsys perf_subsys = { 6967struct cgroup_subsys perf_subsys = {
@@ -7069,6 +6970,6 @@ struct cgroup_subsys perf_subsys = {
7069 .create = perf_cgroup_create, 6970 .create = perf_cgroup_create,
7070 .destroy = perf_cgroup_destroy, 6971 .destroy = perf_cgroup_destroy,
7071 .exit = perf_cgroup_exit, 6972 .exit = perf_cgroup_exit,
7072 .attach_task = perf_cgroup_attach_task, 6973 .attach = perf_cgroup_attach,
7073}; 6974};
7074#endif /* CONFIG_CGROUP_PERF */ 6975#endif /* CONFIG_CGROUP_PERF */
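Several of the core.c hunks revolve around the new rb->event_list: perf_poll() now attaches the event to the ring buffer it dereferenced (under mmap_mutex, per the race timeline spelled out in the comment), and perf_event_wakeup() walks that list so every event redirected to the same buffer via set_output gets woken. The sketch below is only a userspace model of that attach-then-broadcast idea, using a pthread condition variable in place of the waitqueue; the fake_* types are made up for illustration.

/*
 * Userspace model of ring_buffer_attach()/ring_buffer_wakeup(): every
 * event currently pointing at a buffer sits on that buffer's waiter
 * list, so waking "the buffer" wakes all events sharing it.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_event {
	struct fake_event *next;	/* link on the buffer's waiter list */
	bool woken;
};

struct fake_buffer {
	pthread_mutex_t lock;
	pthread_cond_t wait;
	struct fake_event *waiters;
};

static struct fake_buffer rb = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.wait = PTHREAD_COND_INITIALIZER,
};

static void buffer_attach(struct fake_buffer *b, struct fake_event *e)
{
	pthread_mutex_lock(&b->lock);
	e->next = b->waiters;		/* push onto the waiter list */
	b->waiters = e;
	pthread_mutex_unlock(&b->lock);
}

static void buffer_wakeup(struct fake_buffer *b)
{
	struct fake_event *e;

	pthread_mutex_lock(&b->lock);
	for (e = b->waiters; e; e = e->next)
		e->woken = true;	/* mark every attached event */
	pthread_cond_broadcast(&b->wait);
	pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct fake_event a = { 0 }, b = { 0 };

	buffer_attach(&rb, &a);
	buffer_attach(&rb, &b);		/* b redirected to the same buffer */
	buffer_wakeup(&rb);
	printf("a woken: %d, b woken: %d\n", a.woken, b.woken);
	return 0;
}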
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 09097dd8116c..b0b107f90afc 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -1,6 +1,10 @@
1#ifndef _KERNEL_EVENTS_INTERNAL_H 1#ifndef _KERNEL_EVENTS_INTERNAL_H
2#define _KERNEL_EVENTS_INTERNAL_H 2#define _KERNEL_EVENTS_INTERNAL_H
3 3
4#include <linux/hardirq.h>
5
6/* Buffer handling */
7
4#define RING_BUFFER_WRITABLE 0x01 8#define RING_BUFFER_WRITABLE 0x01
5 9
6struct ring_buffer { 10struct ring_buffer {
@@ -22,6 +26,9 @@ struct ring_buffer {
22 local_t lost; /* nr records lost */ 26 local_t lost; /* nr records lost */
23 27
24 long watermark; /* wakeup watermark */ 28 long watermark; /* wakeup watermark */
29 /* poll crap */
30 spinlock_t event_lock;
31 struct list_head event_list;
25 32
26 struct perf_event_mmap_page *user_page; 33 struct perf_event_mmap_page *user_page;
27 void *data_pages[0]; 34 void *data_pages[0];
@@ -64,7 +71,7 @@ static inline int page_order(struct ring_buffer *rb)
64} 71}
65#endif 72#endif
66 73
67static unsigned long perf_data_size(struct ring_buffer *rb) 74static inline unsigned long perf_data_size(struct ring_buffer *rb)
68{ 75{
69 return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); 76 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
70} 77}
@@ -93,4 +100,37 @@ __output_copy(struct perf_output_handle *handle,
93 } while (len); 100 } while (len);
94} 101}
95 102
103/* Callchain handling */
104extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
105extern int get_callchain_buffers(void);
106extern void put_callchain_buffers(void);
107
108static inline int get_recursion_context(int *recursion)
109{
110 int rctx;
111
112 if (in_nmi())
113 rctx = 3;
114 else if (in_irq())
115 rctx = 2;
116 else if (in_softirq())
117 rctx = 1;
118 else
119 rctx = 0;
120
121 if (recursion[rctx])
122 return -1;
123
124 recursion[rctx]++;
125 barrier();
126
127 return rctx;
128}
129
130static inline void put_recursion_context(int *recursion, int rctx)
131{
132 barrier();
133 recursion[rctx]--;
134}
135
96#endif /* _KERNEL_EVENTS_INTERNAL_H */ 136#endif /* _KERNEL_EVENTS_INTERNAL_H */
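get_recursion_context()/put_recursion_context() now live in internal.h so both core.c and the new callchain.c can use them: one flag per context level (task, softirq, hardirq, NMI), and an attempt to re-enter a level that is already busy is refused rather than allowed to recurse. The toy program below shows the same bail-out behaviour with plain ints in place of the per-CPU array; the level numbering is kept only by analogy.

/*
 * Standalone model of the recursion guard: re-entry at a busy context
 * level returns -1 instead of nesting.  Plain ints, no per-CPU data.
 */
#include <stdio.h>

#define NR_CTX 4	/* 3: "nmi", 2: "irq", 1: "softirq", 0: "task" */

static int recursion[NR_CTX];

static int get_recursion_context(int level)
{
	if (recursion[level])
		return -1;	/* already inside this context: bail out */
	recursion[level]++;
	return level;
}

static void put_recursion_context(int level)
{
	recursion[level]--;
}

int main(void)
{
	int rctx = get_recursion_context(0);

	printf("first enter : %d\n", rctx);			/* 0  */
	printf("nested enter: %d\n", get_recursion_context(0));	/* -1 */
	put_recursion_context(rctx);
	printf("re-enter    : %d\n", get_recursion_context(0));	/* 0  */
	return 0;
}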
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index a2a29205cc0f..6ddaba43fb7a 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
10 */ 10 */
@@ -209,6 +209,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
209 rb->writable = 1; 209 rb->writable = 1;
210 210
211 atomic_set(&rb->refcount, 1); 211 atomic_set(&rb->refcount, 1);
212
213 INIT_LIST_HEAD(&rb->event_list);
214 spin_lock_init(&rb->event_lock);
212} 215}
213 216
214#ifndef CONFIG_PERF_USE_VMALLOC 217#ifndef CONFIG_PERF_USE_VMALLOC
diff --git a/kernel/exit.c b/kernel/exit.c
index 2913b3509d42..c44738267be7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,6 +51,7 @@
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h> 53#include <linux/oom.h>
54#include <linux/writeback.h>
54 55
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/unistd.h> 57#include <asm/unistd.h>
@@ -121,9 +122,9 @@ static void __exit_signal(struct task_struct *tsk)
121 * We won't ever get here for the group leader, since it 122 * We won't ever get here for the group leader, since it
122 * will have been the last reference on the signal_struct. 123 * will have been the last reference on the signal_struct.
123 */ 124 */
124 sig->utime = cputime_add(sig->utime, tsk->utime); 125 sig->utime += tsk->utime;
125 sig->stime = cputime_add(sig->stime, tsk->stime); 126 sig->stime += tsk->stime;
126 sig->gtime = cputime_add(sig->gtime, tsk->gtime); 127 sig->gtime += tsk->gtime;
127 sig->min_flt += tsk->min_flt; 128 sig->min_flt += tsk->min_flt;
128 sig->maj_flt += tsk->maj_flt; 129 sig->maj_flt += tsk->maj_flt;
129 sig->nvcsw += tsk->nvcsw; 130 sig->nvcsw += tsk->nvcsw;
@@ -679,10 +680,6 @@ static void exit_mm(struct task_struct * tsk)
679 tsk->mm = NULL; 680 tsk->mm = NULL;
680 up_read(&mm->mmap_sem); 681 up_read(&mm->mmap_sem);
681 enter_lazy_tlb(mm, current); 682 enter_lazy_tlb(mm, current);
682 /* We don't want this task to be frozen prematurely */
683 clear_freeze_flag(tsk);
684 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
685 atomic_dec(&mm->oom_disable_count);
686 task_unlock(tsk); 683 task_unlock(tsk);
687 mm_update_next_owner(mm); 684 mm_update_next_owner(mm);
688 mmput(mm); 685 mmput(mm);
@@ -890,7 +887,7 @@ static void check_stack_usage(void)
890static inline void check_stack_usage(void) {} 887static inline void check_stack_usage(void) {}
891#endif 888#endif
892 889
893NORET_TYPE void do_exit(long code) 890void do_exit(long code)
894{ 891{
895 struct task_struct *tsk = current; 892 struct task_struct *tsk = current;
896 int group_dead; 893 int group_dead;
@@ -1039,9 +1036,12 @@ NORET_TYPE void do_exit(long code)
1039 validate_creds_for_do_exit(tsk); 1036 validate_creds_for_do_exit(tsk);
1040 1037
1041 preempt_disable(); 1038 preempt_disable();
1039 if (tsk->nr_dirtied)
1040 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
1042 exit_rcu(); 1041 exit_rcu();
1043 /* causes final put_task_struct in finish_task_switch(). */ 1042 /* causes final put_task_struct in finish_task_switch(). */
1044 tsk->state = TASK_DEAD; 1043 tsk->state = TASK_DEAD;
1044 tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
1045 schedule(); 1045 schedule();
1046 BUG(); 1046 BUG();
1047 /* Avoid "noreturn function does return". */ 1047 /* Avoid "noreturn function does return". */
@@ -1051,7 +1051,7 @@ NORET_TYPE void do_exit(long code)
1051 1051
1052EXPORT_SYMBOL_GPL(do_exit); 1052EXPORT_SYMBOL_GPL(do_exit);
1053 1053
1054NORET_TYPE void complete_and_exit(struct completion *comp, long code) 1054void complete_and_exit(struct completion *comp, long code)
1055{ 1055{
1056 if (comp) 1056 if (comp)
1057 complete(comp); 1057 complete(comp);
@@ -1070,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
1070 * Take down every thread in the group. This is called by fatal signals 1070 * Take down every thread in the group. This is called by fatal signals
1071 * as well as by sys_exit_group (below). 1071 * as well as by sys_exit_group (below).
1072 */ 1072 */
1073NORET_TYPE void 1073void
1074do_group_exit(int exit_code) 1074do_group_exit(int exit_code)
1075{ 1075{
1076 struct signal_struct *sig = current->signal; 1076 struct signal_struct *sig = current->signal;
@@ -1257,19 +1257,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1257 spin_lock_irq(&p->real_parent->sighand->siglock); 1257 spin_lock_irq(&p->real_parent->sighand->siglock);
1258 psig = p->real_parent->signal; 1258 psig = p->real_parent->signal;
1259 sig = p->signal; 1259 sig = p->signal;
1260 psig->cutime = 1260 psig->cutime += tgutime + sig->cutime;
1261 cputime_add(psig->cutime, 1261 psig->cstime += tgstime + sig->cstime;
1262 cputime_add(tgutime, 1262 psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
1263 sig->cutime));
1264 psig->cstime =
1265 cputime_add(psig->cstime,
1266 cputime_add(tgstime,
1267 sig->cstime));
1268 psig->cgtime =
1269 cputime_add(psig->cgtime,
1270 cputime_add(p->gtime,
1271 cputime_add(sig->gtime,
1272 sig->cgtime)));
1273 psig->cmin_flt += 1263 psig->cmin_flt +=
1274 p->min_flt + sig->min_flt + sig->cmin_flt; 1264 p->min_flt + sig->min_flt + sig->cmin_flt;
1275 psig->cmaj_flt += 1265 psig->cmaj_flt +=
@@ -1542,8 +1532,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1542 } 1532 }
1543 1533
1544 /* dead body doesn't have much to contribute */ 1534 /* dead body doesn't have much to contribute */
1545 if (p->exit_state == EXIT_DEAD) 1535 if (unlikely(p->exit_state == EXIT_DEAD)) {
1536 /*
1537 * But do not ignore this task until the tracer does
1538 * wait_task_zombie()->do_notify_parent().
1539 */
1540 if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
1541 wo->notask_error = 0;
1546 return 0; 1542 return 0;
1543 }
1547 1544
1548 /* slay zombie? */ 1545 /* slay zombie? */
1549 if (p->exit_state == EXIT_ZOMBIE) { 1546 if (p->exit_state == EXIT_ZOMBIE) {
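The wait_task_zombie() hunk drops the nested cputime_add() chains in favour of plain '+=', which is sound once cputime_t behaves as an ordinary scalar. A quick stand-alone check of the equivalence, with unsigned long long standing in for cputime_t (an assumption of this sketch, not the kernel definition):

/*
 * Old style (nested helper calls) and new style (straight accumulation)
 * produce the same totals when cputime_t is a plain integer type.
 */
#include <stdio.h>

typedef unsigned long long cputime_t;	/* stand-in, not the kernel type */

static cputime_t cputime_add(cputime_t a, cputime_t b)
{
	return a + b;
}

int main(void)
{
	cputime_t cutime_old = 100, cutime_new = 100;
	cputime_t tgutime = 40, sig_cutime = 2;

	/* old style: psig->cutime = cputime_add(psig->cutime, cputime_add(...)) */
	cutime_old = cputime_add(cutime_old, cputime_add(tgutime, sig_cutime));

	/* new style: psig->cutime += tgutime + sig->cutime */
	cutime_new += tgutime + sig_cutime;

	printf("old=%llu new=%llu\n", cutime_old, cutime_new);	/* both 142 */
	return 0;
}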
diff --git a/kernel/fork.c b/kernel/fork.c
index 8e6b6f4fb272..443f5125f11e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -76,6 +76,9 @@
76 76
77#include <trace/events/sched.h> 77#include <trace/events/sched.h>
78 78
79#define CREATE_TRACE_POINTS
80#include <trace/events/task.h>
81
79/* 82/*
80 * Protected counters by write_lock_irq(&tasklist_lock) 83 * Protected counters by write_lock_irq(&tasklist_lock)
81 */ 84 */
@@ -162,7 +165,6 @@ static void account_kernel_stack(struct thread_info *ti, int account)
162 165
163void free_task(struct task_struct *tsk) 166void free_task(struct task_struct *tsk)
164{ 167{
165 prop_local_destroy_single(&tsk->dirties);
166 account_kernel_stack(tsk->stack, -1); 168 account_kernel_stack(tsk->stack, -1);
167 free_thread_info(tsk->stack); 169 free_thread_info(tsk->stack);
168 rt_mutex_debug_task_free(tsk); 170 rt_mutex_debug_task_free(tsk);
@@ -274,10 +276,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
274 276
275 tsk->stack = ti; 277 tsk->stack = ti;
276 278
277 err = prop_local_init_single(&tsk->dirties);
278 if (err)
279 goto out;
280
281 setup_thread_stack(tsk, orig); 279 setup_thread_stack(tsk, orig);
282 clear_user_return_notifier(tsk); 280 clear_user_return_notifier(tsk);
283 clear_tsk_need_resched(tsk); 281 clear_tsk_need_resched(tsk);
@@ -501,7 +499,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
501 mm->cached_hole_size = ~0UL; 499 mm->cached_hole_size = ~0UL;
502 mm_init_aio(mm); 500 mm_init_aio(mm);
503 mm_init_owner(mm, p); 501 mm_init_owner(mm, p);
504 atomic_set(&mm->oom_disable_count, 0);
505 502
506 if (likely(!mm_alloc_pgd(mm))) { 503 if (likely(!mm_alloc_pgd(mm))) {
507 mm->def_flags = 0; 504 mm->def_flags = 0;
@@ -816,8 +813,6 @@ good_mm:
816 /* Initializing for Swap token stuff */ 813 /* Initializing for Swap token stuff */
817 mm->token_priority = 0; 814 mm->token_priority = 0;
818 mm->last_interval = 0; 815 mm->last_interval = 0;
819 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
820 atomic_inc(&mm->oom_disable_count);
821 816
822 tsk->mm = mm; 817 tsk->mm = mm;
823 tsk->active_mm = mm; 818 tsk->active_mm = mm;
@@ -980,7 +975,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
980 sched_autogroup_fork(sig); 975 sched_autogroup_fork(sig);
981 976
982#ifdef CONFIG_CGROUPS 977#ifdef CONFIG_CGROUPS
983 init_rwsem(&sig->threadgroup_fork_lock); 978 init_rwsem(&sig->group_rwsem);
984#endif 979#endif
985 980
986 sig->oom_adj = current->signal->oom_adj; 981 sig->oom_adj = current->signal->oom_adj;
@@ -1000,7 +995,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
1000 new_flags |= PF_FORKNOEXEC; 995 new_flags |= PF_FORKNOEXEC;
1001 new_flags |= PF_STARTING; 996 new_flags |= PF_STARTING;
1002 p->flags = new_flags; 997 p->flags = new_flags;
1003 clear_freeze_flag(p);
1004} 998}
1005 999
1006SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) 1000SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
@@ -1031,8 +1025,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1031 */ 1025 */
1032static void posix_cpu_timers_init(struct task_struct *tsk) 1026static void posix_cpu_timers_init(struct task_struct *tsk)
1033{ 1027{
1034 tsk->cputime_expires.prof_exp = cputime_zero; 1028 tsk->cputime_expires.prof_exp = 0;
1035 tsk->cputime_expires.virt_exp = cputime_zero; 1029 tsk->cputime_expires.virt_exp = 0;
1036 tsk->cputime_expires.sched_exp = 0; 1030 tsk->cputime_expires.sched_exp = 0;
1037 INIT_LIST_HEAD(&tsk->cpu_timers[0]); 1031 INIT_LIST_HEAD(&tsk->cpu_timers[0]);
1038 INIT_LIST_HEAD(&tsk->cpu_timers[1]); 1032 INIT_LIST_HEAD(&tsk->cpu_timers[1]);
@@ -1140,14 +1134,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1140 1134
1141 init_sigpending(&p->pending); 1135 init_sigpending(&p->pending);
1142 1136
1143 p->utime = cputime_zero; 1137 p->utime = p->stime = p->gtime = 0;
1144 p->stime = cputime_zero; 1138 p->utimescaled = p->stimescaled = 0;
1145 p->gtime = cputime_zero;
1146 p->utimescaled = cputime_zero;
1147 p->stimescaled = cputime_zero;
1148#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1139#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1149 p->prev_utime = cputime_zero; 1140 p->prev_utime = p->prev_stime = 0;
1150 p->prev_stime = cputime_zero;
1151#endif 1141#endif
1152#if defined(SPLIT_RSS_COUNTING) 1142#if defined(SPLIT_RSS_COUNTING)
1153 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1143 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
@@ -1166,7 +1156,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1166 p->io_context = NULL; 1156 p->io_context = NULL;
1167 p->audit_context = NULL; 1157 p->audit_context = NULL;
1168 if (clone_flags & CLONE_THREAD) 1158 if (clone_flags & CLONE_THREAD)
1169 threadgroup_fork_read_lock(current); 1159 threadgroup_change_begin(current);
1170 cgroup_fork(p); 1160 cgroup_fork(p);
1171#ifdef CONFIG_NUMA 1161#ifdef CONFIG_NUMA
1172 p->mempolicy = mpol_dup(p->mempolicy); 1162 p->mempolicy = mpol_dup(p->mempolicy);
@@ -1302,6 +1292,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1302 p->pdeath_signal = 0; 1292 p->pdeath_signal = 0;
1303 p->exit_state = 0; 1293 p->exit_state = 0;
1304 1294
1295 p->nr_dirtied = 0;
1296 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1297 p->dirty_paused_when = 0;
1298
1305 /* 1299 /*
1306 * Ok, make it visible to the rest of the system. 1300 * Ok, make it visible to the rest of the system.
1307 * We dont wake it up yet. 1301 * We dont wake it up yet.
@@ -1378,8 +1372,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1378 proc_fork_connector(p); 1372 proc_fork_connector(p);
1379 cgroup_post_fork(p); 1373 cgroup_post_fork(p);
1380 if (clone_flags & CLONE_THREAD) 1374 if (clone_flags & CLONE_THREAD)
1381 threadgroup_fork_read_unlock(current); 1375 threadgroup_change_end(current);
1382 perf_event_fork(p); 1376 perf_event_fork(p);
1377
1378 trace_task_newtask(p, clone_flags);
1379
1383 return p; 1380 return p;
1384 1381
1385bad_fork_free_pid: 1382bad_fork_free_pid:
@@ -1391,13 +1388,8 @@ bad_fork_cleanup_io:
1391bad_fork_cleanup_namespaces: 1388bad_fork_cleanup_namespaces:
1392 exit_task_namespaces(p); 1389 exit_task_namespaces(p);
1393bad_fork_cleanup_mm: 1390bad_fork_cleanup_mm:
1394 if (p->mm) { 1391 if (p->mm)
1395 task_lock(p);
1396 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1397 atomic_dec(&p->mm->oom_disable_count);
1398 task_unlock(p);
1399 mmput(p->mm); 1392 mmput(p->mm);
1400 }
1401bad_fork_cleanup_signal: 1393bad_fork_cleanup_signal:
1402 if (!(clone_flags & CLONE_THREAD)) 1394 if (!(clone_flags & CLONE_THREAD))
1403 free_signal_struct(p->signal); 1395 free_signal_struct(p->signal);
@@ -1418,7 +1410,7 @@ bad_fork_cleanup_policy:
1418bad_fork_cleanup_cgroup: 1410bad_fork_cleanup_cgroup:
1419#endif 1411#endif
1420 if (clone_flags & CLONE_THREAD) 1412 if (clone_flags & CLONE_THREAD)
1421 threadgroup_fork_read_unlock(current); 1413 threadgroup_change_end(current);
1422 cgroup_exit(p, cgroup_callbacks_done); 1414 cgroup_exit(p, cgroup_callbacks_done);
1423 delayacct_tsk_free(p); 1415 delayacct_tsk_free(p);
1424 module_put(task_thread_info(p)->exec_domain->module); 1416 module_put(task_thread_info(p)->exec_domain->module);
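copy_process() now seeds the dirty-throttling fields, and the initial nr_dirtied_pause of 128 >> (PAGE_SHIFT - 10) is simply "about 128KB worth of pages" independent of page size. A small arithmetic check, assuming 4KB pages (PAGE_SHIFT == 12):

/* Worked example of the nr_dirtied_pause seed; PAGE_SHIFT is assumed. */
#include <stdio.h>

int main(void)
{
	const int PAGE_SHIFT = 12;			/* assumed 4KB pages */
	int pages = 128 >> (PAGE_SHIFT - 10);		/* 128 >> 2 == 32 */
	int kb_per_page = 1 << (PAGE_SHIFT - 10);	/* 4KB */

	printf("%d pages x %dKB = %dKB\n",
	       pages, kb_per_page, pages * kb_per_page);	/* 32 x 4 = 128KB */
	return 0;
}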
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 7b01de98bb6a..9815b8d1eed5 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -6,104 +6,117 @@
6 6
7#include <linux/interrupt.h> 7#include <linux/interrupt.h>
8#include <linux/suspend.h> 8#include <linux/suspend.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/freezer.h> 11#include <linux/freezer.h>
12#include <linux/kthread.h>
12 13
13/* 14/* total number of freezing conditions in effect */
14 * freezing is complete, mark current process as frozen 15atomic_t system_freezing_cnt = ATOMIC_INIT(0);
16EXPORT_SYMBOL(system_freezing_cnt);
17
18/* indicate whether PM freezing is in effect, protected by pm_mutex */
19bool pm_freezing;
20bool pm_nosig_freezing;
21
22/* protects freezing and frozen transitions */
23static DEFINE_SPINLOCK(freezer_lock);
24
25/**
26 * freezing_slow_path - slow path for testing whether a task needs to be frozen
27 * @p: task to be tested
28 *
29 * This function is called by freezing() if system_freezing_cnt isn't zero
30 * and tests whether @p needs to enter and stay in frozen state. Can be
31 * called under any context. The freezers are responsible for ensuring the
32 * target tasks see the updated state.
15 */ 33 */
16static inline void frozen_process(void) 34bool freezing_slow_path(struct task_struct *p)
17{ 35{
18 if (!unlikely(current->flags & PF_NOFREEZE)) { 36 if (p->flags & PF_NOFREEZE)
19 current->flags |= PF_FROZEN; 37 return false;
20 smp_wmb(); 38
21 } 39 if (pm_nosig_freezing || cgroup_freezing(p))
22 clear_freeze_flag(current); 40 return true;
41
42 if (pm_freezing && !(p->flags & PF_KTHREAD))
43 return true;
44
45 return false;
23} 46}
47EXPORT_SYMBOL(freezing_slow_path);
24 48
25/* Refrigerator is place where frozen processes are stored :-). */ 49/* Refrigerator is place where frozen processes are stored :-). */
26void refrigerator(void) 50bool __refrigerator(bool check_kthr_stop)
27{ 51{
28 /* Hmm, should we be allowed to suspend when there are realtime 52 /* Hmm, should we be allowed to suspend when there are realtime
29 processes around? */ 53 processes around? */
30 long save; 54 bool was_frozen = false;
55 long save = current->state;
31 56
32 task_lock(current);
33 if (freezing(current)) {
34 frozen_process();
35 task_unlock(current);
36 } else {
37 task_unlock(current);
38 return;
39 }
40 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm); 57 pr_debug("%s entered refrigerator\n", current->comm);
42 58
43 spin_lock_irq(&current->sighand->siglock);
44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock);
46
47 /* prevent accounting of that task to load */
48 current->flags |= PF_FREEZING;
49
50 for (;;) { 59 for (;;) {
51 set_current_state(TASK_UNINTERRUPTIBLE); 60 set_current_state(TASK_UNINTERRUPTIBLE);
52 if (!frozen(current)) 61
62 spin_lock_irq(&freezer_lock);
63 current->flags |= PF_FROZEN;
64 if (!freezing(current) ||
65 (check_kthr_stop && kthread_should_stop()))
66 current->flags &= ~PF_FROZEN;
67 spin_unlock_irq(&freezer_lock);
68
69 if (!(current->flags & PF_FROZEN))
53 break; 70 break;
71 was_frozen = true;
54 schedule(); 72 schedule();
55 } 73 }
56 74
57 /* Remove the accounting blocker */
58 current->flags &= ~PF_FREEZING;
59
60 pr_debug("%s left refrigerator\n", current->comm); 75 pr_debug("%s left refrigerator\n", current->comm);
61 __set_current_state(save); 76
77 /*
78 * Restore saved task state before returning. The mb'd version
79 * needs to be used; otherwise, it might silently break
80 * synchronization which depends on ordered task state change.
81 */
82 set_current_state(save);
83
84 return was_frozen;
62} 85}
63EXPORT_SYMBOL(refrigerator); 86EXPORT_SYMBOL(__refrigerator);
64 87
65static void fake_signal_wake_up(struct task_struct *p) 88static void fake_signal_wake_up(struct task_struct *p)
66{ 89{
67 unsigned long flags; 90 unsigned long flags;
68 91
69 spin_lock_irqsave(&p->sighand->siglock, flags); 92 if (lock_task_sighand(p, &flags)) {
70 signal_wake_up(p, 0); 93 signal_wake_up(p, 0);
71 spin_unlock_irqrestore(&p->sighand->siglock, flags); 94 unlock_task_sighand(p, &flags);
95 }
72} 96}
73 97
74/** 98/**
75 * freeze_task - send a freeze request to given task 99 * freeze_task - send a freeze request to given task
76 * @p: task to send the request to 100 * @p: task to send the request to
77 * @sig_only: if set, the request will only be sent if the task has the 101 *
78 * PF_FREEZER_NOSIG flag unset 102 * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE
79 * Return value: 'false', if @sig_only is set and the task has 103 * flag and either sending a fake signal to it or waking it up, depending
80 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise 104 * on whether it has %PF_FREEZER_NOSIG set.
81 * 105 *
82 * The freeze request is sent by setting the tasks's TIF_FREEZE flag and 106 * RETURNS:
83 * either sending a fake signal to it or waking it up, depending on whether 107 * %false, if @p is not freezing or already frozen; %true, otherwise
84 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
85 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
86 * TIF_FREEZE flag will not be set.
87 */ 108 */
88bool freeze_task(struct task_struct *p, bool sig_only) 109bool freeze_task(struct task_struct *p)
89{ 110{
90 /* 111 unsigned long flags;
91 * We first check if the task is freezing and next if it has already 112
92 * been frozen to avoid the race with frozen_process() which first marks 113 spin_lock_irqsave(&freezer_lock, flags);
93 * the task as frozen and next clears its TIF_FREEZE. 114 if (!freezing(p) || frozen(p)) {
94 */ 115 spin_unlock_irqrestore(&freezer_lock, flags);
95 if (!freezing(p)) { 116 return false;
96 smp_rmb();
97 if (frozen(p))
98 return false;
99
100 if (!sig_only || should_send_signal(p))
101 set_freeze_flag(p);
102 else
103 return false;
104 } 117 }
105 118
106 if (should_send_signal(p)) { 119 if (!(p->flags & PF_KTHREAD)) {
107 fake_signal_wake_up(p); 120 fake_signal_wake_up(p);
108 /* 121 /*
109 * fake_signal_wake_up() goes through p's scheduler 122 * fake_signal_wake_up() goes through p's scheduler
@@ -111,56 +124,48 @@ bool freeze_task(struct task_struct *p, bool sig_only)
111 * TASK_RUNNING transition can't race with task state 124 * TASK_RUNNING transition can't race with task state
112 * testing in try_to_freeze_tasks(). 125 * testing in try_to_freeze_tasks().
113 */ 126 */
114 } else if (sig_only) {
115 return false;
116 } else { 127 } else {
117 wake_up_state(p, TASK_INTERRUPTIBLE); 128 wake_up_state(p, TASK_INTERRUPTIBLE);
118 } 129 }
119 130
131 spin_unlock_irqrestore(&freezer_lock, flags);
120 return true; 132 return true;
121} 133}
122 134
123void cancel_freezing(struct task_struct *p) 135void __thaw_task(struct task_struct *p)
124{ 136{
125 unsigned long flags; 137 unsigned long flags;
126 138
127 if (freezing(p)) { 139 /*
128 pr_debug(" clean up: %s\n", p->comm); 140 * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to
129 clear_freeze_flag(p); 141 * be visible to @p as waking up implies wmb. Waking up inside
130 spin_lock_irqsave(&p->sighand->siglock, flags); 142 * freezer_lock also prevents wakeups from leaking outside
131 recalc_sigpending_and_wake(p); 143 * refrigerator.
132 spin_unlock_irqrestore(&p->sighand->siglock, flags); 144 */
133 } 145 spin_lock_irqsave(&freezer_lock, flags);
134} 146 if (frozen(p))
135 147 wake_up_process(p);
136static int __thaw_process(struct task_struct *p) 148 spin_unlock_irqrestore(&freezer_lock, flags);
137{
138 if (frozen(p)) {
139 p->flags &= ~PF_FROZEN;
140 return 1;
141 }
142 clear_freeze_flag(p);
143 return 0;
144} 149}
145 150
146/* 151/**
147 * Wake up a frozen process 152 * set_freezable - make %current freezable
148 * 153 *
149 * task_lock() is needed to prevent the race with refrigerator() which may 154 * Mark %current freezable and enter refrigerator if necessary.
150 * occur if the freezing of tasks fails. Namely, without the lock, if the
151 * freezing of tasks failed, thaw_tasks() might have run before a task in
152 * refrigerator() could call frozen_process(), in which case the task would be
153 * frozen and no one would thaw it.
154 */ 155 */
155int thaw_process(struct task_struct *p) 156bool set_freezable(void)
156{ 157{
157 task_lock(p); 158 might_sleep();
158 if (__thaw_process(p) == 1) { 159
159 task_unlock(p); 160 /*
160 wake_up_process(p); 161 * Modify flags while holding freezer_lock. This ensures the
161 return 1; 162 * freezer notices that we aren't frozen yet or the freezing
162 } 163 * condition is visible to try_to_freeze() below.
163 task_unlock(p); 164 */
164 return 0; 165 spin_lock_irq(&freezer_lock);
166 current->flags &= ~PF_NOFREEZE;
167 spin_unlock_irq(&freezer_lock);
168
169 return try_to_freeze();
165} 170}
166EXPORT_SYMBOL(thaw_process); 171EXPORT_SYMBOL(set_freezable);
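
The freezer rework above collapses the old sig_only/thaw_process() interface into freeze_task(p), __thaw_task(p) and a set_freezable() that can itself enter the refrigerator. A minimal consumer-side sketch, not part of the patch, of how a freezable kernel thread is expected to look with this API; the thread function and its sleep are illustrative only:

#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/delay.h>

/* Illustrative worker thread, assuming it was created with kthread_run(). */
static int example_freezable_thread(void *data)
{
	/*
	 * Clear PF_NOFREEZE under freezer_lock and, if a freeze is
	 * already in progress, enter the refrigerator right away
	 * (that is what the new set_freezable() above does).
	 */
	set_freezable();

	while (!kthread_should_stop()) {
		/* Park in the refrigerator whenever freezing(current). */
		try_to_freeze();

		/* ... one unit of work ... */
		msleep_interruptible(1000);
	}
	return 0;
}
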
diff --git a/kernel/futex.c b/kernel/futex.c
index 11cbe052b2e8..1614be20173d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -55,7 +55,7 @@
55#include <linux/pagemap.h> 55#include <linux/pagemap.h>
56#include <linux/syscalls.h> 56#include <linux/syscalls.h>
57#include <linux/signal.h> 57#include <linux/signal.h>
58#include <linux/module.h> 58#include <linux/export.h>
59#include <linux/magic.h> 59#include <linux/magic.h>
60#include <linux/pid.h> 60#include <linux/pid.h>
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
@@ -314,17 +314,29 @@ again:
314#endif 314#endif
315 315
316 lock_page(page_head); 316 lock_page(page_head);
317
318 /*
319 * If page_head->mapping is NULL, then it cannot be a PageAnon
320 * page; but it might be the ZERO_PAGE or in the gate area or
321 * in a special mapping (all cases which we are happy to fail);
322 * or it may have been a good file page when get_user_pages_fast
323 * found it, but truncated or holepunched or subjected to
324 * invalidate_complete_page2 before we got the page lock (also
325 * cases which we are happy to fail). And we hold a reference,
326 * so refcount care in invalidate_complete_page's remove_mapping
327 * prevents drop_caches from setting mapping to NULL beneath us.
328 *
329 * The case we do have to guard against is when memory pressure made
330 * shmem_writepage move it from filecache to swapcache beneath us:
331 * an unlikely race, but we do need to retry for page_head->mapping.
332 */
317 if (!page_head->mapping) { 333 if (!page_head->mapping) {
334 int shmem_swizzled = PageSwapCache(page_head);
318 unlock_page(page_head); 335 unlock_page(page_head);
319 put_page(page_head); 336 put_page(page_head);
320 /* 337 if (shmem_swizzled)
321 * ZERO_PAGE pages don't have a mapping. Avoid a busy loop 338 goto again;
322 * trying to find one. RW mapping would have COW'd (and thus 339 return -EFAULT;
323 * have a mapping) so this page is RO and won't ever change.
324 */
325 if ((page_head == ZERO_PAGE(address)))
326 return -EFAULT;
327 goto again;
328 } 340 }
329 341
330 /* 342 /*
@@ -854,7 +866,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
854{ 866{
855 struct task_struct *new_owner; 867 struct task_struct *new_owner;
856 struct futex_pi_state *pi_state = this->pi_state; 868 struct futex_pi_state *pi_state = this->pi_state;
857 u32 curval, newval; 869 u32 uninitialized_var(curval), newval;
858 870
859 if (!pi_state) 871 if (!pi_state)
860 return -EINVAL; 872 return -EINVAL;
@@ -916,7 +928,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
916 928
917static int unlock_futex_pi(u32 __user *uaddr, u32 uval) 929static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
918{ 930{
919 u32 oldval; 931 u32 uninitialized_var(oldval);
920 932
921 /* 933 /*
922 * There is no waiter, so we unlock the futex. The owner died 934 * There is no waiter, so we unlock the futex. The owner died
@@ -1576,7 +1588,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1576 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1588 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1577 struct futex_pi_state *pi_state = q->pi_state; 1589 struct futex_pi_state *pi_state = q->pi_state;
1578 struct task_struct *oldowner = pi_state->owner; 1590 struct task_struct *oldowner = pi_state->owner;
1579 u32 uval, curval, newval; 1591 u32 uval, uninitialized_var(curval), newval;
1580 int ret; 1592 int ret;
1581 1593
1582 /* Owner died? */ 1594 /* Owner died? */
@@ -1793,7 +1805,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1793 * 1805 *
1794 * Returns: 1806 * Returns:
1795 * 0 - uaddr contains val and hb has been locked 1807 * 0 - uaddr contains val and hb has been locked
1796 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked 1808 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1797 */ 1809 */
1798static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 1810static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1799 struct futex_q *q, struct futex_hash_bucket **hb) 1811 struct futex_q *q, struct futex_hash_bucket **hb)
@@ -2481,7 +2493,7 @@ err_unlock:
2481 */ 2493 */
2482int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) 2494int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2483{ 2495{
2484 u32 uval, nval, mval; 2496 u32 uval, uninitialized_var(nval), mval;
2485 2497
2486retry: 2498retry:
2487 if (get_user(uval, uaddr)) 2499 if (get_user(uval, uaddr))
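
The remaining futex hunks only wrap locals in uninitialized_var(): the values are always stored by the get_user()/cmpxchg paths before they are read, but older compilers cannot prove that and warn. As a rough sketch of what the annotation amounts to on gcc builds (the exact definition lives in include/linux/compiler-gcc.h):

/* roughly: */
#define uninitialized_var(x)	x = x

/*
 * So "u32 uninitialized_var(curval), newval;" is still an ordinary
 * local declaration; it merely silences a spurious "may be used
 * uninitialized" warning on paths gcc cannot analyse.
 */
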
diff --git a/kernel/groups.c b/kernel/groups.c
index 1cc476d52dd3..99b53d1eb7ea 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -2,7 +2,7 @@
2 * Supplementary group IDs 2 * Supplementary group IDs
3 */ 3 */
4#include <linux/cred.h> 4#include <linux/cred.h>
5#include <linux/module.h> 5#include <linux/export.h>
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/syscalls.h> 8#include <linux/syscalls.h>
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index a9205e32a059..ae34bf51682b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -32,7 +32,7 @@
32 */ 32 */
33 33
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/module.h> 35#include <linux/export.h>
36#include <linux/percpu.h> 36#include <linux/percpu.h>
37#include <linux/hrtimer.h> 37#include <linux/hrtimer.h>
38#include <linux/notifier.h> 38#include <linux/notifier.h>
@@ -885,10 +885,13 @@ static void __remove_hrtimer(struct hrtimer *timer,
885 struct hrtimer_clock_base *base, 885 struct hrtimer_clock_base *base,
886 unsigned long newstate, int reprogram) 886 unsigned long newstate, int reprogram)
887{ 887{
888 struct timerqueue_node *next_timer;
888 if (!(timer->state & HRTIMER_STATE_ENQUEUED)) 889 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
889 goto out; 890 goto out;
890 891
891 if (&timer->node == timerqueue_getnext(&base->active)) { 892 next_timer = timerqueue_getnext(&base->active);
893 timerqueue_del(&base->active, &timer->node);
894 if (&timer->node == next_timer) {
892#ifdef CONFIG_HIGH_RES_TIMERS 895#ifdef CONFIG_HIGH_RES_TIMERS
893 /* Reprogram the clock event device. if enabled */ 896 /* Reprogram the clock event device. if enabled */
894 if (reprogram && hrtimer_hres_active()) { 897 if (reprogram && hrtimer_hres_active()) {
@@ -901,7 +904,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
901 } 904 }
902#endif 905#endif
903 } 906 }
904 timerqueue_del(&base->active, &timer->node);
905 if (!timerqueue_getnext(&base->active)) 907 if (!timerqueue_getnext(&base->active))
906 base->cpu_base->active_bases &= ~(1 << base->index); 908 base->cpu_base->active_bases &= ~(1 << base->index);
907out: 909out:
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ea640120ab86..2e48ec0c2e91 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -13,7 +13,7 @@
13#include <linux/freezer.h> 13#include <linux/freezer.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/lockdep.h> 15#include <linux/lockdep.h>
16#include <linux/module.h> 16#include <linux/export.h>
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18 18
19/* 19/*
@@ -74,11 +74,17 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
74 74
75 /* 75 /*
76 * Ensure the task is not frozen. 76 * Ensure the task is not frozen.
77 * Also, when a freshly created task is scheduled once, changes 77 * Also, skip vfork and any other user process that freezer should skip.
78 * its state to TASK_UNINTERRUPTIBLE without having ever been
79 * switched out once, it musn't be checked.
80 */ 78 */
81 if (unlikely(t->flags & PF_FROZEN || !switch_count)) 79 if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
80 return;
81
82 /*
83 * When a freshly created task is scheduled once, changes its state to
84 * TASK_UNINTERRUPTIBLE without having ever been switched out once, it
85 * musn't be checked.
86 */
87 if (unlikely(!switch_count))
82 return; 88 return;
83 89
84 if (switch_count != t->last_switch_count) { 90 if (switch_count != t->last_switch_count) {
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index dc5114b4c16c..f7c543a801d9 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -26,7 +26,7 @@
26int irq_set_chip(unsigned int irq, struct irq_chip *chip) 26int irq_set_chip(unsigned int irq, struct irq_chip *chip)
27{ 27{
28 unsigned long flags; 28 unsigned long flags;
29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
30 30
31 if (!desc) 31 if (!desc)
32 return -EINVAL; 32 return -EINVAL;
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_set_chip);
54int irq_set_irq_type(unsigned int irq, unsigned int type) 54int irq_set_irq_type(unsigned int irq, unsigned int type)
55{ 55{
56 unsigned long flags; 56 unsigned long flags;
57 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 57 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
58 int ret = 0; 58 int ret = 0;
59 59
60 if (!desc) 60 if (!desc)
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(irq_set_irq_type);
78int irq_set_handler_data(unsigned int irq, void *data) 78int irq_set_handler_data(unsigned int irq, void *data)
79{ 79{
80 unsigned long flags; 80 unsigned long flags;
81 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 81 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
82 82
83 if (!desc) 83 if (!desc)
84 return -EINVAL; 84 return -EINVAL;
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(irq_set_handler_data);
98int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) 98int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
99{ 99{
100 unsigned long flags; 100 unsigned long flags;
101 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 101 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
102 102
103 if (!desc) 103 if (!desc)
104 return -EINVAL; 104 return -EINVAL;
@@ -119,7 +119,7 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
119int irq_set_chip_data(unsigned int irq, void *data) 119int irq_set_chip_data(unsigned int irq, void *data)
120{ 120{
121 unsigned long flags; 121 unsigned long flags;
122 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 122 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
123 123
124 if (!desc) 124 if (!desc)
125 return -EINVAL; 125 return -EINVAL;
@@ -204,6 +204,24 @@ void irq_disable(struct irq_desc *desc)
204 } 204 }
205} 205}
206 206
207void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu)
208{
209 if (desc->irq_data.chip->irq_enable)
210 desc->irq_data.chip->irq_enable(&desc->irq_data);
211 else
212 desc->irq_data.chip->irq_unmask(&desc->irq_data);
213 cpumask_set_cpu(cpu, desc->percpu_enabled);
214}
215
216void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu)
217{
218 if (desc->irq_data.chip->irq_disable)
219 desc->irq_data.chip->irq_disable(&desc->irq_data);
220 else
221 desc->irq_data.chip->irq_mask(&desc->irq_data);
222 cpumask_clear_cpu(cpu, desc->percpu_enabled);
223}
224
207static inline void mask_ack_irq(struct irq_desc *desc) 225static inline void mask_ack_irq(struct irq_desc *desc)
208{ 226{
209 if (desc->irq_data.chip->irq_mask_ack) 227 if (desc->irq_data.chip->irq_mask_ack)
@@ -544,12 +562,44 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
544 chip->irq_eoi(&desc->irq_data); 562 chip->irq_eoi(&desc->irq_data);
545} 563}
546 564
565/**
566 * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids
567 * @irq: the interrupt number
568 * @desc: the interrupt description structure for this irq
569 *
570 * Per CPU interrupts on SMP machines without locking requirements. Same as
571 * handle_percpu_irq() above but with the following extras:
572 *
573 * action->percpu_dev_id is a pointer to percpu variables which
574 * contain the real device id for the cpu on which this handler is
575 * called
576 */
577void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
578{
579 struct irq_chip *chip = irq_desc_get_chip(desc);
580 struct irqaction *action = desc->action;
581 void *dev_id = __this_cpu_ptr(action->percpu_dev_id);
582 irqreturn_t res;
583
584 kstat_incr_irqs_this_cpu(irq, desc);
585
586 if (chip->irq_ack)
587 chip->irq_ack(&desc->irq_data);
588
589 trace_irq_handler_entry(irq, action);
590 res = action->handler(irq, dev_id);
591 trace_irq_handler_exit(irq, action, res);
592
593 if (chip->irq_eoi)
594 chip->irq_eoi(&desc->irq_data);
595}
596
547void 597void
548__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 598__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
549 const char *name) 599 const char *name)
550{ 600{
551 unsigned long flags; 601 unsigned long flags;
552 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 602 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
553 603
554 if (!desc) 604 if (!desc)
555 return; 605 return;
@@ -593,7 +643,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
593void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) 643void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
594{ 644{
595 unsigned long flags; 645 unsigned long flags;
596 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 646 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
597 647
598 if (!desc) 648 if (!desc)
599 return; 649 return;
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index e38544dddb18..c89295a8f668 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -6,6 +6,7 @@
6#include <linux/io.h> 6#include <linux/io.h>
7#include <linux/irq.h> 7#include <linux/irq.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/export.h>
9#include <linux/interrupt.h> 10#include <linux/interrupt.h>
10#include <linux/kernel_stat.h> 11#include <linux/kernel_stat.h>
11#include <linux/syscore_ops.h> 12#include <linux/syscore_ops.h>
@@ -211,6 +212,7 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
211 } 212 }
212 return gc; 213 return gc;
213} 214}
215EXPORT_SYMBOL_GPL(irq_alloc_generic_chip);
214 216
215/* 217/*
216 * Separate lockdep class for interrupt chip which can nest irq_desc 218 * Separate lockdep class for interrupt chip which can nest irq_desc
@@ -258,6 +260,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
258 } 260 }
259 gc->irq_cnt = i - gc->irq_base; 261 gc->irq_cnt = i - gc->irq_base;
260} 262}
263EXPORT_SYMBOL_GPL(irq_setup_generic_chip);
261 264
262/** 265/**
263 * irq_setup_alt_chip - Switch to alternative chip 266 * irq_setup_alt_chip - Switch to alternative chip
@@ -281,6 +284,7 @@ int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
281 } 284 }
282 return -EINVAL; 285 return -EINVAL;
283} 286}
287EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
284 288
285/** 289/**
286 * irq_remove_generic_chip - Remove a chip 290 * irq_remove_generic_chip - Remove a chip
@@ -311,6 +315,7 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
311 irq_modify_status(i, clr, set); 315 irq_modify_status(i, clr, set);
312 } 316 }
313} 317}
318EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
314 319
315#ifdef CONFIG_PM 320#ifdef CONFIG_PM
316static int irq_gc_suspend(void) 321static int irq_gc_suspend(void)
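
The EXPORT_SYMBOL_GPL additions above make the generic-chip helpers usable from modular irqchip drivers rather than only from built-in code. A hedged sketch of that use for a hypothetical memory-mapped controller; the base address, register offsets, irq range and helper choice are all made up for illustration:

#include <linux/irq.h>
#include <linux/io.h>

#define EXAMPLE_IRQ_BASE	64	/* made-up Linux irq range  */
#define EXAMPLE_NR_IRQS		32
#define EXAMPLE_REG_ENABLE	0x00	/* made-up register offsets */
#define EXAMPLE_REG_DISABLE	0x04
#define EXAMPLE_REG_ACK		0x08

static void example_init_irqchip(void __iomem *base)
{
	struct irq_chip_generic *gc;
	struct irq_chip_type *ct;

	/* One chip type, level flow handler, 32 interrupts. */
	gc = irq_alloc_generic_chip("example", 1, EXAMPLE_IRQ_BASE,
				    base, handle_level_irq);
	if (!gc)
		return;

	ct = gc->chip_types;
	ct->chip.irq_ack = irq_gc_ack_set_bit;
	ct->chip.irq_mask = irq_gc_mask_disable_reg;
	ct->chip.irq_unmask = irq_gc_unmask_enable_reg;
	ct->regs.enable = EXAMPLE_REG_ENABLE;
	ct->regs.disable = EXAMPLE_REG_DISABLE;
	ct->regs.ack = EXAMPLE_REG_ACK;

	/* Install the chip on all 32 lines and make them requestable. */
	irq_setup_generic_chip(gc, IRQ_MSK(EXAMPLE_NR_IRQS),
			       IRQ_GC_INIT_MASK_CACHE,
			       IRQ_NOREQUEST, 0);
}
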
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 6546431447d7..b7952316016a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -15,7 +15,7 @@
15 15
16#define istate core_internal_state__do_not_mess_with_it 16#define istate core_internal_state__do_not_mess_with_it
17 17
18extern int noirqdebug; 18extern bool noirqdebug;
19 19
20/* 20/*
21 * Bits used by threaded handlers: 21 * Bits used by threaded handlers:
@@ -71,6 +71,8 @@ extern int irq_startup(struct irq_desc *desc);
71extern void irq_shutdown(struct irq_desc *desc); 71extern void irq_shutdown(struct irq_desc *desc);
72extern void irq_enable(struct irq_desc *desc); 72extern void irq_enable(struct irq_desc *desc);
73extern void irq_disable(struct irq_desc *desc); 73extern void irq_disable(struct irq_desc *desc);
74extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
75extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
74extern void mask_irq(struct irq_desc *desc); 76extern void mask_irq(struct irq_desc *desc);
75extern void unmask_irq(struct irq_desc *desc); 77extern void unmask_irq(struct irq_desc *desc);
76 78
@@ -114,14 +116,21 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
114 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); 116 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
115} 117}
116 118
119#define _IRQ_DESC_CHECK (1 << 0)
120#define _IRQ_DESC_PERCPU (1 << 1)
121
122#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK)
123#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
124
117struct irq_desc * 125struct irq_desc *
118__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); 126__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
127 unsigned int check);
119void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); 128void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
120 129
121static inline struct irq_desc * 130static inline struct irq_desc *
122irq_get_desc_buslock(unsigned int irq, unsigned long *flags) 131irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check)
123{ 132{
124 return __irq_get_desc_lock(irq, flags, true); 133 return __irq_get_desc_lock(irq, flags, true, check);
125} 134}
126 135
127static inline void 136static inline void
@@ -131,9 +140,9 @@ irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
131} 140}
132 141
133static inline struct irq_desc * 142static inline struct irq_desc *
134irq_get_desc_lock(unsigned int irq, unsigned long *flags) 143irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check)
135{ 144{
136 return __irq_get_desc_lock(irq, flags, false); 145 return __irq_get_desc_lock(irq, flags, false, check);
137} 146}
138 147
139static inline void 148static inline void
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 039b889ea053..d86e254b95eb 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -9,7 +9,7 @@
9 */ 9 */
10#include <linux/irq.h> 10#include <linux/irq.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/radix-tree.h> 15#include <linux/radix-tree.h>
@@ -424,11 +424,22 @@ unsigned int irq_get_next_irq(unsigned int offset)
424} 424}
425 425
426struct irq_desc * 426struct irq_desc *
427__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) 427__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
428 unsigned int check)
428{ 429{
429 struct irq_desc *desc = irq_to_desc(irq); 430 struct irq_desc *desc = irq_to_desc(irq);
430 431
431 if (desc) { 432 if (desc) {
433 if (check & _IRQ_DESC_CHECK) {
434 if ((check & _IRQ_DESC_PERCPU) &&
435 !irq_settings_is_per_cpu_devid(desc))
436 return NULL;
437
438 if (!(check & _IRQ_DESC_PERCPU) &&
439 irq_settings_is_per_cpu_devid(desc))
440 return NULL;
441 }
442
432 if (bus) 443 if (bus)
433 chip_bus_lock(desc); 444 chip_bus_lock(desc);
434 raw_spin_lock_irqsave(&desc->lock, *flags); 445 raw_spin_lock_irqsave(&desc->lock, *flags);
@@ -443,6 +454,25 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
443 chip_bus_sync_unlock(desc); 454 chip_bus_sync_unlock(desc);
444} 455}
445 456
457int irq_set_percpu_devid(unsigned int irq)
458{
459 struct irq_desc *desc = irq_to_desc(irq);
460
461 if (!desc)
462 return -EINVAL;
463
464 if (desc->percpu_enabled)
465 return -EINVAL;
466
467 desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL);
468
469 if (!desc->percpu_enabled)
470 return -ENOMEM;
471
472 irq_set_percpu_devid_flags(irq);
473 return 0;
474}
475
446/** 476/**
447 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 477 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
448 * @irq: irq number to initialize 478 * @irq: irq number to initialize
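
irq_set_percpu_devid() above is the setup-time half of the per-cpu dev_id support: the interrupt controller or architecture code calls it once per such line to allocate desc->percpu_enabled and flag the descriptor, then installs the handle_percpu_devid_irq flow handler added in chip.c. A hedged sketch of that wiring (the chip pointer and irq number are placeholders):

#include <linux/irq.h>

/* Illustrative: prepare one hwirq (e.g. a local-timer PPI) so that
 * drivers can later claim it with request_percpu_irq(). */
static void example_setup_percpu_devid_irq(unsigned int irq,
					    struct irq_chip *chip)
{
	/* Allocates desc->percpu_enabled, sets IRQ_PER_CPU_DEVID & co. */
	if (irq_set_percpu_devid(irq))
		return;

	/* Flow handler that hands each CPU its own dev_id instance. */
	irq_set_chip_and_handler(irq, chip, handle_percpu_devid_irq);
}
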
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index b57a3776de44..1f9e26526b69 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -20,15 +20,15 @@ static DEFINE_MUTEX(irq_domain_mutex);
20void irq_domain_add(struct irq_domain *domain) 20void irq_domain_add(struct irq_domain *domain)
21{ 21{
22 struct irq_data *d; 22 struct irq_data *d;
23 int hwirq; 23 int hwirq, irq;
24 24
25 /* 25 /*
26 * This assumes that the irq_domain owner has already allocated 26 * This assumes that the irq_domain owner has already allocated
27 * the irq_descs. This block will be removed when support for dynamic 27 * the irq_descs. This block will be removed when support for dynamic
28 * allocation of irq_descs is added to irq_domain. 28 * allocation of irq_descs is added to irq_domain.
29 */ 29 */
30 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { 30 irq_domain_for_each_irq(domain, hwirq, irq) {
31 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); 31 d = irq_get_irq_data(irq);
32 if (!d) { 32 if (!d) {
33 WARN(1, "error: assigning domain to non existant irq_desc"); 33 WARN(1, "error: assigning domain to non existant irq_desc");
34 return; 34 return;
@@ -54,15 +54,15 @@ void irq_domain_add(struct irq_domain *domain)
54void irq_domain_del(struct irq_domain *domain) 54void irq_domain_del(struct irq_domain *domain)
55{ 55{
56 struct irq_data *d; 56 struct irq_data *d;
57 int hwirq; 57 int hwirq, irq;
58 58
59 mutex_lock(&irq_domain_mutex); 59 mutex_lock(&irq_domain_mutex);
60 list_del(&domain->list); 60 list_del(&domain->list);
61 mutex_unlock(&irq_domain_mutex); 61 mutex_unlock(&irq_domain_mutex);
62 62
63 /* Clear the irq_domain assignments */ 63 /* Clear the irq_domain assignments */
64 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { 64 irq_domain_for_each_irq(domain, hwirq, irq) {
65 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); 65 d = irq_get_irq_data(irq);
66 d->domain = NULL; 66 d->domain = NULL;
67 } 67 }
68} 68}
@@ -135,6 +135,9 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
135 return -EINVAL; 135 return -EINVAL;
136 if (intsize < 1) 136 if (intsize < 1)
137 return -EINVAL; 137 return -EINVAL;
138 if (d->nr_irq && ((intspec[0] < d->hwirq_base) ||
139 (intspec[0] >= d->hwirq_base + d->nr_irq)))
140 return -EINVAL;
138 141
139 *out_hwirq = intspec[0]; 142 *out_hwirq = intspec[0];
140 *out_type = IRQ_TYPE_NONE; 143 *out_type = IRQ_TYPE_NONE;
@@ -143,11 +146,6 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
143 return 0; 146 return 0;
144} 147}
145 148
146struct irq_domain_ops irq_domain_simple_ops = {
147 .dt_translate = irq_domain_simple_dt_translate,
148};
149EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
150
151/** 149/**
152 * irq_domain_create_simple() - Set up a 'simple' translation range 150 * irq_domain_create_simple() - Set up a 'simple' translation range
153 */ 151 */
@@ -182,3 +180,10 @@ void irq_domain_generate_simple(const struct of_device_id *match,
182} 180}
183EXPORT_SYMBOL_GPL(irq_domain_generate_simple); 181EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
184#endif /* CONFIG_OF_IRQ */ 182#endif /* CONFIG_OF_IRQ */
183
184struct irq_domain_ops irq_domain_simple_ops = {
185#ifdef CONFIG_OF_IRQ
186 .dt_translate = irq_domain_simple_dt_translate,
187#endif /* CONFIG_OF_IRQ */
188};
189EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9b956fa20308..a9a9dbe49fea 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -195,7 +195,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
195int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) 195int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
196{ 196{
197 unsigned long flags; 197 unsigned long flags;
198 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 198 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
199 199
200 if (!desc) 200 if (!desc)
201 return -EINVAL; 201 return -EINVAL;
@@ -356,7 +356,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
356static int __disable_irq_nosync(unsigned int irq) 356static int __disable_irq_nosync(unsigned int irq)
357{ 357{
358 unsigned long flags; 358 unsigned long flags;
359 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 359 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
360 360
361 if (!desc) 361 if (!desc)
362 return -EINVAL; 362 return -EINVAL;
@@ -448,7 +448,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
448void enable_irq(unsigned int irq) 448void enable_irq(unsigned int irq)
449{ 449{
450 unsigned long flags; 450 unsigned long flags;
451 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 451 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
452 452
453 if (!desc) 453 if (!desc)
454 return; 454 return;
@@ -467,6 +467,9 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
467 struct irq_desc *desc = irq_to_desc(irq); 467 struct irq_desc *desc = irq_to_desc(irq);
468 int ret = -ENXIO; 468 int ret = -ENXIO;
469 469
470 if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE)
471 return 0;
472
470 if (desc->irq_data.chip->irq_set_wake) 473 if (desc->irq_data.chip->irq_set_wake)
471 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); 474 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
472 475
@@ -488,7 +491,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
488int irq_set_irq_wake(unsigned int irq, unsigned int on) 491int irq_set_irq_wake(unsigned int irq, unsigned int on)
489{ 492{
490 unsigned long flags; 493 unsigned long flags;
491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 494 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
492 int ret = 0; 495 int ret = 0;
493 496
494 if (!desc) 497 if (!desc)
@@ -529,7 +532,7 @@ EXPORT_SYMBOL(irq_set_irq_wake);
529int can_request_irq(unsigned int irq, unsigned long irqflags) 532int can_request_irq(unsigned int irq, unsigned long irqflags)
530{ 533{
531 unsigned long flags; 534 unsigned long flags;
532 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 535 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
533 int canrequest = 0; 536 int canrequest = 0;
534 537
535 if (!desc) 538 if (!desc)
@@ -620,8 +623,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
620 623
621static int irq_wait_for_interrupt(struct irqaction *action) 624static int irq_wait_for_interrupt(struct irqaction *action)
622{ 625{
626 set_current_state(TASK_INTERRUPTIBLE);
627
623 while (!kthread_should_stop()) { 628 while (!kthread_should_stop()) {
624 set_current_state(TASK_INTERRUPTIBLE);
625 629
626 if (test_and_clear_bit(IRQTF_RUNTHREAD, 630 if (test_and_clear_bit(IRQTF_RUNTHREAD,
627 &action->thread_flags)) { 631 &action->thread_flags)) {
@@ -629,7 +633,9 @@ static int irq_wait_for_interrupt(struct irqaction *action)
629 return 0; 633 return 0;
630 } 634 }
631 schedule(); 635 schedule();
636 set_current_state(TASK_INTERRUPTIBLE);
632 } 637 }
638 __set_current_state(TASK_RUNNING);
633 return -1; 639 return -1;
634} 640}
635 641
@@ -1118,6 +1124,8 @@ int setup_irq(unsigned int irq, struct irqaction *act)
1118 int retval; 1124 int retval;
1119 struct irq_desc *desc = irq_to_desc(irq); 1125 struct irq_desc *desc = irq_to_desc(irq);
1120 1126
1127 if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1128 return -EINVAL;
1121 chip_bus_lock(desc); 1129 chip_bus_lock(desc);
1122 retval = __setup_irq(irq, desc, act); 1130 retval = __setup_irq(irq, desc, act);
1123 chip_bus_sync_unlock(desc); 1131 chip_bus_sync_unlock(desc);
@@ -1126,7 +1134,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
1126} 1134}
1127EXPORT_SYMBOL_GPL(setup_irq); 1135EXPORT_SYMBOL_GPL(setup_irq);
1128 1136
1129 /* 1137/*
1130 * Internal function to unregister an irqaction - used to free 1138 * Internal function to unregister an irqaction - used to free
1131 * regular and special interrupts that are part of the architecture. 1139 * regular and special interrupts that are part of the architecture.
1132 */ 1140 */
@@ -1224,7 +1232,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1224 */ 1232 */
1225void remove_irq(unsigned int irq, struct irqaction *act) 1233void remove_irq(unsigned int irq, struct irqaction *act)
1226{ 1234{
1227 __free_irq(irq, act->dev_id); 1235 struct irq_desc *desc = irq_to_desc(irq);
1236
1237 if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1238 __free_irq(irq, act->dev_id);
1228} 1239}
1229EXPORT_SYMBOL_GPL(remove_irq); 1240EXPORT_SYMBOL_GPL(remove_irq);
1230 1241
@@ -1246,7 +1257,7 @@ void free_irq(unsigned int irq, void *dev_id)
1246{ 1257{
1247 struct irq_desc *desc = irq_to_desc(irq); 1258 struct irq_desc *desc = irq_to_desc(irq);
1248 1259
1249 if (!desc) 1260 if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1250 return; 1261 return;
1251 1262
1252#ifdef CONFIG_SMP 1263#ifdef CONFIG_SMP
@@ -1281,7 +1292,7 @@ EXPORT_SYMBOL(free_irq);
1281 * and to set up the interrupt handler in the right order. 1292 * and to set up the interrupt handler in the right order.
1282 * 1293 *
1283 * If you want to set up a threaded irq handler for your device 1294 * If you want to set up a threaded irq handler for your device
1284 * then you need to supply @handler and @thread_fn. @handler ist 1295 * then you need to supply @handler and @thread_fn. @handler is
1285 * still called in hard interrupt context and has to check 1296 * still called in hard interrupt context and has to check
1286 * whether the interrupt originates from the device. If yes it 1297 * whether the interrupt originates from the device. If yes it
1287 * needs to disable the interrupt on the device and return 1298 * needs to disable the interrupt on the device and return
@@ -1324,7 +1335,8 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1324 if (!desc) 1335 if (!desc)
1325 return -EINVAL; 1336 return -EINVAL;
1326 1337
1327 if (!irq_settings_can_request(desc)) 1338 if (!irq_settings_can_request(desc) ||
1339 WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1328 return -EINVAL; 1340 return -EINVAL;
1329 1341
1330 if (!handler) { 1342 if (!handler) {
@@ -1409,3 +1421,194 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1409 return !ret ? IRQC_IS_HARDIRQ : ret; 1421 return !ret ? IRQC_IS_HARDIRQ : ret;
1410} 1422}
1411EXPORT_SYMBOL_GPL(request_any_context_irq); 1423EXPORT_SYMBOL_GPL(request_any_context_irq);
1424
1425void enable_percpu_irq(unsigned int irq, unsigned int type)
1426{
1427 unsigned int cpu = smp_processor_id();
1428 unsigned long flags;
1429 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
1430
1431 if (!desc)
1432 return;
1433
1434 type &= IRQ_TYPE_SENSE_MASK;
1435 if (type != IRQ_TYPE_NONE) {
1436 int ret;
1437
1438 ret = __irq_set_trigger(desc, irq, type);
1439
1440 if (ret) {
1441 WARN(1, "failed to set type for IRQ%d\n", irq);
1442 goto out;
1443 }
1444 }
1445
1446 irq_percpu_enable(desc, cpu);
1447out:
1448 irq_put_desc_unlock(desc, flags);
1449}
1450
1451void disable_percpu_irq(unsigned int irq)
1452{
1453 unsigned int cpu = smp_processor_id();
1454 unsigned long flags;
1455 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
1456
1457 if (!desc)
1458 return;
1459
1460 irq_percpu_disable(desc, cpu);
1461 irq_put_desc_unlock(desc, flags);
1462}
1463
1464/*
1465 * Internal function to unregister a percpu irqaction.
1466 */
1467static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id)
1468{
1469 struct irq_desc *desc = irq_to_desc(irq);
1470 struct irqaction *action;
1471 unsigned long flags;
1472
1473 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
1474
1475 if (!desc)
1476 return NULL;
1477
1478 raw_spin_lock_irqsave(&desc->lock, flags);
1479
1480 action = desc->action;
1481 if (!action || action->percpu_dev_id != dev_id) {
1482 WARN(1, "Trying to free already-free IRQ %d\n", irq);
1483 goto bad;
1484 }
1485
1486 if (!cpumask_empty(desc->percpu_enabled)) {
1487 WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
1488 irq, cpumask_first(desc->percpu_enabled));
1489 goto bad;
1490 }
1491
1492 /* Found it - now remove it from the list of entries: */
1493 desc->action = NULL;
1494
1495 raw_spin_unlock_irqrestore(&desc->lock, flags);
1496
1497 unregister_handler_proc(irq, action);
1498
1499 module_put(desc->owner);
1500 return action;
1501
1502bad:
1503 raw_spin_unlock_irqrestore(&desc->lock, flags);
1504 return NULL;
1505}
1506
1507/**
1508 * remove_percpu_irq - free a per-cpu interrupt
1509 * @irq: Interrupt line to free
1510 * @act: irqaction for the interrupt
1511 *
1512 * Used to remove interrupts statically setup by the early boot process.
1513 */
1514void remove_percpu_irq(unsigned int irq, struct irqaction *act)
1515{
1516 struct irq_desc *desc = irq_to_desc(irq);
1517
1518 if (desc && irq_settings_is_per_cpu_devid(desc))
1519 __free_percpu_irq(irq, act->percpu_dev_id);
1520}
1521
1522/**
1523 * free_percpu_irq - free an interrupt allocated with request_percpu_irq
1524 * @irq: Interrupt line to free
1525 * @dev_id: Device identity to free
1526 *
1527 * Remove a percpu interrupt handler. The handler is removed, but
1528 * the interrupt line is not disabled. This must be done on each
1529 * CPU before calling this function. The function does not return
1530 * until any executing interrupts for this IRQ have completed.
1531 *
1532 * This function must not be called from interrupt context.
1533 */
1534void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
1535{
1536 struct irq_desc *desc = irq_to_desc(irq);
1537
1538 if (!desc || !irq_settings_is_per_cpu_devid(desc))
1539 return;
1540
1541 chip_bus_lock(desc);
1542 kfree(__free_percpu_irq(irq, dev_id));
1543 chip_bus_sync_unlock(desc);
1544}
1545
1546/**
1547 * setup_percpu_irq - setup a per-cpu interrupt
1548 * @irq: Interrupt line to setup
1549 * @act: irqaction for the interrupt
1550 *
1551 * Used to statically setup per-cpu interrupts in the early boot process.
1552 */
1553int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1554{
1555 struct irq_desc *desc = irq_to_desc(irq);
1556 int retval;
1557
1558 if (!desc || !irq_settings_is_per_cpu_devid(desc))
1559 return -EINVAL;
1560 chip_bus_lock(desc);
1561 retval = __setup_irq(irq, desc, act);
1562 chip_bus_sync_unlock(desc);
1563
1564 return retval;
1565}
1566
1567/**
1568 * request_percpu_irq - allocate a percpu interrupt line
1569 * @irq: Interrupt line to allocate
1570 * @handler: Function to be called when the IRQ occurs.
1571 * @devname: An ascii name for the claiming device
1572 * @dev_id: A percpu cookie passed back to the handler function
1573 *
1574 * This call allocates interrupt resources, but doesn't
1575 * automatically enable the interrupt. It has to be done on each
1576 * CPU using enable_percpu_irq().
1577 *
1578 * Dev_id must be globally unique. It is a per-cpu variable, and
1579 * the handler gets called with the interrupted CPU's instance of
1580 * that variable.
1581 */
1582int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1583 const char *devname, void __percpu *dev_id)
1584{
1585 struct irqaction *action;
1586 struct irq_desc *desc;
1587 int retval;
1588
1589 if (!dev_id)
1590 return -EINVAL;
1591
1592 desc = irq_to_desc(irq);
1593 if (!desc || !irq_settings_can_request(desc) ||
1594 !irq_settings_is_per_cpu_devid(desc))
1595 return -EINVAL;
1596
1597 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
1598 if (!action)
1599 return -ENOMEM;
1600
1601 action->handler = handler;
1602 action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND;
1603 action->name = devname;
1604 action->percpu_dev_id = dev_id;
1605
1606 chip_bus_lock(desc);
1607 retval = __setup_irq(irq, desc, action);
1608 chip_bus_sync_unlock(desc);
1609
1610 if (retval)
1611 kfree(action);
1612
1613 return retval;
1614}
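
request_percpu_irq()/enable_percpu_irq() above are the driver-facing half: the cookie is a percpu variable and the handler receives the invoking CPU's instance of it, as handle_percpu_devid_irq() shows in the chip.c hunk. A minimal consumer sketch, assuming the line was prepared with irq_set_percpu_devid() by platform code; the percpu structure and names are illustrative:

#include <linux/interrupt.h>
#include <linux/percpu.h>

struct example_percpu_state {
	unsigned long count;		/* illustrative per-cpu bookkeeping */
};

static DEFINE_PER_CPU(struct example_percpu_state, example_state);

static irqreturn_t example_percpu_handler(int irq, void *dev_id)
{
	/* dev_id is this CPU's instance of example_state. */
	struct example_percpu_state *st = dev_id;

	st->count++;
	return IRQ_HANDLED;
}

static int example_percpu_setup(unsigned int irq)
{
	int err;

	err = request_percpu_irq(irq, example_percpu_handler,
				 "example-percpu", &example_state);
	if (err)
		return err;

	/*
	 * Unlike request_irq(), nothing is unmasked yet: each CPU has
	 * to enable its own copy of the line, typically from a CPU
	 * notifier or an on_each_cpu() callback.
	 */
	enable_percpu_irq(irq, IRQ_TYPE_NONE);
	return 0;
}
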
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index f76fc00c9877..15e53b1766a6 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,6 +9,7 @@
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/syscore_ops.h>
12 13
13#include "internals.h" 14#include "internals.h"
14 15
@@ -39,25 +40,58 @@ void suspend_device_irqs(void)
39} 40}
40EXPORT_SYMBOL_GPL(suspend_device_irqs); 41EXPORT_SYMBOL_GPL(suspend_device_irqs);
41 42
42/** 43static void resume_irqs(bool want_early)
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQS_SUSPENDED flag set.
47 */
48void resume_device_irqs(void)
49{ 44{
50 struct irq_desc *desc; 45 struct irq_desc *desc;
51 int irq; 46 int irq;
52 47
53 for_each_irq_desc(irq, desc) { 48 for_each_irq_desc(irq, desc) {
54 unsigned long flags; 49 unsigned long flags;
50 bool is_early = desc->action &&
51 desc->action->flags & IRQF_EARLY_RESUME;
52
53 if (is_early != want_early)
54 continue;
55 55
56 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
57 __enable_irq(desc, irq, true); 57 __enable_irq(desc, irq, true);
58 raw_spin_unlock_irqrestore(&desc->lock, flags); 58 raw_spin_unlock_irqrestore(&desc->lock, flags);
59 } 59 }
60} 60}
61
62/**
63 * irq_pm_syscore_ops - enable interrupt lines early
64 *
65 * Enable all interrupt lines with %IRQF_EARLY_RESUME set.
66 */
67static void irq_pm_syscore_resume(void)
68{
69 resume_irqs(true);
70}
71
72static struct syscore_ops irq_pm_syscore_ops = {
73 .resume = irq_pm_syscore_resume,
74};
75
76static int __init irq_pm_init_ops(void)
77{
78 register_syscore_ops(&irq_pm_syscore_ops);
79 return 0;
80}
81
82device_initcall(irq_pm_init_ops);
83
84/**
85 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
86 *
87 * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously
88 * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag
89 * set as well as those with %IRQF_FORCE_RESUME.
90 */
91void resume_device_irqs(void)
92{
93 resume_irqs(false);
94}
61EXPORT_SYMBOL_GPL(resume_device_irqs); 95EXPORT_SYMBOL_GPL(resume_device_irqs);
62 96
63/** 97/**
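
The pm.c rework splits resume into two passes: actions flagged IRQF_EARLY_RESUME are re-enabled from a syscore op, before ordinary device resume, while everything else keeps going through resume_device_irqs(). From a driver the only visible change is the flag at request time; a hedged sketch with placeholder names:

#include <linux/interrupt.h>

static irqreturn_t example_early_handler(int irq, void *dev)
{
	return IRQ_HANDLED;
}

static int example_request_early_irq(unsigned int irq, void *dev)
{
	/*
	 * Re-enabled by irq_pm_syscore_resume(), i.e. before
	 * resume_device_irqs() runs for the non-early lines.
	 */
	return request_irq(irq, example_early_handler, IRQF_EARLY_RESUME,
			   "example-early", dev);
}
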
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index f1667833d444..1162f1030f18 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -13,6 +13,7 @@ enum {
13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, 13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING, 14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, 15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
16 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
16 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, 17 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
17}; 18};
18 19
@@ -24,6 +25,7 @@ enum {
24#define IRQ_NOTHREAD GOT_YOU_MORON 25#define IRQ_NOTHREAD GOT_YOU_MORON
25#define IRQ_NOAUTOEN GOT_YOU_MORON 26#define IRQ_NOAUTOEN GOT_YOU_MORON
26#define IRQ_NESTED_THREAD GOT_YOU_MORON 27#define IRQ_NESTED_THREAD GOT_YOU_MORON
28#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
27#undef IRQF_MODIFY_MASK 29#undef IRQF_MODIFY_MASK
28#define IRQF_MODIFY_MASK GOT_YOU_MORON 30#define IRQF_MODIFY_MASK GOT_YOU_MORON
29 31
@@ -39,6 +41,11 @@ static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
39 return desc->status_use_accessors & _IRQ_PER_CPU; 41 return desc->status_use_accessors & _IRQ_PER_CPU;
40} 42}
41 43
44static inline bool irq_settings_is_per_cpu_devid(struct irq_desc *desc)
45{
46 return desc->status_use_accessors & _IRQ_PER_CPU_DEVID;
47}
48
42static inline void irq_settings_set_per_cpu(struct irq_desc *desc) 49static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
43{ 50{
44 desc->status_use_accessors |= _IRQ_PER_CPU; 51 desc->status_use_accessors |= _IRQ_PER_CPU;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index aa57d5da18c1..611cd6003c45 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -84,7 +84,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
84 */ 84 */
85 action = desc->action; 85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) || 86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || !action->next) 87 (action->flags & __IRQF_TIMER) ||
88 (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
89 !action->next)
88 goto out; 90 goto out;
89 91
90 /* Already running on another processor */ 92 /* Already running on another processor */
@@ -115,7 +117,7 @@ static int misrouted_irq(int irq)
115 struct irq_desc *desc; 117 struct irq_desc *desc;
116 int i, ok = 0; 118 int i, ok = 0;
117 119
118 if (atomic_inc_return(&irq_poll_active) == 1) 120 if (atomic_inc_return(&irq_poll_active) != 1)
119 goto out; 121 goto out;
120 122
121 irq_poll_cpu = smp_processor_id(); 123 irq_poll_cpu = smp_processor_id();
@@ -323,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
323 desc->irqs_unhandled = 0; 325 desc->irqs_unhandled = 0;
324} 326}
325 327
326int noirqdebug __read_mostly; 328bool noirqdebug __read_mostly;
327 329
328int noirqdebug_setup(char *str) 330int noirqdebug_setup(char *str)
329{ 331{
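
The misrouted_irq() hunk above fixes an inverted guard: the poller is supposed to run only when this CPU is the sole one inside it, but the old test bailed out exactly when it was first in. The intended single-poller idiom, sketched in isolation with illustrative names:

#include <linux/atomic.h>

static atomic_t example_poll_active = ATOMIC_INIT(0);

static void example_poll(void)
{
	/*
	 * atomic_inc_return() == 1 means we were first to enter; any
	 * other value means another CPU is already polling, so back
	 * off rather than race it.
	 */
	if (atomic_inc_return(&example_poll_active) != 1)
		goto out;

	/* ... walk and poll the interrupt lines here ... */

out:
	atomic_dec(&example_poll_active);
}
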
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index c58fa7da8aef..c3c46c72046e 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -6,9 +6,11 @@
6 */ 6 */
7 7
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/irq_work.h> 10#include <linux/irq_work.h>
11#include <linux/percpu.h>
11#include <linux/hardirq.h> 12#include <linux/hardirq.h>
13#include <asm/processor.h>
12 14
13/* 15/*
14 * An entry can be in one of four states: 16 * An entry can be in one of four states:
@@ -17,54 +19,34 @@
17 * claimed NULL, 3 -> {pending} : claimed to be enqueued 19 * claimed NULL, 3 -> {pending} : claimed to be enqueued
18 * pending next, 3 -> {busy} : queued, pending callback 20 * pending next, 3 -> {busy} : queued, pending callback
19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed 21 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
23 */ 22 */
24 23
25#define IRQ_WORK_PENDING 1UL 24#define IRQ_WORK_PENDING 1UL
26#define IRQ_WORK_BUSY 2UL 25#define IRQ_WORK_BUSY 2UL
27#define IRQ_WORK_FLAGS 3UL 26#define IRQ_WORK_FLAGS 3UL
28 27
29static inline bool irq_work_is_set(struct irq_work *entry, int flags) 28static DEFINE_PER_CPU(struct llist_head, irq_work_list);
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
41static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
42{
43 unsigned long next = (unsigned long)entry;
44 next |= flags;
45 return (struct irq_work *)next;
46}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
49 29
50/* 30/*
51 * Claim the entry so that no one else will poke at it. 31 * Claim the entry so that no one else will poke at it.
52 */ 32 */
53static bool irq_work_claim(struct irq_work *entry) 33static bool irq_work_claim(struct irq_work *work)
54{ 34{
55 struct irq_work *next, *nflags; 35 unsigned long flags, nflags;
56 36
57 do { 37 for (;;) {
58 next = entry->next; 38 flags = work->flags;
59 if ((unsigned long)next & IRQ_WORK_PENDING) 39 if (flags & IRQ_WORK_PENDING)
60 return false; 40 return false;
61 nflags = next_flags(next, IRQ_WORK_FLAGS); 41 nflags = flags | IRQ_WORK_FLAGS;
62 } while (cmpxchg(&entry->next, next, nflags) != next); 42 if (cmpxchg(&work->flags, flags, nflags) == flags)
43 break;
44 cpu_relax();
45 }
63 46
64 return true; 47 return true;
65} 48}
66 49
67
68void __weak arch_irq_work_raise(void) 50void __weak arch_irq_work_raise(void)
69{ 51{
70 /* 52 /*
@@ -75,20 +57,15 @@ void __weak arch_irq_work_raise(void)
75/* 57/*
76 * Queue the entry and raise the IPI if needed. 58 * Queue the entry and raise the IPI if needed.
77 */ 59 */
78static void __irq_work_queue(struct irq_work *entry) 60static void __irq_work_queue(struct irq_work *work)
79{ 61{
80 struct irq_work *next; 62 bool empty;
81 63
82 preempt_disable(); 64 preempt_disable();
83 65
84 do { 66 empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89
90 /* The list was empty, raise self-interrupt to start processing. */ 67 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry)) 68 if (empty)
92 arch_irq_work_raise(); 69 arch_irq_work_raise();
93 70
94 preempt_enable(); 71 preempt_enable();
@@ -100,16 +77,16 @@ static void __irq_work_queue(struct irq_work *entry)
100 * 77 *
101 * Can be re-enqueued while the callback is still in progress. 78 * Can be re-enqueued while the callback is still in progress.
102 */ 79 */
103bool irq_work_queue(struct irq_work *entry) 80bool irq_work_queue(struct irq_work *work)
104{ 81{
105 if (!irq_work_claim(entry)) { 82 if (!irq_work_claim(work)) {
106 /* 83 /*
107 * Already enqueued, can't do! 84 * Already enqueued, can't do!
108 */ 85 */
109 return false; 86 return false;
110 } 87 }
111 88
112 __irq_work_queue(entry); 89 __irq_work_queue(work);
113 return true; 90 return true;
114} 91}
115EXPORT_SYMBOL_GPL(irq_work_queue); 92EXPORT_SYMBOL_GPL(irq_work_queue);
@@ -120,34 +97,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
120 */ 97 */
121void irq_work_run(void) 98void irq_work_run(void)
122{ 99{
123 struct irq_work *list; 100 struct irq_work *work;
101 struct llist_head *this_list;
102 struct llist_node *llnode;
124 103
125 if (this_cpu_read(irq_work_list) == NULL) 104 this_list = &__get_cpu_var(irq_work_list);
105 if (llist_empty(this_list))
126 return; 106 return;
127 107
128 BUG_ON(!in_irq()); 108 BUG_ON(!in_irq());
129 BUG_ON(!irqs_disabled()); 109 BUG_ON(!irqs_disabled());
130 110
131 list = this_cpu_xchg(irq_work_list, NULL); 111 llnode = llist_del_all(this_list);
132 112 while (llnode != NULL) {
133 while (list != NULL) { 113 work = llist_entry(llnode, struct irq_work, llnode);
134 struct irq_work *entry = list;
135 114
136 list = irq_work_next(list); 115 llnode = llist_next(llnode);
137 116
138 /* 117 /*
139 * Clear the PENDING bit, after this point the @entry 118 * Clear the PENDING bit, after this point the @work
140 * can be re-used. 119 * can be re-used.
141 */ 120 */
142 entry->next = next_flags(NULL, IRQ_WORK_BUSY); 121 work->flags = IRQ_WORK_BUSY;
143 entry->func(entry); 122 work->func(work);
144 /* 123 /*
145 * Clear the BUSY bit and return to the free state if 124 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile. 125 * no-one else claimed it meanwhile.
147 */ 126 */
148 (void)cmpxchg(&entry->next, 127 (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
151 } 128 }
152} 129}
153EXPORT_SYMBOL_GPL(irq_work_run); 130EXPORT_SYMBOL_GPL(irq_work_run);
@@ -156,11 +133,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
156 * Synchronize against the irq_work @entry, ensures the entry is not 133 * Synchronize against the irq_work @entry, ensures the entry is not
157 * currently in use. 134 * currently in use.
158 */ 135 */
159void irq_work_sync(struct irq_work *entry) 136void irq_work_sync(struct irq_work *work)
160{ 137{
161 WARN_ON_ONCE(irqs_disabled()); 138 WARN_ON_ONCE(irqs_disabled());
162 139
163 while (irq_work_is_set(entry, IRQ_WORK_BUSY)) 140 while (work->flags & IRQ_WORK_BUSY)
164 cpu_relax(); 141 cpu_relax();
165} 142}
166EXPORT_SYMBOL_GPL(irq_work_sync); 143EXPORT_SYMBOL_GPL(irq_work_sync);
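
The llist conversion above changes only the internals: the PENDING/BUSY bits move out of the next pointer into work->flags and the per-cpu queue becomes an llist, while the claim/queue/run/sync contract from the state comment stays the same. A hedged consumer sketch, assuming the usual init_irq_work() helper from <linux/irq_work.h>; the callback body and trigger site are illustrative:

#include <linux/irq_work.h>

/* Runs from irq_work_run(), i.e. in hard interrupt context. */
static void example_irq_work_func(struct irq_work *work)
{
	/* do the part that was not safe in the queueing (e.g. NMI) context */
}

static struct irq_work example_work;

static void example_init(void)
{
	init_irq_work(&example_work, example_irq_work_func);
}

static void example_trigger(void)
{
	/* Safe from NMI/IRQ context; returns false if already pending. */
	irq_work_queue(&example_work);
}
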
diff --git a/kernel/itimer.c b/kernel/itimer.c
index d802883153da..22000c3db0dd 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
52 52
53 cval = it->expires; 53 cval = it->expires;
54 cinterval = it->incr; 54 cinterval = it->incr;
55 if (!cputime_eq(cval, cputime_zero)) { 55 if (cval) {
56 struct task_cputime cputime; 56 struct task_cputime cputime;
57 cputime_t t; 57 cputime_t t;
58 58
59 thread_group_cputimer(tsk, &cputime); 59 thread_group_cputimer(tsk, &cputime);
60 if (clock_id == CPUCLOCK_PROF) 60 if (clock_id == CPUCLOCK_PROF)
61 t = cputime_add(cputime.utime, cputime.stime); 61 t = cputime.utime + cputime.stime;
62 else 62 else
63 /* CPUCLOCK_VIRT */ 63 /* CPUCLOCK_VIRT */
64 t = cputime.utime; 64 t = cputime.utime;
65 65
66 if (cputime_le(cval, t)) 66 if (cval < t)
67 /* about to fire */ 67 /* about to fire */
68 cval = cputime_one_jiffy; 68 cval = cputime_one_jiffy;
69 else 69 else
70 cval = cputime_sub(cval, t); 70 cval = cval - t;
71 } 71 }
72 72
73 spin_unlock_irq(&tsk->sighand->siglock); 73 spin_unlock_irq(&tsk->sighand->siglock);
@@ -161,10 +161,9 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
161 161
162 cval = it->expires; 162 cval = it->expires;
163 cinterval = it->incr; 163 cinterval = it->incr;
164 if (!cputime_eq(cval, cputime_zero) || 164 if (cval || nval) {
165 !cputime_eq(nval, cputime_zero)) { 165 if (nval > 0)
166 if (cputime_gt(nval, cputime_zero)) 166 nval += cputime_one_jiffy;
167 nval = cputime_add(nval, cputime_one_jiffy);
168 set_process_cpu_timer(tsk, clock_id, &nval, &cval); 167 set_process_cpu_timer(tsk, clock_id, &nval, &cval);
169 } 168 }
170 it->expires = nval; 169 it->expires = nval;
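
The itimer hunks are part of the tree-wide cputime cleanup: cputime_t is treated as a plain arithmetic type, so the accessor macros collapse into ordinary operators. The mapping, assuming the old <asm/cputime.h> helper names:

/*
 * old accessor form                  plain-scalar form
 * -----------------------------------------------------
 * cputime_eq(cval, cputime_zero)     cval == 0
 * cputime_add(utime, stime)          utime + stime
 * cputime_sub(cval, t)               cval - t
 * cputime_gt(nval, cputime_zero)     nval > 0
 */
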
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index a8ce45097f3d..01d3b70fc98a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -66,19 +66,53 @@ void jump_label_inc(struct jump_label_key *key)
66 return; 66 return;
67 67
68 jump_label_lock(); 68 jump_label_lock();
69 if (atomic_add_return(1, &key->enabled) == 1) 69 if (atomic_read(&key->enabled) == 0)
70 jump_label_update(key, JUMP_LABEL_ENABLE); 70 jump_label_update(key, JUMP_LABEL_ENABLE);
71 atomic_inc(&key->enabled);
71 jump_label_unlock(); 72 jump_label_unlock();
72} 73}
74EXPORT_SYMBOL_GPL(jump_label_inc);
73 75
74void jump_label_dec(struct jump_label_key *key) 76static void __jump_label_dec(struct jump_label_key *key,
77 unsigned long rate_limit, struct delayed_work *work)
75{ 78{
76 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) 79 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
77 return; 80 return;
78 81
79 jump_label_update(key, JUMP_LABEL_DISABLE); 82 if (rate_limit) {
83 atomic_inc(&key->enabled);
84 schedule_delayed_work(work, rate_limit);
85 } else
86 jump_label_update(key, JUMP_LABEL_DISABLE);
87
80 jump_label_unlock(); 88 jump_label_unlock();
81} 89}
90EXPORT_SYMBOL_GPL(jump_label_dec);
91
92static void jump_label_update_timeout(struct work_struct *work)
93{
94 struct jump_label_key_deferred *key =
95 container_of(work, struct jump_label_key_deferred, work.work);
96 __jump_label_dec(&key->key, 0, NULL);
97}
98
99void jump_label_dec(struct jump_label_key *key)
100{
101 __jump_label_dec(key, 0, NULL);
102}
103
104void jump_label_dec_deferred(struct jump_label_key_deferred *key)
105{
106 __jump_label_dec(&key->key, key->timeout, &key->work);
107}
108
109
110void jump_label_rate_limit(struct jump_label_key_deferred *key,
111 unsigned long rl)
112{
113 key->timeout = rl;
114 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
115}
82 116
83static int addr_conflict(struct jump_entry *entry, void *start, void *end) 117static int addr_conflict(struct jump_entry *entry, void *start, void *end)
84{ 118{
@@ -104,6 +138,18 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
104 return 0; 138 return 0;
105} 139}
106 140
141/*
142 * Update code which is definitely not currently executing.
143 * Architectures which need heavyweight synchronization to modify
144 * running code can override this to make the non-live update case
145 * cheaper.
146 */
147void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
148 enum jump_label_type type)
149{
150 arch_jump_label_transform(entry, type);
151}
152
107static void __jump_label_update(struct jump_label_key *key, 153static void __jump_label_update(struct jump_label_key *key,
108 struct jump_entry *entry, 154 struct jump_entry *entry,
109 struct jump_entry *stop, int enable) 155 struct jump_entry *stop, int enable)
@@ -121,14 +167,7 @@ static void __jump_label_update(struct jump_label_key *key,
121 } 167 }
122} 168}
123 169
124/* 170void __init jump_label_init(void)
125 * Not all archs need this.
126 */
127void __weak arch_jump_label_text_poke_early(jump_label_t addr)
128{
129}
130
131static __init int jump_label_init(void)
132{ 171{
133 struct jump_entry *iter_start = __start___jump_table; 172 struct jump_entry *iter_start = __start___jump_table;
134 struct jump_entry *iter_stop = __stop___jump_table; 173 struct jump_entry *iter_stop = __stop___jump_table;
@@ -139,22 +178,22 @@ static __init int jump_label_init(void)
139 jump_label_sort_entries(iter_start, iter_stop); 178 jump_label_sort_entries(iter_start, iter_stop);
140 179
141 for (iter = iter_start; iter < iter_stop; iter++) { 180 for (iter = iter_start; iter < iter_stop; iter++) {
142 arch_jump_label_text_poke_early(iter->code); 181 struct jump_label_key *iterk;
143 if (iter->key == (jump_label_t)(unsigned long)key) 182
183 iterk = (struct jump_label_key *)(unsigned long)iter->key;
184 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
185 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
186 if (iterk == key)
144 continue; 187 continue;
145 188
146 key = (struct jump_label_key *)(unsigned long)iter->key; 189 key = iterk;
147 atomic_set(&key->enabled, 0);
148 key->entries = iter; 190 key->entries = iter;
149#ifdef CONFIG_MODULES 191#ifdef CONFIG_MODULES
150 key->next = NULL; 192 key->next = NULL;
151#endif 193#endif
152 } 194 }
153 jump_label_unlock(); 195 jump_label_unlock();
154
155 return 0;
156} 196}
157early_initcall(jump_label_init);
158 197
159#ifdef CONFIG_MODULES 198#ifdef CONFIG_MODULES
160 199
@@ -211,8 +250,13 @@ void jump_label_apply_nops(struct module *mod)
211 if (iter_start == iter_stop) 250 if (iter_start == iter_stop)
212 return; 251 return;
213 252
214 for (iter = iter_start; iter < iter_stop; iter++) 253 for (iter = iter_start; iter < iter_stop; iter++) {
215 arch_jump_label_text_poke_early(iter->code); 254 struct jump_label_key *iterk;
255
256 iterk = (struct jump_label_key *)(unsigned long)iter->key;
257 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
258 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
259 }
216} 260}
217 261
218static int jump_label_add_module(struct module *mod) 262static int jump_label_add_module(struct module *mod)
@@ -252,8 +296,7 @@ static int jump_label_add_module(struct module *mod)
252 key->next = jlm; 296 key->next = jlm;
253 297
254 if (jump_label_enabled(key)) 298 if (jump_label_enabled(key))
255 __jump_label_update(key, iter, iter_stop, 299 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
256 JUMP_LABEL_ENABLE);
257 } 300 }
258 301
259 return 0; 302 return 0;
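
The jump_label hunks above hinge on __weak linkage: jump_label.c supplies arch_jump_label_transform_static() as a weak default that simply calls arch_jump_label_transform(), and an architecture may provide a strong definition that wins at link time. A minimal stand-alone sketch of that override pattern, with hypothetical names and GCC attribute syntax rather than kernel code:

#include <stdio.h>

/* Weak default: used only if no strong definition of transform() is linked in. */
__attribute__((weak)) void transform(int entry)
{
        printf("generic transform of entry %d\n", entry);
}

/*
 * An "arch" object file could provide a strong definition instead:
 *
 *      void transform(int entry)
 *      {
 *              printf("arch-specific transform of entry %d\n", entry);
 *      }
 */

int main(void)
{
        transform(42);          /* whichever definition was linked in wins */
        return 0;
}
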
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 296fbc84d659..7b0886786701 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -32,7 +32,6 @@
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
36#include <linux/syscore_ops.h> 35#include <linux/syscore_ops.h>
37 36
38#include <asm/page.h> 37#include <asm/page.h>
@@ -498,7 +497,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
498 while (hole_end <= crashk_res.end) { 497 while (hole_end <= crashk_res.end) {
499 unsigned long i; 498 unsigned long i;
500 499
501 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) 500 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
502 break; 501 break;
503 if (hole_end > crashk_res.end) 502 if (hole_end > crashk_res.end)
504 break; 503 break;
@@ -999,6 +998,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
999 kimage_free(xchg(&kexec_crash_image, NULL)); 998 kimage_free(xchg(&kexec_crash_image, NULL));
1000 result = kimage_crash_alloc(&image, entry, 999 result = kimage_crash_alloc(&image, entry,
1001 nr_segments, segments); 1000 nr_segments, segments);
1001 crash_map_reserved_pages();
1002 } 1002 }
1003 if (result) 1003 if (result)
1004 goto out; 1004 goto out;
@@ -1015,6 +1015,8 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
1015 goto out; 1015 goto out;
1016 } 1016 }
1017 kimage_terminate(image); 1017 kimage_terminate(image);
1018 if (flags & KEXEC_ON_CRASH)
1019 crash_unmap_reserved_pages();
1018 } 1020 }
1019 /* Install the new kernel, and Uninstall the old */ 1021 /* Install the new kernel, and Uninstall the old */
1020 image = xchg(dest_image, image); 1022 image = xchg(dest_image, image);
@@ -1026,6 +1028,18 @@ out:
1026 return result; 1028 return result;
1027} 1029}
1028 1030
1031/*
1032 * Add and remove page tables for crashkernel memory
1033 *
1034 * Provide an empty default implementation here -- architecture
1035 * code may override this
1036 */
1037void __weak crash_map_reserved_pages(void)
1038{}
1039
1040void __weak crash_unmap_reserved_pages(void)
1041{}
1042
1029#ifdef CONFIG_COMPAT 1043#ifdef CONFIG_COMPAT
1030asmlinkage long compat_sys_kexec_load(unsigned long entry, 1044asmlinkage long compat_sys_kexec_load(unsigned long entry,
1031 unsigned long nr_segments, 1045 unsigned long nr_segments,
@@ -1079,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs)
1079 if (kexec_crash_image) { 1093 if (kexec_crash_image) {
1080 struct pt_regs fixed_regs; 1094 struct pt_regs fixed_regs;
1081 1095
1082 kmsg_dump(KMSG_DUMP_KEXEC);
1083
1084 crash_setup_regs(&fixed_regs, regs); 1096 crash_setup_regs(&fixed_regs, regs);
1085 crash_save_vmcoreinfo(); 1097 crash_save_vmcoreinfo();
1086 machine_crash_shutdown(&fixed_regs); 1098 machine_crash_shutdown(&fixed_regs);
@@ -1117,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size)
1117{ 1129{
1118 int ret = 0; 1130 int ret = 0;
1119 unsigned long start, end; 1131 unsigned long start, end;
1132 unsigned long old_size;
1133 struct resource *ram_res;
1120 1134
1121 mutex_lock(&kexec_mutex); 1135 mutex_lock(&kexec_mutex);
1122 1136
@@ -1126,23 +1140,37 @@ int crash_shrink_memory(unsigned long new_size)
1126 } 1140 }
1127 start = crashk_res.start; 1141 start = crashk_res.start;
1128 end = crashk_res.end; 1142 end = crashk_res.end;
1143 old_size = (end == 0) ? 0 : end - start + 1;
1144 if (new_size >= old_size) {
1145 ret = (new_size == old_size) ? 0 : -EINVAL;
1146 goto unlock;
1147 }
1129 1148
1130 if (new_size >= end - start + 1) { 1149 ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
1131 ret = -EINVAL; 1150 if (!ram_res) {
1132 if (new_size == end - start + 1) 1151 ret = -ENOMEM;
1133 ret = 0;
1134 goto unlock; 1152 goto unlock;
1135 } 1153 }
1136 1154
1137 start = roundup(start, PAGE_SIZE); 1155 start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
1138 end = roundup(start + new_size, PAGE_SIZE); 1156 end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
1139 1157
1158 crash_map_reserved_pages();
1140 crash_free_reserved_phys_range(end, crashk_res.end); 1159 crash_free_reserved_phys_range(end, crashk_res.end);
1141 1160
1142 if ((start == end) && (crashk_res.parent != NULL)) 1161 if ((start == end) && (crashk_res.parent != NULL))
1143 release_resource(&crashk_res); 1162 release_resource(&crashk_res);
1163
1164 ram_res->start = end;
1165 ram_res->end = crashk_res.end;
1166 ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1167 ram_res->name = "System RAM";
1168
1144 crashk_res.end = end - 1; 1169 crashk_res.end = end - 1;
1145 1170
1171 insert_resource(&iomem_resource, ram_res);
1172 crash_unmap_reserved_pages();
1173
1146unlock: 1174unlock:
1147 mutex_unlock(&kexec_mutex); 1175 mutex_unlock(&kexec_mutex);
1148 return ret; 1176 return ret;
@@ -1380,24 +1408,23 @@ int __init parse_crashkernel(char *cmdline,
1380} 1408}
1381 1409
1382 1410
1383 1411static void update_vmcoreinfo_note(void)
1384void crash_save_vmcoreinfo(void)
1385{ 1412{
1386 u32 *buf; 1413 u32 *buf = vmcoreinfo_note;
1387 1414
1388 if (!vmcoreinfo_size) 1415 if (!vmcoreinfo_size)
1389 return; 1416 return;
1390
1391 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
1392
1393 buf = (u32 *)vmcoreinfo_note;
1394
1395 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, 1417 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1396 vmcoreinfo_size); 1418 vmcoreinfo_size);
1397
1398 final_note(buf); 1419 final_note(buf);
1399} 1420}
1400 1421
1422void crash_save_vmcoreinfo(void)
1423{
1424 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
1425 update_vmcoreinfo_note();
1426}
1427
1401void vmcoreinfo_append_str(const char *fmt, ...) 1428void vmcoreinfo_append_str(const char *fmt, ...)
1402{ 1429{
1403 va_list args; 1430 va_list args;
@@ -1483,6 +1510,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1483 VMCOREINFO_NUMBER(PG_swapcache); 1510 VMCOREINFO_NUMBER(PG_swapcache);
1484 1511
1485 arch_crash_save_vmcoreinfo(); 1512 arch_crash_save_vmcoreinfo();
1513 update_vmcoreinfo_note();
1486 1514
1487 return 0; 1515 return 0;
1488} 1516}
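
The vmcoreinfo rework above splits note generation into update_vmcoreinfo_note(), which is now also run once at init time. For readers unfamiliar with the layout being built, here is a stand-alone sketch of an ELF note as append_elf_note() assembles it: three 32-bit words (namesz, descsz, type) followed by the 4-byte-padded name and descriptor. The buffer size and payload below are invented.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t *append_note(uint32_t *buf, const char *name,
                             uint32_t type, const void *desc, uint32_t len)
{
        uint32_t namesz = (uint32_t)strlen(name) + 1;

        *buf++ = namesz;                /* Elf_Nhdr.n_namesz */
        *buf++ = len;                   /* Elf_Nhdr.n_descsz */
        *buf++ = type;                  /* Elf_Nhdr.n_type   */
        memcpy(buf, name, namesz);
        buf += (namesz + 3) / 4;        /* name is padded to a 4-byte boundary */
        memcpy(buf, desc, len);
        buf += (len + 3) / 4;           /* so is the descriptor */
        return buf;
}

int main(void)
{
        uint32_t note[64] = { 0 };
        const char data[] = "CRASHTIME=1320000000\n";
        uint32_t *end = append_note(note, "VMCOREINFO", 0, data, sizeof(data));

        printf("note occupies %ld bytes\n", (long)((char *)end - (char *)note));
        return 0;
}
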
@@ -1506,7 +1534,7 @@ int kernel_kexec(void)
1506 1534
1507#ifdef CONFIG_KEXEC_JUMP 1535#ifdef CONFIG_KEXEC_JUMP
1508 if (kexec_image->preserve_context) { 1536 if (kexec_image->preserve_context) {
1509 mutex_lock(&pm_mutex); 1537 lock_system_sleep();
1510 pm_prepare_console(); 1538 pm_prepare_console();
1511 error = freeze_processes(); 1539 error = freeze_processes();
1512 if (error) { 1540 if (error) {
@@ -1559,7 +1587,7 @@ int kernel_kexec(void)
1559 thaw_processes(); 1587 thaw_processes();
1560 Restore_console: 1588 Restore_console:
1561 pm_restore_console(); 1589 pm_restore_console();
1562 mutex_unlock(&pm_mutex); 1590 unlock_system_sleep();
1563 } 1591 }
1564#endif 1592#endif
1565 1593
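
crash_shrink_memory() now refuses anything but a genuine shrink, rounds the kept region to KEXEC_CRASH_MEM_ALIGN, and hands the freed tail back as a "System RAM" resource, all bracketed by the new crash_map/unmap_reserved_pages() hooks. A stand-alone sketch of just the size arithmetic; the addresses, alignment and sizes below are made up, and ROUNDUP stands in for the kernel's roundup():

#include <stdio.h>

#define ROUNDUP(x, y)   ((((x) + (y) - 1) / (y)) * (y))
#define ALIGN_SZ        4096UL          /* stand-in for KEXEC_CRASH_MEM_ALIGN */

int main(void)
{
        unsigned long start    = 0x10000000UL;  /* crashk_res.start (invented) */
        unsigned long orig_end = 0x11ffffffUL;  /* crashk_res.end   (invented) */
        unsigned long old_size = (orig_end == 0) ? 0 : orig_end - start + 1;
        unsigned long new_size = 8UL << 20;     /* shrink to 8 MiB */
        unsigned long end;

        if (new_size >= old_size)               /* growing is not allowed */
                return (new_size == old_size) ? 0 : 1;

        start = ROUNDUP(start, ALIGN_SZ);
        end   = ROUNDUP(start + new_size, ALIGN_SZ);

        printf("crashkernel keeps [%#lx - %#lx]\n", start, end - 1);
        printf("[%#lx - %#lx] goes back to System RAM\n", end, orig_end);
        return 0;
}
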
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 01a0700e873f..c744b88c44e2 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -20,7 +20,7 @@
20 */ 20 */
21 21
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/module.h> 23#include <linux/export.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/log2.h> 26#include <linux/log2.h>
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ddc7644c1305..a0a88543934e 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,6 +36,7 @@
36#include <linux/resource.h> 36#include <linux/resource.h>
37#include <linux/notifier.h> 37#include <linux/notifier.h>
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/rwsem.h>
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40 41
41#include <trace/events/module.h> 42#include <trace/events/module.h>
@@ -50,6 +51,7 @@ static struct workqueue_struct *khelper_wq;
50static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; 51static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
51static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; 52static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
52static DEFINE_SPINLOCK(umh_sysctl_lock); 53static DEFINE_SPINLOCK(umh_sysctl_lock);
54static DECLARE_RWSEM(umhelper_sem);
53 55
54#ifdef CONFIG_MODULES 56#ifdef CONFIG_MODULES
55 57
@@ -114,10 +116,12 @@ int __request_module(bool wait, const char *fmt, ...)
114 atomic_inc(&kmod_concurrent); 116 atomic_inc(&kmod_concurrent);
115 if (atomic_read(&kmod_concurrent) > max_modprobes) { 117 if (atomic_read(&kmod_concurrent) > max_modprobes) {
116 /* We may be blaming an innocent here, but unlikely */ 118 /* We may be blaming an innocent here, but unlikely */
117 if (kmod_loop_msg++ < 5) 119 if (kmod_loop_msg < 5) {
118 printk(KERN_ERR 120 printk(KERN_ERR
119 "request_module: runaway loop modprobe %s\n", 121 "request_module: runaway loop modprobe %s\n",
120 module_name); 122 module_name);
123 kmod_loop_msg++;
124 }
121 atomic_dec(&kmod_concurrent); 125 atomic_dec(&kmod_concurrent);
122 return -ENOMEM; 126 return -ENOMEM;
123 } 127 }
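
The kmod_loop_msg hunk above changes the rate limit so the counter is only bumped when a message is actually emitted; a post-increment in the condition keeps counting on every call and could eventually wrap and start printing again. A stand-alone sketch of the resulting pattern (the name and the limit of five mirror the hunk, the rest is invented):

#include <stdio.h>

static int loop_msg;

static void report_runaway(const char *name)
{
        if (loop_msg < 5) {
                fprintf(stderr, "request_module: runaway loop modprobe %s\n", name);
                loop_msg++;             /* only counts messages actually printed */
        }
}

int main(void)
{
        for (int i = 0; i < 1000; i++)
                report_runaway("dummy_mod");    /* only the first five print */
        return 0;
}
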
@@ -273,6 +277,7 @@ static void __call_usermodehelper(struct work_struct *work)
273 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 277 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
274 * (used for preventing user land processes from being created after the user 278 * (used for preventing user land processes from being created after the user
275 * land has been frozen during a system-wide hibernation or suspend operation). 279 * land has been frozen during a system-wide hibernation or suspend operation).
280 * Should always be manipulated under umhelper_sem acquired for write.
276 */ 281 */
277static int usermodehelper_disabled = 1; 282static int usermodehelper_disabled = 1;
278 283
@@ -280,17 +285,29 @@ static int usermodehelper_disabled = 1;
280static atomic_t running_helpers = ATOMIC_INIT(0); 285static atomic_t running_helpers = ATOMIC_INIT(0);
281 286
282/* 287/*
283 * Wait queue head used by usermodehelper_pm_callback() to wait for all running 288 * Wait queue head used by usermodehelper_disable() to wait for all running
284 * helpers to finish. 289 * helpers to finish.
285 */ 290 */
286static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); 291static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
287 292
288/* 293/*
289 * Time to wait for running_helpers to become zero before the setting of 294 * Time to wait for running_helpers to become zero before the setting of
290 * usermodehelper_disabled in usermodehelper_pm_callback() fails 295 * usermodehelper_disabled in usermodehelper_disable() fails
291 */ 296 */
292#define RUNNING_HELPERS_TIMEOUT (5 * HZ) 297#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
293 298
299void read_lock_usermodehelper(void)
300{
301 down_read(&umhelper_sem);
302}
303EXPORT_SYMBOL_GPL(read_lock_usermodehelper);
304
305void read_unlock_usermodehelper(void)
306{
307 up_read(&umhelper_sem);
308}
309EXPORT_SYMBOL_GPL(read_unlock_usermodehelper);
310
294/** 311/**
295 * usermodehelper_disable - prevent new helpers from being started 312 * usermodehelper_disable - prevent new helpers from being started
296 */ 313 */
@@ -298,8 +315,10 @@ int usermodehelper_disable(void)
298{ 315{
299 long retval; 316 long retval;
300 317
318 down_write(&umhelper_sem);
301 usermodehelper_disabled = 1; 319 usermodehelper_disabled = 1;
302 smp_mb(); 320 up_write(&umhelper_sem);
321
303 /* 322 /*
304 * From now on call_usermodehelper_exec() won't start any new 323 * From now on call_usermodehelper_exec() won't start any new
305 * helpers, so it is sufficient if running_helpers turns out to 324 * helpers, so it is sufficient if running_helpers turns out to
@@ -312,7 +331,9 @@ int usermodehelper_disable(void)
312 if (retval) 331 if (retval)
313 return 0; 332 return 0;
314 333
334 down_write(&umhelper_sem);
315 usermodehelper_disabled = 0; 335 usermodehelper_disabled = 0;
336 up_write(&umhelper_sem);
316 return -EAGAIN; 337 return -EAGAIN;
317} 338}
318 339
@@ -321,7 +342,9 @@ int usermodehelper_disable(void)
321 */ 342 */
322void usermodehelper_enable(void) 343void usermodehelper_enable(void)
323{ 344{
345 down_write(&umhelper_sem);
324 usermodehelper_disabled = 0; 346 usermodehelper_disabled = 0;
347 up_write(&umhelper_sem);
325} 348}
326 349
327/** 350/**
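
The kmod.c changes above replace the smp_mb() with a proper rw_semaphore: usermodehelper_disable()/enable() flip usermodehelper_disabled with umhelper_sem held for write, and the new read_lock_usermodehelper()/read_unlock_usermodehelper() pair lets callers sample the flag knowing it cannot change underneath them. A stand-alone sketch of the same reader/writer discipline using POSIX rwlocks; all names below are invented (build with -lpthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t umh_lock = PTHREAD_RWLOCK_INITIALIZER;
static bool umh_disabled = true;        /* plays the role of usermodehelper_disabled */

static void umh_enable(void)            /* writer: the flag only changes under the write lock */
{
        pthread_rwlock_wrlock(&umh_lock);
        umh_disabled = false;
        pthread_rwlock_unlock(&umh_lock);
}

static bool umh_try_use(void)           /* reader: the flag is stable while the read lock is held */
{
        bool ok;

        pthread_rwlock_rdlock(&umh_lock);
        ok = !umh_disabled;
        if (ok)
                printf("would start a helper here\n");
        pthread_rwlock_unlock(&umh_lock);
        return ok;
}

int main(void)
{
        umh_try_use();                  /* refused: helpers still disabled */
        umh_enable();
        umh_try_use();                  /* allowed */
        return 0;
}
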
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b30fd54eb985..95dd7212e610 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -36,7 +36,7 @@
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/stddef.h> 38#include <linux/stddef.h>
39#include <linux/module.h> 39#include <linux/export.h>
40#include <linux/moduleloader.h> 40#include <linux/moduleloader.h>
41#include <linux/kallsyms.h> 41#include <linux/kallsyms.h>
42#include <linux/freezer.h> 42#include <linux/freezer.h>
@@ -78,10 +78,10 @@ static bool kprobes_all_disarmed;
78static DEFINE_MUTEX(kprobe_mutex); 78static DEFINE_MUTEX(kprobe_mutex);
79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
80static struct { 80static struct {
81 spinlock_t lock ____cacheline_aligned_in_smp; 81 raw_spinlock_t lock ____cacheline_aligned_in_smp;
82} kretprobe_table_locks[KPROBE_TABLE_SIZE]; 82} kretprobe_table_locks[KPROBE_TABLE_SIZE];
83 83
84static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) 84static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
85{ 85{
86 return &(kretprobe_table_locks[hash].lock); 86 return &(kretprobe_table_locks[hash].lock);
87} 87}
@@ -1013,9 +1013,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
1013 hlist_del(&ri->hlist); 1013 hlist_del(&ri->hlist);
1014 INIT_HLIST_NODE(&ri->hlist); 1014 INIT_HLIST_NODE(&ri->hlist);
1015 if (likely(rp)) { 1015 if (likely(rp)) {
1016 spin_lock(&rp->lock); 1016 raw_spin_lock(&rp->lock);
1017 hlist_add_head(&ri->hlist, &rp->free_instances); 1017 hlist_add_head(&ri->hlist, &rp->free_instances);
1018 spin_unlock(&rp->lock); 1018 raw_spin_unlock(&rp->lock);
1019 } else 1019 } else
1020 /* Unregistering */ 1020 /* Unregistering */
1021 hlist_add_head(&ri->hlist, head); 1021 hlist_add_head(&ri->hlist, head);
@@ -1026,19 +1026,19 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
1026__acquires(hlist_lock) 1026__acquires(hlist_lock)
1027{ 1027{
1028 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1028 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
1029 spinlock_t *hlist_lock; 1029 raw_spinlock_t *hlist_lock;
1030 1030
1031 *head = &kretprobe_inst_table[hash]; 1031 *head = &kretprobe_inst_table[hash];
1032 hlist_lock = kretprobe_table_lock_ptr(hash); 1032 hlist_lock = kretprobe_table_lock_ptr(hash);
1033 spin_lock_irqsave(hlist_lock, *flags); 1033 raw_spin_lock_irqsave(hlist_lock, *flags);
1034} 1034}
1035 1035
1036static void __kprobes kretprobe_table_lock(unsigned long hash, 1036static void __kprobes kretprobe_table_lock(unsigned long hash,
1037 unsigned long *flags) 1037 unsigned long *flags)
1038__acquires(hlist_lock) 1038__acquires(hlist_lock)
1039{ 1039{
1040 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1040 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1041 spin_lock_irqsave(hlist_lock, *flags); 1041 raw_spin_lock_irqsave(hlist_lock, *flags);
1042} 1042}
1043 1043
1044void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 1044void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
@@ -1046,18 +1046,18 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
1046__releases(hlist_lock) 1046__releases(hlist_lock)
1047{ 1047{
1048 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1048 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
1049 spinlock_t *hlist_lock; 1049 raw_spinlock_t *hlist_lock;
1050 1050
1051 hlist_lock = kretprobe_table_lock_ptr(hash); 1051 hlist_lock = kretprobe_table_lock_ptr(hash);
1052 spin_unlock_irqrestore(hlist_lock, *flags); 1052 raw_spin_unlock_irqrestore(hlist_lock, *flags);
1053} 1053}
1054 1054
1055static void __kprobes kretprobe_table_unlock(unsigned long hash, 1055static void __kprobes kretprobe_table_unlock(unsigned long hash,
1056 unsigned long *flags) 1056 unsigned long *flags)
1057__releases(hlist_lock) 1057__releases(hlist_lock)
1058{ 1058{
1059 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1059 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1060 spin_unlock_irqrestore(hlist_lock, *flags); 1060 raw_spin_unlock_irqrestore(hlist_lock, *flags);
1061} 1061}
1062 1062
1063/* 1063/*
@@ -1663,12 +1663,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1663 1663
1664 /*TODO: consider to only swap the RA after the last pre_handler fired */ 1664 /*TODO: consider to only swap the RA after the last pre_handler fired */
1665 hash = hash_ptr(current, KPROBE_HASH_BITS); 1665 hash = hash_ptr(current, KPROBE_HASH_BITS);
1666 spin_lock_irqsave(&rp->lock, flags); 1666 raw_spin_lock_irqsave(&rp->lock, flags);
1667 if (!hlist_empty(&rp->free_instances)) { 1667 if (!hlist_empty(&rp->free_instances)) {
1668 ri = hlist_entry(rp->free_instances.first, 1668 ri = hlist_entry(rp->free_instances.first,
1669 struct kretprobe_instance, hlist); 1669 struct kretprobe_instance, hlist);
1670 hlist_del(&ri->hlist); 1670 hlist_del(&ri->hlist);
1671 spin_unlock_irqrestore(&rp->lock, flags); 1671 raw_spin_unlock_irqrestore(&rp->lock, flags);
1672 1672
1673 ri->rp = rp; 1673 ri->rp = rp;
1674 ri->task = current; 1674 ri->task = current;
@@ -1685,7 +1685,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1685 kretprobe_table_unlock(hash, &flags); 1685 kretprobe_table_unlock(hash, &flags);
1686 } else { 1686 } else {
1687 rp->nmissed++; 1687 rp->nmissed++;
1688 spin_unlock_irqrestore(&rp->lock, flags); 1688 raw_spin_unlock_irqrestore(&rp->lock, flags);
1689 } 1689 }
1690 return 0; 1690 return 0;
1691} 1691}
@@ -1721,7 +1721,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1721 rp->maxactive = num_possible_cpus(); 1721 rp->maxactive = num_possible_cpus();
1722#endif 1722#endif
1723 } 1723 }
1724 spin_lock_init(&rp->lock); 1724 raw_spin_lock_init(&rp->lock);
1725 INIT_HLIST_HEAD(&rp->free_instances); 1725 INIT_HLIST_HEAD(&rp->free_instances);
1726 for (i = 0; i < rp->maxactive; i++) { 1726 for (i = 0; i < rp->maxactive; i++) {
1727 inst = kmalloc(sizeof(struct kretprobe_instance) + 1727 inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -1959,7 +1959,7 @@ static int __init init_kprobes(void)
1959 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1959 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1960 INIT_HLIST_HEAD(&kprobe_table[i]); 1960 INIT_HLIST_HEAD(&kprobe_table[i]);
1961 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 1961 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
1962 spin_lock_init(&(kretprobe_table_locks[i].lock)); 1962 raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
1963 } 1963 }
1964 1964
1965 /* 1965 /*
@@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
2198 const char __user *user_buf, size_t count, loff_t *ppos) 2198 const char __user *user_buf, size_t count, loff_t *ppos)
2199{ 2199{
2200 char buf[32]; 2200 char buf[32];
2201 int buf_size; 2201 size_t buf_size;
2202 2202
2203 buf_size = min(count, (sizeof(buf)-1)); 2203 buf_size = min(count, (sizeof(buf)-1));
2204 if (copy_from_user(buf, user_buf, buf_size)) 2204 if (copy_from_user(buf, user_buf, buf_size))
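
The kprobes conversion above turns the per-bucket kretprobe table locks (and rp->lock) into raw_spinlock_t, which remains a spinning lock even on preempt-rt configurations where ordinary spinlock_t can become a sleeping lock. The per-bucket hashing itself is unchanged; a stand-alone sketch of that idea using POSIX spinlocks and an invented toy hash (build with -lpthread):

#include <pthread.h>
#include <stdio.h>

#define TBL_BITS 6
#define TBL_SIZE (1 << TBL_BITS)

static pthread_spinlock_t tbl_locks[TBL_SIZE];

/* Toy hash; the kernel uses hash_ptr(tsk, KPROBE_HASH_BITS) instead. */
static unsigned long bucket_of(const void *p)
{
        return ((unsigned long)p >> 4) & (TBL_SIZE - 1);
}

int main(void)
{
        int obj;
        unsigned long h;

        for (int i = 0; i < TBL_SIZE; i++)
                pthread_spin_init(&tbl_locks[i], PTHREAD_PROCESS_PRIVATE);

        h = bucket_of(&obj);
        pthread_spin_lock(&tbl_locks[h]);       /* one lock per bucket, not one big lock */
        printf("bucket %lu locked\n", h);
        pthread_spin_unlock(&tbl_locks[h]);
        return 0;
}
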
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3b053c04dd86..4e316e1acf58 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -11,10 +11,11 @@
11#include <linux/kobject.h> 11#include <linux/kobject.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/sysfs.h> 13#include <linux/sysfs.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/stat.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/capability.h> 20#include <linux/capability.h>
20 21
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4ba7cccb4994..3d3de633702e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -12,7 +12,7 @@
12#include <linux/cpuset.h> 12#include <linux/cpuset.h>
13#include <linux/unistd.h> 13#include <linux/unistd.h>
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/freezer.h> 18#include <linux/freezer.h>
@@ -59,6 +59,31 @@ int kthread_should_stop(void)
59EXPORT_SYMBOL(kthread_should_stop); 59EXPORT_SYMBOL(kthread_should_stop);
60 60
61/** 61/**
62 * kthread_freezable_should_stop - should this freezable kthread return now?
63 * @was_frozen: optional out parameter, indicates whether %current was frozen
64 *
65 * kthread_should_stop() for freezable kthreads, which will enter
66 * refrigerator if necessary. This function is safe from kthread_stop() /
67 * freezer deadlock and freezable kthreads should use this function instead
68 * of calling try_to_freeze() directly.
69 */
70bool kthread_freezable_should_stop(bool *was_frozen)
71{
72 bool frozen = false;
73
74 might_sleep();
75
76 if (unlikely(freezing(current)))
77 frozen = __refrigerator(true);
78
79 if (was_frozen)
80 *was_frozen = frozen;
81
82 return kthread_should_stop();
83}
84EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
85
86/**
62 * kthread_data - return data value specified on kthread creation 87 * kthread_data - return data value specified on kthread creation
63 * @task: kthread task in question 88 * @task: kthread task in question
64 * 89 *
@@ -257,7 +282,7 @@ int kthreadd(void *unused)
257 set_cpus_allowed_ptr(tsk, cpu_all_mask); 282 set_cpus_allowed_ptr(tsk, cpu_all_mask);
258 set_mems_allowed(node_states[N_HIGH_MEMORY]); 283 set_mems_allowed(node_states[N_HIGH_MEMORY]);
259 284
260 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 285 current->flags |= PF_NOFREEZE;
261 286
262 for (;;) { 287 for (;;) {
263 set_current_state(TASK_INTERRUPTIBLE); 288 set_current_state(TASK_INTERRUPTIBLE);
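
kthread_freezable_should_stop(), added above, gives freezable kthreads a single call that both handles the refrigerator and checks for kthread_stop(). A kernel-style usage sketch, not a buildable module; the thread function, its work and the one-second sleep are invented:

static int my_worker_thread(void *data)
{
        bool was_frozen;

        set_freezable();        /* opt this kthread into the freezer */

        while (!kthread_freezable_should_stop(&was_frozen)) {
                if (was_frozen)
                        pr_info("my_worker: thawed, revalidating state\n");

                /* ... do one unit of work ... */

                schedule_timeout_interruptible(HZ);
        }
        return 0;
}
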
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 376066e10413..a462b317f9a0 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -53,12 +53,12 @@
53#include <linux/notifier.h> 53#include <linux/notifier.h>
54#include <linux/spinlock.h> 54#include <linux/spinlock.h>
55#include <linux/proc_fs.h> 55#include <linux/proc_fs.h>
56#include <linux/module.h> 56#include <linux/export.h>
57#include <linux/sched.h> 57#include <linux/sched.h>
58#include <linux/list.h> 58#include <linux/list.h>
59#include <linux/stacktrace.h> 59#include <linux/stacktrace.h>
60 60
61static DEFINE_SPINLOCK(latency_lock); 61static DEFINE_RAW_SPINLOCK(latency_lock);
62 62
63#define MAXLR 128 63#define MAXLR 128
64static struct latency_record latency_record[MAXLR]; 64static struct latency_record latency_record[MAXLR];
@@ -72,19 +72,19 @@ void clear_all_latency_tracing(struct task_struct *p)
72 if (!latencytop_enabled) 72 if (!latencytop_enabled)
73 return; 73 return;
74 74
75 spin_lock_irqsave(&latency_lock, flags); 75 raw_spin_lock_irqsave(&latency_lock, flags);
76 memset(&p->latency_record, 0, sizeof(p->latency_record)); 76 memset(&p->latency_record, 0, sizeof(p->latency_record));
77 p->latency_record_count = 0; 77 p->latency_record_count = 0;
78 spin_unlock_irqrestore(&latency_lock, flags); 78 raw_spin_unlock_irqrestore(&latency_lock, flags);
79} 79}
80 80
81static void clear_global_latency_tracing(void) 81static void clear_global_latency_tracing(void)
82{ 82{
83 unsigned long flags; 83 unsigned long flags;
84 84
85 spin_lock_irqsave(&latency_lock, flags); 85 raw_spin_lock_irqsave(&latency_lock, flags);
86 memset(&latency_record, 0, sizeof(latency_record)); 86 memset(&latency_record, 0, sizeof(latency_record));
87 spin_unlock_irqrestore(&latency_lock, flags); 87 raw_spin_unlock_irqrestore(&latency_lock, flags);
88} 88}
89 89
90static void __sched 90static void __sched
@@ -190,7 +190,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
190 lat.max = usecs; 190 lat.max = usecs;
191 store_stacktrace(tsk, &lat); 191 store_stacktrace(tsk, &lat);
192 192
193 spin_lock_irqsave(&latency_lock, flags); 193 raw_spin_lock_irqsave(&latency_lock, flags);
194 194
195 account_global_scheduler_latency(tsk, &lat); 195 account_global_scheduler_latency(tsk, &lat);
196 196
@@ -231,7 +231,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); 231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
232 232
233out_unlock: 233out_unlock:
234 spin_unlock_irqrestore(&latency_lock, flags); 234 raw_spin_unlock_irqrestore(&latency_lock, flags);
235} 235}
236 236
237static int lstats_show(struct seq_file *m, void *v) 237static int lstats_show(struct seq_file *m, void *v)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 91d67ce3a8d5..8889f7dd7c46 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -44,6 +44,7 @@
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h> 45#include <linux/bitops.h>
46#include <linux/gfp.h> 46#include <linux/gfp.h>
47#include <linux/kmemcheck.h>
47 48
48#include <asm/sections.h> 49#include <asm/sections.h>
49 50
@@ -96,8 +97,13 @@ static int graph_lock(void)
96 97
97static inline int graph_unlock(void) 98static inline int graph_unlock(void)
98{ 99{
99 if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) 100 if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) {
101 /*
102 * The lockdep graph lock isn't locked while we expect it to
103 * be, we're confused now, bye!
104 */
100 return DEBUG_LOCKS_WARN_ON(1); 105 return DEBUG_LOCKS_WARN_ON(1);
106 }
101 107
102 current->lockdep_recursion--; 108 current->lockdep_recursion--;
103 arch_spin_unlock(&lockdep_lock); 109 arch_spin_unlock(&lockdep_lock);
@@ -134,6 +140,9 @@ static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
134static inline struct lock_class *hlock_class(struct held_lock *hlock) 140static inline struct lock_class *hlock_class(struct held_lock *hlock)
135{ 141{
136 if (!hlock->class_idx) { 142 if (!hlock->class_idx) {
143 /*
144 * Someone passed in garbage, we give up.
145 */
137 DEBUG_LOCKS_WARN_ON(1); 146 DEBUG_LOCKS_WARN_ON(1);
138 return NULL; 147 return NULL;
139 } 148 }
@@ -422,6 +431,7 @@ unsigned int max_lockdep_depth;
422 * about it later on, in lockdep_info(). 431 * about it later on, in lockdep_info().
423 */ 432 */
424static int lockdep_init_error; 433static int lockdep_init_error;
434static const char *lock_init_error;
425static unsigned long lockdep_init_trace_data[20]; 435static unsigned long lockdep_init_trace_data[20];
426static struct stack_trace lockdep_init_trace = { 436static struct stack_trace lockdep_init_trace = {
427 .max_entries = ARRAY_SIZE(lockdep_init_trace_data), 437 .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
@@ -490,36 +500,32 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
490 usage[i] = '\0'; 500 usage[i] = '\0';
491} 501}
492 502
493static int __print_lock_name(struct lock_class *class) 503static void __print_lock_name(struct lock_class *class)
494{ 504{
495 char str[KSYM_NAME_LEN]; 505 char str[KSYM_NAME_LEN];
496 const char *name; 506 const char *name;
497 507
498 name = class->name; 508 name = class->name;
499 if (!name)
500 name = __get_key_name(class->key, str);
501
502 return printk("%s", name);
503}
504
505static void print_lock_name(struct lock_class *class)
506{
507 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
508 const char *name;
509
510 get_usage_chars(class, usage);
511
512 name = class->name;
513 if (!name) { 509 if (!name) {
514 name = __get_key_name(class->key, str); 510 name = __get_key_name(class->key, str);
515 printk(" (%s", name); 511 printk("%s", name);
516 } else { 512 } else {
517 printk(" (%s", name); 513 printk("%s", name);
518 if (class->name_version > 1) 514 if (class->name_version > 1)
519 printk("#%d", class->name_version); 515 printk("#%d", class->name_version);
520 if (class->subclass) 516 if (class->subclass)
521 printk("/%d", class->subclass); 517 printk("/%d", class->subclass);
522 } 518 }
519}
520
521static void print_lock_name(struct lock_class *class)
522{
523 char usage[LOCK_USAGE_CHARS];
524
525 get_usage_chars(class, usage);
526
527 printk(" (");
528 __print_lock_name(class);
523 printk("){%s}", usage); 529 printk("){%s}", usage);
524} 530}
525 531
@@ -559,11 +565,12 @@ static void lockdep_print_held_locks(struct task_struct *curr)
559 } 565 }
560} 566}
561 567
562static void print_kernel_version(void) 568static void print_kernel_ident(void)
563{ 569{
564 printk("%s %.*s\n", init_utsname()->release, 570 printk("%s %.*s %s\n", init_utsname()->release,
565 (int)strcspn(init_utsname()->version, " "), 571 (int)strcspn(init_utsname()->version, " "),
566 init_utsname()->version); 572 init_utsname()->version,
573 print_tainted());
567} 574}
568 575
569static int very_verbose(struct lock_class *class) 576static int very_verbose(struct lock_class *class)
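
print_kernel_ident() above now appends print_tainted() to the banner, so every lockdep splat records whether the kernel was tainted. The "%.*s" plus strcspn() idiom it keeps using prints only the first word of the version string; a stand-alone illustration with invented values:

#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *release = "3.2.0-rc1";
        const char *version = "#1 SMP Tue Nov 8 12:00:00 UTC 2011";
        const char *tainted = "Not tainted";    /* what print_tainted() might return */

        /* %.*s limits the output to the characters before the first space */
        printf("%s %.*s %s\n", release,
               (int)strcspn(version, " "), version, tainted);
        return 0;
}
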
@@ -647,6 +654,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
647 if (unlikely(!lockdep_initialized)) { 654 if (unlikely(!lockdep_initialized)) {
648 lockdep_init(); 655 lockdep_init();
649 lockdep_init_error = 1; 656 lockdep_init_error = 1;
657 lock_init_error = lock->name;
650 save_stack_trace(&lockdep_init_trace); 658 save_stack_trace(&lockdep_init_trace);
651 } 659 }
652#endif 660#endif
@@ -687,6 +695,10 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
687 */ 695 */
688 list_for_each_entry(class, hash_head, hash_entry) { 696 list_for_each_entry(class, hash_head, hash_entry) {
689 if (class->key == key) { 697 if (class->key == key) {
698 /*
699 * Huh! same key, different name? Did someone trample
700 * on some memory? We're most confused.
701 */
690 WARN_ON_ONCE(class->name != lock->name); 702 WARN_ON_ONCE(class->name != lock->name);
691 return class; 703 return class;
692 } 704 }
@@ -710,7 +722,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
710 722
711 class = look_up_lock_class(lock, subclass); 723 class = look_up_lock_class(lock, subclass);
712 if (likely(class)) 724 if (likely(class))
713 return class; 725 goto out_set_class_cache;
714 726
715 /* 727 /*
716 * Debug-check: all keys must be persistent! 728 * Debug-check: all keys must be persistent!
@@ -795,11 +807,16 @@ out_unlock_set:
795 graph_unlock(); 807 graph_unlock();
796 raw_local_irq_restore(flags); 808 raw_local_irq_restore(flags);
797 809
810out_set_class_cache:
798 if (!subclass || force) 811 if (!subclass || force)
799 lock->class_cache[0] = class; 812 lock->class_cache[0] = class;
800 else if (subclass < NR_LOCKDEP_CACHING_CLASSES) 813 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
801 lock->class_cache[subclass] = class; 814 lock->class_cache[subclass] = class;
802 815
816 /*
817 * Hash collision, did we smoke some? We found a class with a matching
818 * hash but the subclass -- which is hashed in -- didn't match.
819 */
803 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) 820 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
804 return NULL; 821 return NULL;
805 822
@@ -926,7 +943,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,
926 unsigned long nr; 943 unsigned long nr;
927 944
928 nr = lock - list_entries; 945 nr = lock - list_entries;
929 WARN_ON(nr >= nr_list_entries); 946 WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
930 lock->parent = parent; 947 lock->parent = parent;
931 lock->class->dep_gen_id = lockdep_dependency_gen_id; 948 lock->class->dep_gen_id = lockdep_dependency_gen_id;
932} 949}
@@ -936,7 +953,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock)
936 unsigned long nr; 953 unsigned long nr;
937 954
938 nr = lock - list_entries; 955 nr = lock - list_entries;
939 WARN_ON(nr >= nr_list_entries); 956 WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
940 return lock->class->dep_gen_id == lockdep_dependency_gen_id; 957 return lock->class->dep_gen_id == lockdep_dependency_gen_id;
941} 958}
942 959
@@ -1129,10 +1146,11 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1129 if (debug_locks_silent) 1146 if (debug_locks_silent)
1130 return 0; 1147 return 0;
1131 1148
1132 printk("\n=======================================================\n"); 1149 printk("\n");
1133 printk( "[ INFO: possible circular locking dependency detected ]\n"); 1150 printk("======================================================\n");
1134 print_kernel_version(); 1151 printk("[ INFO: possible circular locking dependency detected ]\n");
1135 printk( "-------------------------------------------------------\n"); 1152 print_kernel_ident();
1153 printk("-------------------------------------------------------\n");
1136 printk("%s/%d is trying to acquire lock:\n", 1154 printk("%s/%d is trying to acquire lock:\n",
1137 curr->comm, task_pid_nr(curr)); 1155 curr->comm, task_pid_nr(curr));
1138 print_lock(check_src); 1156 print_lock(check_src);
@@ -1196,6 +1214,9 @@ static noinline int print_bfs_bug(int ret)
1196 if (!debug_locks_off_graph_unlock()) 1214 if (!debug_locks_off_graph_unlock())
1197 return 0; 1215 return 0;
1198 1216
1217 /*
1218 * Breadth-first-search failed, graph got corrupted?
1219 */
1199 WARN(1, "lockdep bfs error:%d\n", ret); 1220 WARN(1, "lockdep bfs error:%d\n", ret);
1200 1221
1201 return 0; 1222 return 0;
@@ -1463,11 +1484,12 @@ print_bad_irq_dependency(struct task_struct *curr,
1463 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1484 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1464 return 0; 1485 return 0;
1465 1486
1466 printk("\n======================================================\n"); 1487 printk("\n");
1467 printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", 1488 printk("======================================================\n");
1489 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
1468 irqclass, irqclass); 1490 irqclass, irqclass);
1469 print_kernel_version(); 1491 print_kernel_ident();
1470 printk( "------------------------------------------------------\n"); 1492 printk("------------------------------------------------------\n");
1471 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1493 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1472 curr->comm, task_pid_nr(curr), 1494 curr->comm, task_pid_nr(curr),
1473 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, 1495 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
@@ -1692,10 +1714,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1692 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1714 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1693 return 0; 1715 return 0;
1694 1716
1695 printk("\n=============================================\n"); 1717 printk("\n");
1696 printk( "[ INFO: possible recursive locking detected ]\n"); 1718 printk("=============================================\n");
1697 print_kernel_version(); 1719 printk("[ INFO: possible recursive locking detected ]\n");
1698 printk( "---------------------------------------------\n"); 1720 print_kernel_ident();
1721 printk("---------------------------------------------\n");
1699 printk("%s/%d is trying to acquire lock:\n", 1722 printk("%s/%d is trying to acquire lock:\n",
1700 curr->comm, task_pid_nr(curr)); 1723 curr->comm, task_pid_nr(curr));
1701 print_lock(next); 1724 print_lock(next);
@@ -1944,6 +1967,11 @@ out_bug:
1944 if (!debug_locks_off_graph_unlock()) 1967 if (!debug_locks_off_graph_unlock())
1945 return 0; 1968 return 0;
1946 1969
1970 /*
1971 * Clearly we all shouldn't be here, but since we made it we
1972 * can reliable say we messed up our state. See the above two
1973 * gotos for reasons why we could possibly end up here.
1974 */
1947 WARN_ON(1); 1975 WARN_ON(1);
1948 1976
1949 return 0; 1977 return 0;
@@ -1975,6 +2003,11 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1975 struct held_lock *hlock_curr, *hlock_next; 2003 struct held_lock *hlock_curr, *hlock_next;
1976 int i, j; 2004 int i, j;
1977 2005
2006 /*
2007 * We might need to take the graph lock, ensure we've got IRQs
2008 * disabled to make this an IRQ-safe lock.. for recursion reasons
2009 * lockdep won't complain about its own locking errors.
2010 */
1978 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2011 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1979 return 0; 2012 return 0;
1980 /* 2013 /*
@@ -2126,6 +2159,10 @@ static void check_chain_key(struct task_struct *curr)
2126 hlock = curr->held_locks + i; 2159 hlock = curr->held_locks + i;
2127 if (chain_key != hlock->prev_chain_key) { 2160 if (chain_key != hlock->prev_chain_key) {
2128 debug_locks_off(); 2161 debug_locks_off();
2162 /*
2163 * We got mighty confused, our chain keys don't match
 2164 * with what we expect, someone trampled on our task state?
2165 */
2129 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", 2166 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
2130 curr->lockdep_depth, i, 2167 curr->lockdep_depth, i,
2131 (unsigned long long)chain_key, 2168 (unsigned long long)chain_key,
@@ -2133,6 +2170,9 @@ static void check_chain_key(struct task_struct *curr)
2133 return; 2170 return;
2134 } 2171 }
2135 id = hlock->class_idx - 1; 2172 id = hlock->class_idx - 1;
2173 /*
2174 * Whoops ran out of static storage again?
2175 */
2136 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 2176 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
2137 return; 2177 return;
2138 2178
@@ -2144,6 +2184,10 @@ static void check_chain_key(struct task_struct *curr)
2144 } 2184 }
2145 if (chain_key != curr->curr_chain_key) { 2185 if (chain_key != curr->curr_chain_key) {
2146 debug_locks_off(); 2186 debug_locks_off();
2187 /*
2188 * More smoking hash instead of calculating it, damn see these
2189 * numbers float.. I bet that a pink elephant stepped on my memory.
2190 */
2147 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", 2191 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
2148 curr->lockdep_depth, i, 2192 curr->lockdep_depth, i,
2149 (unsigned long long)chain_key, 2193 (unsigned long long)chain_key,
@@ -2177,10 +2221,11 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2177 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2221 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2178 return 0; 2222 return 0;
2179 2223
2180 printk("\n=================================\n"); 2224 printk("\n");
2181 printk( "[ INFO: inconsistent lock state ]\n"); 2225 printk("=================================\n");
2182 print_kernel_version(); 2226 printk("[ INFO: inconsistent lock state ]\n");
2183 printk( "---------------------------------\n"); 2227 print_kernel_ident();
2228 printk("---------------------------------\n");
2184 2229
2185 printk("inconsistent {%s} -> {%s} usage.\n", 2230 printk("inconsistent {%s} -> {%s} usage.\n",
2186 usage_str[prev_bit], usage_str[new_bit]); 2231 usage_str[prev_bit], usage_str[new_bit]);
@@ -2241,10 +2286,11 @@ print_irq_inversion_bug(struct task_struct *curr,
2241 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2286 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2242 return 0; 2287 return 0;
2243 2288
2244 printk("\n=========================================================\n"); 2289 printk("\n");
2245 printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); 2290 printk("=========================================================\n");
2246 print_kernel_version(); 2291 printk("[ INFO: possible irq lock inversion dependency detected ]\n");
2247 printk( "---------------------------------------------------------\n"); 2292 print_kernel_ident();
2293 printk("---------------------------------------------------------\n");
2248 printk("%s/%d just changed the state of lock:\n", 2294 printk("%s/%d just changed the state of lock:\n",
2249 curr->comm, task_pid_nr(curr)); 2295 curr->comm, task_pid_nr(curr));
2250 print_lock(this); 2296 print_lock(this);
@@ -2525,12 +2571,24 @@ void trace_hardirqs_on_caller(unsigned long ip)
2525 return; 2571 return;
2526 } 2572 }
2527 2573
2574 /*
2575 * We're enabling irqs and according to our state above irqs weren't
2576 * already enabled, yet we find the hardware thinks they are in fact
2577 * enabled.. someone messed up their IRQ state tracing.
2578 */
2528 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2579 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2529 return; 2580 return;
2530 2581
2582 /*
2583 * See the fine text that goes along with this variable definition.
2584 */
2531 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) 2585 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2532 return; 2586 return;
2533 2587
2588 /*
2589 * Can't allow enabling interrupts while in an interrupt handler,
2590 * that's general bad form and such. Recursion, limited stack etc..
2591 */
2534 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) 2592 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2535 return; 2593 return;
2536 2594
@@ -2558,6 +2616,10 @@ void trace_hardirqs_off_caller(unsigned long ip)
2558 if (unlikely(!debug_locks || current->lockdep_recursion)) 2616 if (unlikely(!debug_locks || current->lockdep_recursion))
2559 return; 2617 return;
2560 2618
2619 /*
2620 * So we're supposed to get called after you mask local IRQs, but for
2621 * some reason the hardware doesn't quite think you did a proper job.
2622 */
2561 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2623 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2562 return; 2624 return;
2563 2625
@@ -2590,6 +2652,10 @@ void trace_softirqs_on(unsigned long ip)
2590 if (unlikely(!debug_locks || current->lockdep_recursion)) 2652 if (unlikely(!debug_locks || current->lockdep_recursion))
2591 return; 2653 return;
2592 2654
2655 /*
2656 * We fancy IRQs being disabled here, see softirq.c, avoids
2657 * funny state and nesting things.
2658 */
2593 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2659 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2594 return; 2660 return;
2595 2661
@@ -2626,6 +2692,9 @@ void trace_softirqs_off(unsigned long ip)
2626 if (unlikely(!debug_locks || current->lockdep_recursion)) 2692 if (unlikely(!debug_locks || current->lockdep_recursion))
2627 return; 2693 return;
2628 2694
2695 /*
2696 * We fancy IRQs being disabled here, see softirq.c
2697 */
2629 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2698 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2630 return; 2699 return;
2631 2700
@@ -2637,6 +2706,9 @@ void trace_softirqs_off(unsigned long ip)
2637 curr->softirq_disable_ip = ip; 2706 curr->softirq_disable_ip = ip;
2638 curr->softirq_disable_event = ++curr->irq_events; 2707 curr->softirq_disable_event = ++curr->irq_events;
2639 debug_atomic_inc(softirqs_off_events); 2708 debug_atomic_inc(softirqs_off_events);
2709 /*
2710 * Whoops, we wanted softirqs off, so why aren't they?
2711 */
2640 DEBUG_LOCKS_WARN_ON(!softirq_count()); 2712 DEBUG_LOCKS_WARN_ON(!softirq_count());
2641 } else 2713 } else
2642 debug_atomic_inc(redundant_softirqs_off); 2714 debug_atomic_inc(redundant_softirqs_off);
@@ -2661,6 +2733,9 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2661 if (!(gfp_mask & __GFP_FS)) 2733 if (!(gfp_mask & __GFP_FS))
2662 return; 2734 return;
2663 2735
2736 /*
2737 * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
2738 */
2664 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) 2739 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
2665 return; 2740 return;
2666 2741
@@ -2773,13 +2848,13 @@ static int separate_irq_context(struct task_struct *curr,
2773 return 0; 2848 return 0;
2774} 2849}
2775 2850
2776#else 2851#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
2777 2852
2778static inline 2853static inline
2779int mark_lock_irq(struct task_struct *curr, struct held_lock *this, 2854int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
2780 enum lock_usage_bit new_bit) 2855 enum lock_usage_bit new_bit)
2781{ 2856{
2782 WARN_ON(1); 2857 WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAGS */
2783 return 1; 2858 return 1;
2784} 2859}
2785 2860
@@ -2799,7 +2874,7 @@ void lockdep_trace_alloc(gfp_t gfp_mask)
2799{ 2874{
2800} 2875}
2801 2876
2802#endif 2877#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
2803 2878
2804/* 2879/*
2805 * Mark a lock with a usage bit, and validate the state transition: 2880 * Mark a lock with a usage bit, and validate the state transition:
@@ -2874,12 +2949,20 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2874void lockdep_init_map(struct lockdep_map *lock, const char *name, 2949void lockdep_init_map(struct lockdep_map *lock, const char *name,
2875 struct lock_class_key *key, int subclass) 2950 struct lock_class_key *key, int subclass)
2876{ 2951{
2877 memset(lock, 0, sizeof(*lock)); 2952 int i;
2953
2954 kmemcheck_mark_initialized(lock, sizeof(*lock));
2955
2956 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
2957 lock->class_cache[i] = NULL;
2878 2958
2879#ifdef CONFIG_LOCK_STAT 2959#ifdef CONFIG_LOCK_STAT
2880 lock->cpu = raw_smp_processor_id(); 2960 lock->cpu = raw_smp_processor_id();
2881#endif 2961#endif
2882 2962
2963 /*
2964 * Can't be having no nameless bastards around this place!
2965 */
2883 if (DEBUG_LOCKS_WARN_ON(!name)) { 2966 if (DEBUG_LOCKS_WARN_ON(!name)) {
2884 lock->name = "NULL"; 2967 lock->name = "NULL";
2885 return; 2968 return;
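
lockdep_init_map() above stops memset()ing the whole lockdep_map: it marks the object initialized for kmemcheck and then explicitly clears only the class_cache[] slots it owns. A stand-alone sketch of that field-wise style with an invented struct (the kmemcheck side is not modeled here):

#include <stdio.h>

#define NR_CACHE 2

struct map {
        const char *name;               /* filled in by the caller afterwards */
        void *class_cache[NR_CACHE];
        int cpu;                        /* set separately when stats are on */
};

static void init_map(struct map *m)
{
        /* touch only the fields this init path is responsible for */
        for (int i = 0; i < NR_CACHE; i++)
                m->class_cache[i] = NULL;
}

int main(void)
{
        struct map m = { .cpu = 3 };

        init_map(&m);
        m.name = "demo";
        printf("%s cpu=%d cache0=%p\n", m.name, m.cpu, m.class_cache[0]);
        return 0;
}
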
@@ -2887,6 +2970,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2887 2970
2888 lock->name = name; 2971 lock->name = name;
2889 2972
2973 /*
2974 * No key, no joy, we need to hash something.
2975 */
2890 if (DEBUG_LOCKS_WARN_ON(!key)) 2976 if (DEBUG_LOCKS_WARN_ON(!key))
2891 return; 2977 return;
2892 /* 2978 /*
@@ -2894,6 +2980,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2894 */ 2980 */
2895 if (!static_obj(key)) { 2981 if (!static_obj(key)) {
2896 printk("BUG: key %p not in .data!\n", key); 2982 printk("BUG: key %p not in .data!\n", key);
2983 /*
2984 * What it says above ^^^^^, I suggest you read it.
2985 */
2897 DEBUG_LOCKS_WARN_ON(1); 2986 DEBUG_LOCKS_WARN_ON(1);
2898 return; 2987 return;
2899 } 2988 }
@@ -2932,6 +3021,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2932 if (unlikely(!debug_locks)) 3021 if (unlikely(!debug_locks))
2933 return 0; 3022 return 0;
2934 3023
3024 /*
3025 * Lockdep should run with IRQs disabled, otherwise we could
3026 * get an interrupt which would want to take locks, which would
3027 * end up in lockdep and have you got a head-ache already?
3028 */
2935 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 3029 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2936 return 0; 3030 return 0;
2937 3031
@@ -2963,6 +3057,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2963 * dependency checks are done) 3057 * dependency checks are done)
2964 */ 3058 */
2965 depth = curr->lockdep_depth; 3059 depth = curr->lockdep_depth;
3060 /*
3061 * Ran out of static storage for our per-task lock stack again have we?
3062 */
2966 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) 3063 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
2967 return 0; 3064 return 0;
2968 3065
@@ -2981,6 +3078,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2981 } 3078 }
2982 3079
2983 hlock = curr->held_locks + depth; 3080 hlock = curr->held_locks + depth;
3081 /*
3082 * Plain impossible, we just registered it and checked it weren't no
3083 * NULL like.. I bet this mushroom I ate was good!
3084 */
2984 if (DEBUG_LOCKS_WARN_ON(!class)) 3085 if (DEBUG_LOCKS_WARN_ON(!class))
2985 return 0; 3086 return 0;
2986 hlock->class_idx = class_idx; 3087 hlock->class_idx = class_idx;
@@ -3015,11 +3116,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3015 * the hash, not class->key. 3116 * the hash, not class->key.
3016 */ 3117 */
3017 id = class - lock_classes; 3118 id = class - lock_classes;
3119 /*
3120 * Whoops, we did it again.. ran straight out of our static allocation.
3121 */
3018 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 3122 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
3019 return 0; 3123 return 0;
3020 3124
3021 chain_key = curr->curr_chain_key; 3125 chain_key = curr->curr_chain_key;
3022 if (!depth) { 3126 if (!depth) {
3127 /*
3128 * How can we have a chain hash when we ain't got no keys?!
3129 */
3023 if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) 3130 if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
3024 return 0; 3131 return 0;
3025 chain_head = 1; 3132 chain_head = 1;
@@ -3065,9 +3172,11 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3065 if (debug_locks_silent) 3172 if (debug_locks_silent)
3066 return 0; 3173 return 0;
3067 3174
3068 printk("\n=====================================\n"); 3175 printk("\n");
3069 printk( "[ BUG: bad unlock balance detected! ]\n"); 3176 printk("=====================================\n");
3070 printk( "-------------------------------------\n"); 3177 printk("[ BUG: bad unlock balance detected! ]\n");
3178 print_kernel_ident();
3179 printk("-------------------------------------\n");
3071 printk("%s/%d is trying to release lock (", 3180 printk("%s/%d is trying to release lock (",
3072 curr->comm, task_pid_nr(curr)); 3181 curr->comm, task_pid_nr(curr));
3073 print_lockdep_cache(lock); 3182 print_lockdep_cache(lock);
@@ -3091,6 +3200,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
3091{ 3200{
3092 if (unlikely(!debug_locks)) 3201 if (unlikely(!debug_locks))
3093 return 0; 3202 return 0;
3203 /*
3204 * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
3205 */
3094 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 3206 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
3095 return 0; 3207 return 0;
3096 3208
@@ -3120,6 +3232,11 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
3120 if (!class) 3232 if (!class)
3121 return 0; 3233 return 0;
3122 3234
3235 /*
3236 * References, but not a lock we're actually ref-counting?
3237 * State got messed up, follow the sites that change ->references
3238 * and try to make sense of it.
3239 */
3123 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) 3240 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
3124 return 0; 3241 return 0;
3125 3242
@@ -3142,6 +3259,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3142 int i; 3259 int i;
3143 3260
3144 depth = curr->lockdep_depth; 3261 depth = curr->lockdep_depth;
3262 /*
3263 * This function is about (re)setting the class of a held lock,
3264 * yet we're not actually holding any locks. Naughty user!
3265 */
3145 if (DEBUG_LOCKS_WARN_ON(!depth)) 3266 if (DEBUG_LOCKS_WARN_ON(!depth))
3146 return 0; 3267 return 0;
3147 3268
@@ -3177,6 +3298,10 @@ found_it:
3177 return 0; 3298 return 0;
3178 } 3299 }
3179 3300
3301 /*
3302 * I took it apart and put it back together again, except now I have
3303 * these 'spare' parts.. where shall I put them.
3304 */
3180 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) 3305 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
3181 return 0; 3306 return 0;
3182 return 1; 3307 return 1;
@@ -3201,6 +3326,10 @@ lock_release_non_nested(struct task_struct *curr,
3201 * of held locks: 3326 * of held locks:
3202 */ 3327 */
3203 depth = curr->lockdep_depth; 3328 depth = curr->lockdep_depth;
3329 /*
3330 * So we're all set to release this lock.. wait what lock? We don't
3331 * own any locks, you've been drinking again?
3332 */
3204 if (DEBUG_LOCKS_WARN_ON(!depth)) 3333 if (DEBUG_LOCKS_WARN_ON(!depth))
3205 return 0; 3334 return 0;
3206 3335
@@ -3253,6 +3382,10 @@ found_it:
3253 return 0; 3382 return 0;
3254 } 3383 }
3255 3384
3385 /*
3386 * We had N bottles of beer on the wall, we drank one, but now
3387 * there's not N-1 bottles of beer left on the wall...
3388 */
3256 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) 3389 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
3257 return 0; 3390 return 0;
3258 return 1; 3391 return 1;
@@ -3283,6 +3416,9 @@ static int lock_release_nested(struct task_struct *curr,
3283 return lock_release_non_nested(curr, lock, ip); 3416 return lock_release_non_nested(curr, lock, ip);
3284 curr->lockdep_depth--; 3417 curr->lockdep_depth--;
3285 3418
3419 /*
3420 * No more locks, but somehow we've got hash left over, who left it?
3421 */
3286 if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) 3422 if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
3287 return 0; 3423 return 0;
3288 3424
@@ -3365,10 +3501,13 @@ static void check_flags(unsigned long flags)
3365 * check if not in hardirq contexts: 3501 * check if not in hardirq contexts:
3366 */ 3502 */
3367 if (!hardirq_count()) { 3503 if (!hardirq_count()) {
3368 if (softirq_count()) 3504 if (softirq_count()) {
3505 /* like the above, but with softirqs */
3369 DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); 3506 DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
3370 else 3507 } else {
3508 /* lick the above, does it taste good? */
3371 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); 3509 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
3510 }
3372 } 3511 }
3373 3512
3374 if (!debug_locks) 3513 if (!debug_locks)
@@ -3478,9 +3617,11 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
3478 if (debug_locks_silent) 3617 if (debug_locks_silent)
3479 return 0; 3618 return 0;
3480 3619
3481 printk("\n=================================\n"); 3620 printk("\n");
3482 printk( "[ BUG: bad contention detected! ]\n"); 3621 printk("=================================\n");
3483 printk( "---------------------------------\n"); 3622 printk("[ BUG: bad contention detected! ]\n");
3623 print_kernel_ident();
3624 printk("---------------------------------\n");
3484 printk("%s/%d is trying to contend lock (", 3625 printk("%s/%d is trying to contend lock (",
3485 curr->comm, task_pid_nr(curr)); 3626 curr->comm, task_pid_nr(curr));
3486 print_lockdep_cache(lock); 3627 print_lockdep_cache(lock);
@@ -3506,6 +3647,10 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3506 int i, contention_point, contending_point; 3647 int i, contention_point, contending_point;
3507 3648
3508 depth = curr->lockdep_depth; 3649 depth = curr->lockdep_depth;
3650 /*
3651 * Whee, we contended on this lock, except it seems we're not
3652 * actually trying to acquire anything much at all..
3653 */
3509 if (DEBUG_LOCKS_WARN_ON(!depth)) 3654 if (DEBUG_LOCKS_WARN_ON(!depth))
3510 return; 3655 return;
3511 3656
@@ -3555,6 +3700,10 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3555 int i, cpu; 3700 int i, cpu;
3556 3701
3557 depth = curr->lockdep_depth; 3702 depth = curr->lockdep_depth;
3703 /*
3704 * Yay, we acquired ownership of this lock we didn't try to
3705 * acquire, how the heck did that happen?
3706 */
3558 if (DEBUG_LOCKS_WARN_ON(!depth)) 3707 if (DEBUG_LOCKS_WARN_ON(!depth))
3559 return; 3708 return;
3560 3709
@@ -3759,8 +3908,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3759 match |= class == lock->class_cache[j]; 3908 match |= class == lock->class_cache[j];
3760 3909
3761 if (unlikely(match)) { 3910 if (unlikely(match)) {
3762 if (debug_locks_off_graph_unlock()) 3911 if (debug_locks_off_graph_unlock()) {
3912 /*
3913 * We all just reset everything, how did it match?
3914 */
3763 WARN_ON(1); 3915 WARN_ON(1);
3916 }
3764 goto out_restore; 3917 goto out_restore;
3765 } 3918 }
3766 } 3919 }
@@ -3823,7 +3976,8 @@ void __init lockdep_info(void)
3823 3976
3824#ifdef CONFIG_DEBUG_LOCKDEP 3977#ifdef CONFIG_DEBUG_LOCKDEP
3825 if (lockdep_init_error) { 3978 if (lockdep_init_error) {
3826 printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n"); 3979 printk("WARNING: lockdep init error! lock-%s was acquired"
3980 "before lockdep_init\n", lock_init_error);
3827 printk("Call stack leading to lockdep invocation was:\n"); 3981 printk("Call stack leading to lockdep invocation was:\n");
3828 print_stack_trace(&lockdep_init_trace, 0); 3982 print_stack_trace(&lockdep_init_trace, 0);
3829 } 3983 }
@@ -3839,9 +3993,11 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
3839 if (debug_locks_silent) 3993 if (debug_locks_silent)
3840 return; 3994 return;
3841 3995
3842 printk("\n=========================\n"); 3996 printk("\n");
3843 printk( "[ BUG: held lock freed! ]\n"); 3997 printk("=========================\n");
3844 printk( "-------------------------\n"); 3998 printk("[ BUG: held lock freed! ]\n");
3999 print_kernel_ident();
4000 printk("-------------------------\n");
3845 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 4001 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
3846 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 4002 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
3847 print_lock(hlock); 4003 print_lock(hlock);
@@ -3895,9 +4051,11 @@ static void print_held_locks_bug(struct task_struct *curr)
3895 if (debug_locks_silent) 4051 if (debug_locks_silent)
3896 return; 4052 return;
3897 4053
3898 printk("\n=====================================\n"); 4054 printk("\n");
3899 printk( "[ BUG: lock held at task exit time! ]\n"); 4055 printk("=====================================\n");
3900 printk( "-------------------------------------\n"); 4056 printk("[ BUG: lock held at task exit time! ]\n");
4057 print_kernel_ident();
4058 printk("-------------------------------------\n");
3901 printk("%s/%d is exiting with locks still held!\n", 4059 printk("%s/%d is exiting with locks still held!\n",
3902 curr->comm, task_pid_nr(curr)); 4060 curr->comm, task_pid_nr(curr));
3903 lockdep_print_held_locks(curr); 4061 lockdep_print_held_locks(curr);
@@ -3991,16 +4149,18 @@ void lockdep_sys_exit(void)
3991 if (unlikely(curr->lockdep_depth)) { 4149 if (unlikely(curr->lockdep_depth)) {
3992 if (!debug_locks_off()) 4150 if (!debug_locks_off())
3993 return; 4151 return;
3994 printk("\n================================================\n"); 4152 printk("\n");
3995 printk( "[ BUG: lock held when returning to user space! ]\n"); 4153 printk("================================================\n");
3996 printk( "------------------------------------------------\n"); 4154 printk("[ BUG: lock held when returning to user space! ]\n");
4155 print_kernel_ident();
4156 printk("------------------------------------------------\n");
3997 printk("%s/%d is leaving the kernel with locks still held!\n", 4157 printk("%s/%d is leaving the kernel with locks still held!\n",
3998 curr->comm, curr->pid); 4158 curr->comm, curr->pid);
3999 lockdep_print_held_locks(curr); 4159 lockdep_print_held_locks(curr);
4000 } 4160 }
4001} 4161}
4002 4162
4003void lockdep_rcu_dereference(const char *file, const int line) 4163void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4004{ 4164{
4005 struct task_struct *curr = current; 4165 struct task_struct *curr = current;
4006 4166
@@ -4009,15 +4169,38 @@ void lockdep_rcu_dereference(const char *file, const int line)
4009 return; 4169 return;
4010#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ 4170#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
4011 /* Note: the following can be executed concurrently, so be careful. */ 4171 /* Note: the following can be executed concurrently, so be careful. */
4012 printk("\n===================================================\n"); 4172 printk("\n");
4013 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); 4173 printk("===============================\n");
4014 printk( "---------------------------------------------------\n"); 4174 printk("[ INFO: suspicious RCU usage. ]\n");
4015 printk("%s:%d invoked rcu_dereference_check() without protection!\n", 4175 print_kernel_ident();
4016 file, line); 4176 printk("-------------------------------\n");
4177 printk("%s:%d %s!\n", file, line, s);
4017 printk("\nother info that might help us debug this:\n\n"); 4178 printk("\nother info that might help us debug this:\n\n");
4018 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); 4179 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
4180
4181 /*
 4182	 * If a CPU is in the RCU-free window in idle (i.e. in the section
 4183	 * between rcu_idle_enter() and rcu_idle_exit()), then RCU
 4184	 * considers that CPU to be in an "extended quiescent state",
 4185	 * which means that RCU will be completely ignoring that CPU.
 4186	 * Therefore, rcu_read_lock() and friends have absolutely no
 4187	 * effect on a CPU running in that state. In other words, even if
 4188	 * such an RCU-idle CPU has called rcu_read_lock(), RCU might well
 4189	 * delete data structures out from under it. RCU really has no
 4190	 * choice here: we need to keep an RCU-free window in idle where
 4191	 * the CPU may possibly enter into low power mode. This way, CPUs
 4192	 * that have started a grace period can notice the extended
 4193	 * quiescent state; otherwise we would delay every grace period
 4194	 * for as long as we run in the idle task.
4195 *
4196 * So complain bitterly if someone does call rcu_read_lock(),
4197 * rcu_read_lock_bh() and so on from extended quiescent states.
4198 */
4199 if (rcu_is_cpu_idle())
4200 printk("RCU used illegally from extended quiescent state!\n");
4201
4019 lockdep_print_held_locks(curr); 4202 lockdep_print_held_locks(curr);
4020 printk("\nstack backtrace:\n"); 4203 printk("\nstack backtrace:\n");
4021 dump_stack(); 4204 dump_stack();
4022} 4205}
4023EXPORT_SYMBOL_GPL(lockdep_rcu_dereference); 4206EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
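The comment added to lockdep_rcu_suspicious() above explains that rcu_read_lock() and friends have no effect once a CPU has entered RCU's extended quiescent state. A hypothetical sketch of the misuse that the new rcu_is_cpu_idle() line reports; struct my_conf, global_conf and buggy_idle_path() are invented for the illustration and appear nowhere in this patch:

#include <linux/kernel.h>
#include <linux/rcupdate.h>

struct my_conf {
	int value;
};

static struct my_conf __rcu *global_conf;	/* invented example pointer */

static void buggy_idle_path(void)
{
	struct my_conf *conf;

	rcu_idle_enter();	/* enter the extended quiescent state
				 * (normally only the idle loop does this) */

	rcu_read_lock();	/* has no effect while RCU-idle ... */
	conf = rcu_dereference(global_conf);	/* ... so this is reported as suspicious */
	if (conf)
		pr_info("conf value %d\n", conf->value);	/* may race with a concurrent kfree */
	rcu_read_unlock();

	rcu_idle_exit();	/* leave the extended quiescent state */
}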
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 71edd2f60c02..91c32a0b612c 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -11,7 +11,7 @@
11 * Code for /proc/lockdep and /proc/lockdep_stats: 11 * Code for /proc/lockdep and /proc/lockdep_stats:
12 * 12 *
13 */ 13 */
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/proc_fs.h> 15#include <linux/proc_fs.h>
16#include <linux/seq_file.h> 16#include <linux/seq_file.h>
17#include <linux/kallsyms.h> 17#include <linux/kallsyms.h>
diff --git a/kernel/module.c b/kernel/module.c
index 04379f92f843..acf6ed3ebe81 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -16,7 +16,7 @@
16 along with this program; if not, write to the Free Software 16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/ 18*/
19#include <linux/module.h> 19#include <linux/export.h>
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/ftrace_event.h> 21#include <linux/ftrace_event.h>
22#include <linux/init.h> 22#include <linux/init.h>
@@ -62,12 +62,6 @@
62#define CREATE_TRACE_POINTS 62#define CREATE_TRACE_POINTS
63#include <trace/events/module.h> 63#include <trace/events/module.h>
64 64
65#if 0
66#define DEBUGP printk
67#else
68#define DEBUGP(fmt , a...)
69#endif
70
71#ifndef ARCH_SHF_SMALL 65#ifndef ARCH_SHF_SMALL
72#define ARCH_SHF_SMALL 0 66#define ARCH_SHF_SMALL 0
73#endif 67#endif
@@ -138,7 +132,6 @@ struct load_info {
138 unsigned long len; 132 unsigned long len;
139 Elf_Shdr *sechdrs; 133 Elf_Shdr *sechdrs;
140 char *secstrings, *strtab; 134 char *secstrings, *strtab;
141 unsigned long *strmap;
142 unsigned long symoffs, stroffs; 135 unsigned long symoffs, stroffs;
143 struct _ddebug *debug; 136 struct _ddebug *debug;
144 unsigned int num_debug; 137 unsigned int num_debug;
@@ -410,7 +403,7 @@ const struct kernel_symbol *find_symbol(const char *name,
410 return fsa.sym; 403 return fsa.sym;
411 } 404 }
412 405
413 DEBUGP("Failed to find symbol %s\n", name); 406 pr_debug("Failed to find symbol %s\n", name);
414 return NULL; 407 return NULL;
415} 408}
416EXPORT_SYMBOL_GPL(find_symbol); 409EXPORT_SYMBOL_GPL(find_symbol);
@@ -600,11 +593,11 @@ static int already_uses(struct module *a, struct module *b)
600 593
601 list_for_each_entry(use, &b->source_list, source_list) { 594 list_for_each_entry(use, &b->source_list, source_list) {
602 if (use->source == a) { 595 if (use->source == a) {
603 DEBUGP("%s uses %s!\n", a->name, b->name); 596 pr_debug("%s uses %s!\n", a->name, b->name);
604 return 1; 597 return 1;
605 } 598 }
606 } 599 }
607 DEBUGP("%s does not use %s!\n", a->name, b->name); 600 pr_debug("%s does not use %s!\n", a->name, b->name);
608 return 0; 601 return 0;
609} 602}
610 603
@@ -619,7 +612,7 @@ static int add_module_usage(struct module *a, struct module *b)
619{ 612{
620 struct module_use *use; 613 struct module_use *use;
621 614
622 DEBUGP("Allocating new usage for %s.\n", a->name); 615 pr_debug("Allocating new usage for %s.\n", a->name);
623 use = kmalloc(sizeof(*use), GFP_ATOMIC); 616 use = kmalloc(sizeof(*use), GFP_ATOMIC);
624 if (!use) { 617 if (!use) {
625 printk(KERN_WARNING "%s: out of memory loading\n", a->name); 618 printk(KERN_WARNING "%s: out of memory loading\n", a->name);
@@ -663,7 +656,7 @@ static void module_unload_free(struct module *mod)
663 mutex_lock(&module_mutex); 656 mutex_lock(&module_mutex);
664 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { 657 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
665 struct module *i = use->target; 658 struct module *i = use->target;
666 DEBUGP("%s unusing %s\n", mod->name, i->name); 659 pr_debug("%s unusing %s\n", mod->name, i->name);
667 module_put(i); 660 module_put(i);
668 list_del(&use->source_list); 661 list_del(&use->source_list);
669 list_del(&use->target_list); 662 list_del(&use->target_list);
@@ -726,9 +719,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
726 } 719 }
727} 720}
728 721
729unsigned int module_refcount(struct module *mod) 722unsigned long module_refcount(struct module *mod)
730{ 723{
731 unsigned int incs = 0, decs = 0; 724 unsigned long incs = 0, decs = 0;
732 int cpu; 725 int cpu;
733 726
734 for_each_possible_cpu(cpu) 727 for_each_possible_cpu(cpu)
@@ -761,7 +754,7 @@ static void wait_for_zero_refcount(struct module *mod)
761 /* Since we might sleep for some time, release the mutex first */ 754 /* Since we might sleep for some time, release the mutex first */
762 mutex_unlock(&module_mutex); 755 mutex_unlock(&module_mutex);
763 for (;;) { 756 for (;;) {
764 DEBUGP("Looking at refcount...\n"); 757 pr_debug("Looking at refcount...\n");
765 set_current_state(TASK_UNINTERRUPTIBLE); 758 set_current_state(TASK_UNINTERRUPTIBLE);
766 if (module_refcount(mod) == 0) 759 if (module_refcount(mod) == 0)
767 break; 760 break;
@@ -804,7 +797,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
804 if (mod->state != MODULE_STATE_LIVE) { 797 if (mod->state != MODULE_STATE_LIVE) {
805 /* FIXME: if (force), slam module count and wake up 798 /* FIXME: if (force), slam module count and wake up
806 waiter --RR */ 799 waiter --RR */
807 DEBUGP("%s already dying\n", mod->name); 800 pr_debug("%s already dying\n", mod->name);
808 ret = -EBUSY; 801 ret = -EBUSY;
809 goto out; 802 goto out;
810 } 803 }
@@ -849,12 +842,32 @@ out:
849 return ret; 842 return ret;
850} 843}
851 844
845static size_t module_flags_taint(struct module *mod, char *buf)
846{
847 size_t l = 0;
848
849 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
850 buf[l++] = 'P';
851 if (mod->taints & (1 << TAINT_OOT_MODULE))
852 buf[l++] = 'O';
853 if (mod->taints & (1 << TAINT_FORCED_MODULE))
854 buf[l++] = 'F';
855 if (mod->taints & (1 << TAINT_CRAP))
856 buf[l++] = 'C';
857 /*
858 * TAINT_FORCED_RMMOD: could be added.
859 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
860 * apply to modules.
861 */
862 return l;
863}
864
852static inline void print_unload_info(struct seq_file *m, struct module *mod) 865static inline void print_unload_info(struct seq_file *m, struct module *mod)
853{ 866{
854 struct module_use *use; 867 struct module_use *use;
855 int printed_something = 0; 868 int printed_something = 0;
856 869
857 seq_printf(m, " %u ", module_refcount(mod)); 870 seq_printf(m, " %lu ", module_refcount(mod));
858 871
859 /* Always include a trailing , so userspace can differentiate 872 /* Always include a trailing , so userspace can differentiate
860 between this and the old multi-field proc format. */ 873 between this and the old multi-field proc format. */
@@ -904,13 +917,11 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
904static ssize_t show_refcnt(struct module_attribute *mattr, 917static ssize_t show_refcnt(struct module_attribute *mattr,
905 struct module_kobject *mk, char *buffer) 918 struct module_kobject *mk, char *buffer)
906{ 919{
907 return sprintf(buffer, "%u\n", module_refcount(mk->mod)); 920 return sprintf(buffer, "%lu\n", module_refcount(mk->mod));
908} 921}
909 922
910static struct module_attribute refcnt = { 923static struct module_attribute modinfo_refcnt =
911 .attr = { .name = "refcnt", .mode = 0444 }, 924 __ATTR(refcnt, 0444, show_refcnt, NULL);
912 .show = show_refcnt,
913};
914 925
915void module_put(struct module *module) 926void module_put(struct module *module)
916{ 927{
@@ -970,10 +981,8 @@ static ssize_t show_initstate(struct module_attribute *mattr,
970 return sprintf(buffer, "%s\n", state); 981 return sprintf(buffer, "%s\n", state);
971} 982}
972 983
973static struct module_attribute initstate = { 984static struct module_attribute modinfo_initstate =
974 .attr = { .name = "initstate", .mode = 0444 }, 985 __ATTR(initstate, 0444, show_initstate, NULL);
975 .show = show_initstate,
976};
977 986
978static ssize_t store_uevent(struct module_attribute *mattr, 987static ssize_t store_uevent(struct module_attribute *mattr,
979 struct module_kobject *mk, 988 struct module_kobject *mk,
@@ -986,18 +995,50 @@ static ssize_t store_uevent(struct module_attribute *mattr,
986 return count; 995 return count;
987} 996}
988 997
989struct module_attribute module_uevent = { 998struct module_attribute module_uevent =
990 .attr = { .name = "uevent", .mode = 0200 }, 999 __ATTR(uevent, 0200, NULL, store_uevent);
991 .store = store_uevent, 1000
992}; 1001static ssize_t show_coresize(struct module_attribute *mattr,
1002 struct module_kobject *mk, char *buffer)
1003{
1004 return sprintf(buffer, "%u\n", mk->mod->core_size);
1005}
1006
1007static struct module_attribute modinfo_coresize =
1008 __ATTR(coresize, 0444, show_coresize, NULL);
1009
1010static ssize_t show_initsize(struct module_attribute *mattr,
1011 struct module_kobject *mk, char *buffer)
1012{
1013 return sprintf(buffer, "%u\n", mk->mod->init_size);
1014}
1015
1016static struct module_attribute modinfo_initsize =
1017 __ATTR(initsize, 0444, show_initsize, NULL);
1018
1019static ssize_t show_taint(struct module_attribute *mattr,
1020 struct module_kobject *mk, char *buffer)
1021{
1022 size_t l;
1023
1024 l = module_flags_taint(mk->mod, buffer);
1025 buffer[l++] = '\n';
1026 return l;
1027}
1028
1029static struct module_attribute modinfo_taint =
1030 __ATTR(taint, 0444, show_taint, NULL);
993 1031
994static struct module_attribute *modinfo_attrs[] = { 1032static struct module_attribute *modinfo_attrs[] = {
1033 &module_uevent,
995 &modinfo_version, 1034 &modinfo_version,
996 &modinfo_srcversion, 1035 &modinfo_srcversion,
997 &initstate, 1036 &modinfo_initstate,
998 &module_uevent, 1037 &modinfo_coresize,
1038 &modinfo_initsize,
1039 &modinfo_taint,
999#ifdef CONFIG_MODULE_UNLOAD 1040#ifdef CONFIG_MODULE_UNLOAD
1000 &refcnt, 1041 &modinfo_refcnt,
1001#endif 1042#endif
1002 NULL, 1043 NULL,
1003}; 1044};
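The modinfo attributes added above expose each module's core size, init size and taint letters as files under /sys/module/<name>/. A small user-space sketch that reads them, assuming some module such as ext4 is loaded; show_attr() is a made-up helper, not part of the patch:

#include <stdio.h>

static void show_attr(const char *mod, const char *attr)
{
	char path[128], buf[64];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/module/%s/%s", mod, attr);
	f = fopen(path, "r");
	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("%s/%s: %s", mod, attr, buf);
		fclose(f);
	}
}

int main(void)
{
	show_attr("ext4", "coresize");	/* bytes in the core section */
	show_attr("ext4", "initsize");	/* bytes in the (freed) init section */
	show_attr("ext4", "taint");	/* per-module taint letters, e.g. "O" */
	return 0;
}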
@@ -1057,7 +1098,7 @@ static int check_version(Elf_Shdr *sechdrs,
1057 1098
1058 if (versions[i].crc == maybe_relocated(*crc, crc_owner)) 1099 if (versions[i].crc == maybe_relocated(*crc, crc_owner))
1059 return 1; 1100 return 1;
1060 DEBUGP("Found checksum %lX vs module %lX\n", 1101 pr_debug("Found checksum %lX vs module %lX\n",
1061 maybe_relocated(*crc, crc_owner), versions[i].crc); 1102 maybe_relocated(*crc, crc_owner), versions[i].crc);
1062 goto bad_version; 1103 goto bad_version;
1063 } 1104 }
@@ -1834,7 +1875,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1834 case SHN_COMMON: 1875 case SHN_COMMON:
1835 /* We compiled with -fno-common. These are not 1876 /* We compiled with -fno-common. These are not
1836 supposed to happen. */ 1877 supposed to happen. */
1837 DEBUGP("Common symbol: %s\n", name); 1878 pr_debug("Common symbol: %s\n", name);
1838 printk("%s: please compile with -fno-common\n", 1879 printk("%s: please compile with -fno-common\n",
1839 mod->name); 1880 mod->name);
1840 ret = -ENOEXEC; 1881 ret = -ENOEXEC;
@@ -1842,7 +1883,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1842 1883
1843 case SHN_ABS: 1884 case SHN_ABS:
1844 /* Don't need to do anything */ 1885 /* Don't need to do anything */
1845 DEBUGP("Absolute symbol: 0x%08lx\n", 1886 pr_debug("Absolute symbol: 0x%08lx\n",
1846 (long)sym[i].st_value); 1887 (long)sym[i].st_value);
1847 break; 1888 break;
1848 1889
@@ -1966,7 +2007,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1966 for (i = 0; i < info->hdr->e_shnum; i++) 2007 for (i = 0; i < info->hdr->e_shnum; i++)
1967 info->sechdrs[i].sh_entsize = ~0UL; 2008 info->sechdrs[i].sh_entsize = ~0UL;
1968 2009
1969 DEBUGP("Core section allocation order:\n"); 2010 pr_debug("Core section allocation order:\n");
1970 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 2011 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1971 for (i = 0; i < info->hdr->e_shnum; ++i) { 2012 for (i = 0; i < info->hdr->e_shnum; ++i) {
1972 Elf_Shdr *s = &info->sechdrs[i]; 2013 Elf_Shdr *s = &info->sechdrs[i];
@@ -1978,7 +2019,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1978 || strstarts(sname, ".init")) 2019 || strstarts(sname, ".init"))
1979 continue; 2020 continue;
1980 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 2021 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1981 DEBUGP("\t%s\n", name); 2022 pr_debug("\t%s\n", sname);
1982 } 2023 }
1983 switch (m) { 2024 switch (m) {
1984 case 0: /* executable */ 2025 case 0: /* executable */
@@ -1995,7 +2036,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1995 } 2036 }
1996 } 2037 }
1997 2038
1998 DEBUGP("Init section allocation order:\n"); 2039 pr_debug("Init section allocation order:\n");
1999 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 2040 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
2000 for (i = 0; i < info->hdr->e_shnum; ++i) { 2041 for (i = 0; i < info->hdr->e_shnum; ++i) {
2001 Elf_Shdr *s = &info->sechdrs[i]; 2042 Elf_Shdr *s = &info->sechdrs[i];
@@ -2008,7 +2049,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
2008 continue; 2049 continue;
2009 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 2050 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
2010 | INIT_OFFSET_MASK); 2051 | INIT_OFFSET_MASK);
2011 DEBUGP("\t%s\n", sname); 2052 pr_debug("\t%s\n", sname);
2012 } 2053 }
2013 switch (m) { 2054 switch (m) {
2014 case 0: /* executable */ 2055 case 0: /* executable */
@@ -2178,45 +2219,46 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
2178 return true; 2219 return true;
2179} 2220}
2180 2221
2222/*
2223 * We only allocate and copy the strings needed by the parts of symtab
2224 * we keep. This is simple, but has the effect of making multiple
2225 * copies of duplicates. We could be more sophisticated, see
2226 * linux-kernel thread starting with
2227 * <73defb5e4bca04a6431392cc341112b1@localhost>.
2228 */
2181static void layout_symtab(struct module *mod, struct load_info *info) 2229static void layout_symtab(struct module *mod, struct load_info *info)
2182{ 2230{
2183 Elf_Shdr *symsect = info->sechdrs + info->index.sym; 2231 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
2184 Elf_Shdr *strsect = info->sechdrs + info->index.str; 2232 Elf_Shdr *strsect = info->sechdrs + info->index.str;
2185 const Elf_Sym *src; 2233 const Elf_Sym *src;
2186 unsigned int i, nsrc, ndst; 2234 unsigned int i, nsrc, ndst, strtab_size;
2187 2235
2188 /* Put symbol section at end of init part of module. */ 2236 /* Put symbol section at end of init part of module. */
2189 symsect->sh_flags |= SHF_ALLOC; 2237 symsect->sh_flags |= SHF_ALLOC;
2190 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, 2238 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
2191 info->index.sym) | INIT_OFFSET_MASK; 2239 info->index.sym) | INIT_OFFSET_MASK;
2192 DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); 2240 pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
2193 2241
2194 src = (void *)info->hdr + symsect->sh_offset; 2242 src = (void *)info->hdr + symsect->sh_offset;
2195 nsrc = symsect->sh_size / sizeof(*src); 2243 nsrc = symsect->sh_size / sizeof(*src);
2196 for (ndst = i = 1; i < nsrc; ++i, ++src)
2197 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
2198 unsigned int j = src->st_name;
2199 2244
2200 while (!__test_and_set_bit(j, info->strmap) 2245 /* Compute total space required for the core symbols' strtab. */
2201 && info->strtab[j]) 2246 for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src)
2202 ++j; 2247 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
2203 ++ndst; 2248 strtab_size += strlen(&info->strtab[src->st_name]) + 1;
2249 ndst++;
2204 } 2250 }
2205 2251
2206 /* Append room for core symbols at end of core part. */ 2252 /* Append room for core symbols at end of core part. */
2207 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 2253 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
2208 mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); 2254 info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
2255 mod->core_size += strtab_size;
2209 2256
2210 /* Put string table section at end of init part of module. */ 2257 /* Put string table section at end of init part of module. */
2211 strsect->sh_flags |= SHF_ALLOC; 2258 strsect->sh_flags |= SHF_ALLOC;
2212 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, 2259 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
2213 info->index.str) | INIT_OFFSET_MASK; 2260 info->index.str) | INIT_OFFSET_MASK;
2214 DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); 2261 pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
2215
2216 /* Append room for core symbols' strings at end of core part. */
2217 info->stroffs = mod->core_size;
2218 __set_bit(0, info->strmap);
2219 mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
2220} 2262}
2221 2263
2222static void add_kallsyms(struct module *mod, const struct load_info *info) 2264static void add_kallsyms(struct module *mod, const struct load_info *info)
@@ -2237,22 +2279,19 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
2237 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); 2279 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
2238 2280
2239 mod->core_symtab = dst = mod->module_core + info->symoffs; 2281 mod->core_symtab = dst = mod->module_core + info->symoffs;
2282 mod->core_strtab = s = mod->module_core + info->stroffs;
2240 src = mod->symtab; 2283 src = mod->symtab;
2241 *dst = *src; 2284 *dst = *src;
2285 *s++ = 0;
2242 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2286 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
2243 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) 2287 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
2244 continue; 2288 continue;
2289
2245 dst[ndst] = *src; 2290 dst[ndst] = *src;
2246 dst[ndst].st_name = bitmap_weight(info->strmap, 2291 dst[ndst++].st_name = s - mod->core_strtab;
2247 dst[ndst].st_name); 2292 s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1;
2248 ++ndst;
2249 } 2293 }
2250 mod->core_num_syms = ndst; 2294 mod->core_num_syms = ndst;
2251
2252 mod->core_strtab = s = mod->module_core + info->stroffs;
2253 for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
2254 if (test_bit(i, info->strmap))
2255 *++s = mod->strtab[i];
2256} 2295}
2257#else 2296#else
2258static inline void layout_symtab(struct module *mod, struct load_info *info) 2297static inline void layout_symtab(struct module *mod, struct load_info *info)
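The new comment on layout_symtab() (lines 2222-2228 above) describes the scheme that replaces the strmap bitmap: one pass sums the lengths of the strings belonging to the core symbols, and add_kallsyms() later copies them back to back, recording each string's offset and accepting duplicate copies. A standalone user-space sketch of that measure-then-pack pattern; pack_strings() is an invented name and not kernel code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Pass 1 measures, pass 2 copies; offs[i] records where keep[i] landed. */
static char *pack_strings(const char *const *keep, size_t n, size_t *offs)
{
	size_t i, total = 1;	/* offset 0 stays "", as in an ELF strtab */
	char *tab, *s;

	for (i = 0; i < n; i++)
		total += strlen(keep[i]) + 1;

	tab = malloc(total);
	if (!tab)
		return NULL;

	s = tab;
	*s++ = '\0';
	for (i = 0; i < n; i++) {
		offs[i] = s - tab;
		s += strlen(strcpy(s, keep[i])) + 1;
	}
	return tab;
}

int main(void)
{
	const char *keep[] = { "init_module", "cleanup_module", "init_module" };
	size_t offs[3];
	char *tab = pack_strings(keep, 3, offs);

	if (tab) {
		printf("offsets: %zu %zu %zu\n", offs[0], offs[1], offs[2]);
		printf("second string: %s\n", tab + offs[1]);
		free(tab);
	}
	return 0;
}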
@@ -2487,6 +2526,9 @@ static int check_modinfo(struct module *mod, struct load_info *info)
2487 return -ENOEXEC; 2526 return -ENOEXEC;
2488 } 2527 }
2489 2528
2529 if (!get_modinfo(info, "intree"))
2530 add_taint_module(mod, TAINT_OOT_MODULE);
2531
2490 if (get_modinfo(info, "staging")) { 2532 if (get_modinfo(info, "staging")) {
2491 add_taint_module(mod, TAINT_CRAP); 2533 add_taint_module(mod, TAINT_CRAP);
2492 printk(KERN_WARNING "%s: module is from the staging directory," 2534 printk(KERN_WARNING "%s: module is from the staging directory,"
@@ -2618,7 +2660,7 @@ static int move_module(struct module *mod, struct load_info *info)
2618 mod->module_init = ptr; 2660 mod->module_init = ptr;
2619 2661
2620 /* Transfer each section which specifies SHF_ALLOC */ 2662 /* Transfer each section which specifies SHF_ALLOC */
2621 DEBUGP("final section addresses:\n"); 2663 pr_debug("final section addresses:\n");
2622 for (i = 0; i < info->hdr->e_shnum; i++) { 2664 for (i = 0; i < info->hdr->e_shnum; i++) {
2623 void *dest; 2665 void *dest;
2624 Elf_Shdr *shdr = &info->sechdrs[i]; 2666 Elf_Shdr *shdr = &info->sechdrs[i];
@@ -2636,8 +2678,8 @@ static int move_module(struct module *mod, struct load_info *info)
2636 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); 2678 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
2637 /* Update sh_addr to point to copy in image. */ 2679 /* Update sh_addr to point to copy in image. */
2638 shdr->sh_addr = (unsigned long)dest; 2680 shdr->sh_addr = (unsigned long)dest;
2639 DEBUGP("\t0x%lx %s\n", 2681 pr_debug("\t0x%lx %s\n",
2640 shdr->sh_addr, info->secstrings + shdr->sh_name); 2682 (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
2641 } 2683 }
2642 2684
2643 return 0; 2685 return 0;
@@ -2739,27 +2781,18 @@ static struct module *layout_and_allocate(struct load_info *info)
2739 this is done generically; there doesn't appear to be any 2781 this is done generically; there doesn't appear to be any
2740 special cases for the architectures. */ 2782 special cases for the architectures. */
2741 layout_sections(mod, info); 2783 layout_sections(mod, info);
2742
2743 info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
2744 * sizeof(long), GFP_KERNEL);
2745 if (!info->strmap) {
2746 err = -ENOMEM;
2747 goto free_percpu;
2748 }
2749 layout_symtab(mod, info); 2784 layout_symtab(mod, info);
2750 2785
2751 /* Allocate and move to the final place */ 2786 /* Allocate and move to the final place */
2752 err = move_module(mod, info); 2787 err = move_module(mod, info);
2753 if (err) 2788 if (err)
2754 goto free_strmap; 2789 goto free_percpu;
2755 2790
2756 /* Module has been copied to its final place now: return it. */ 2791 /* Module has been copied to its final place now: return it. */
2757 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2792 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2758 kmemleak_load_module(mod, info); 2793 kmemleak_load_module(mod, info);
2759 return mod; 2794 return mod;
2760 2795
2761free_strmap:
2762 kfree(info->strmap);
2763free_percpu: 2796free_percpu:
2764 percpu_modfree(mod); 2797 percpu_modfree(mod);
2765out: 2798out:
@@ -2769,7 +2802,6 @@ out:
2769/* mod is no longer valid after this! */ 2802/* mod is no longer valid after this! */
2770static void module_deallocate(struct module *mod, struct load_info *info) 2803static void module_deallocate(struct module *mod, struct load_info *info)
2771{ 2804{
2772 kfree(info->strmap);
2773 percpu_modfree(mod); 2805 percpu_modfree(mod);
2774 module_free(mod, mod->module_init); 2806 module_free(mod, mod->module_init);
2775 module_free(mod, mod->module_core); 2807 module_free(mod, mod->module_core);
@@ -2808,7 +2840,7 @@ static struct module *load_module(void __user *umod,
2808 struct module *mod; 2840 struct module *mod;
2809 long err; 2841 long err;
2810 2842
2811 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 2843 pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n",
2812 umod, len, uargs); 2844 umod, len, uargs);
2813 2845
2814 /* Copy in the blobs from userspace, check they are vaguely sane. */ 2846 /* Copy in the blobs from userspace, check they are vaguely sane. */
@@ -2878,8 +2910,7 @@ static struct module *load_module(void __user *umod,
2878 } 2910 }
2879 2911
2880 /* This has to be done once we're sure module name is unique. */ 2912 /* This has to be done once we're sure module name is unique. */
2881 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) 2913 dynamic_debug_setup(info.debug, info.num_debug);
2882 dynamic_debug_setup(info.debug, info.num_debug);
2883 2914
2884 /* Find duplicate symbols */ 2915 /* Find duplicate symbols */
2885 err = verify_export_symbols(mod); 2916 err = verify_export_symbols(mod);
@@ -2900,8 +2931,7 @@ static struct module *load_module(void __user *umod,
2900 if (err < 0) 2931 if (err < 0)
2901 goto unlink; 2932 goto unlink;
2902 2933
2903 /* Get rid of temporary copy and strmap. */ 2934 /* Get rid of temporary copy. */
2904 kfree(info.strmap);
2905 free_copy(&info); 2935 free_copy(&info);
2906 2936
2907 /* Done! */ 2937 /* Done! */
@@ -2915,8 +2945,7 @@ static struct module *load_module(void __user *umod,
2915 module_bug_cleanup(mod); 2945 module_bug_cleanup(mod);
2916 2946
2917 ddebug: 2947 ddebug:
2918 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) 2948 dynamic_debug_remove(info.debug);
2919 dynamic_debug_remove(info.debug);
2920 unlock: 2949 unlock:
2921 mutex_unlock(&module_mutex); 2950 mutex_unlock(&module_mutex);
2922 synchronize_sched(); 2951 synchronize_sched();
@@ -3255,18 +3284,7 @@ static char *module_flags(struct module *mod, char *buf)
3255 mod->state == MODULE_STATE_GOING || 3284 mod->state == MODULE_STATE_GOING ||
3256 mod->state == MODULE_STATE_COMING) { 3285 mod->state == MODULE_STATE_COMING) {
3257 buf[bx++] = '('; 3286 buf[bx++] = '(';
3258 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) 3287 bx += module_flags_taint(mod, buf + bx);
3259 buf[bx++] = 'P';
3260 if (mod->taints & (1 << TAINT_FORCED_MODULE))
3261 buf[bx++] = 'F';
3262 if (mod->taints & (1 << TAINT_CRAP))
3263 buf[bx++] = 'C';
3264 /*
3265 * TAINT_FORCED_RMMOD: could be added.
3266 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
3267 * apply to modules.
3268 */
3269
3270 /* Show a - for module-is-being-unloaded */ 3288 /* Show a - for module-is-being-unloaded */
3271 if (mod->state == MODULE_STATE_GOING) 3289 if (mod->state == MODULE_STATE_GOING)
3272 buf[bx++] = '-'; 3290 buf[bx++] = '-';
@@ -3487,50 +3505,3 @@ void module_layout(struct module *mod,
3487} 3505}
3488EXPORT_SYMBOL(module_layout); 3506EXPORT_SYMBOL(module_layout);
3489#endif 3507#endif
3490
3491#ifdef CONFIG_TRACEPOINTS
3492void module_update_tracepoints(void)
3493{
3494 struct module *mod;
3495
3496 mutex_lock(&module_mutex);
3497 list_for_each_entry(mod, &modules, list)
3498 if (!mod->taints)
3499 tracepoint_update_probe_range(mod->tracepoints_ptrs,
3500 mod->tracepoints_ptrs + mod->num_tracepoints);
3501 mutex_unlock(&module_mutex);
3502}
3503
3504/*
3505 * Returns 0 if current not found.
3506 * Returns 1 if current found.
3507 */
3508int module_get_iter_tracepoints(struct tracepoint_iter *iter)
3509{
3510 struct module *iter_mod;
3511 int found = 0;
3512
3513 mutex_lock(&module_mutex);
3514 list_for_each_entry(iter_mod, &modules, list) {
3515 if (!iter_mod->taints) {
3516 /*
3517 * Sorted module list
3518 */
3519 if (iter_mod < iter->module)
3520 continue;
3521 else if (iter_mod > iter->module)
3522 iter->tracepoint = NULL;
3523 found = tracepoint_get_iter_range(&iter->tracepoint,
3524 iter_mod->tracepoints_ptrs,
3525 iter_mod->tracepoints_ptrs
3526 + iter_mod->num_tracepoints);
3527 if (found) {
3528 iter->module = iter_mod;
3529 break;
3530 }
3531 }
3532 }
3533 mutex_unlock(&module_mutex);
3534 return found;
3535}
3536#endif
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 73da83aff418..7e3443fe1f48 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -14,7 +14,7 @@
14 */ 14 */
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/poison.h> 18#include <linux/poison.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/spinlock.h> 20#include <linux/spinlock.h>
diff --git a/kernel/mutex.c b/kernel/mutex.c
index d607ed5dd441..89096dd8786f 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -19,7 +19,7 @@
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/module.h> 22#include <linux/export.h>
23#include <linux/spinlock.h> 23#include <linux/spinlock.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/debug_locks.h> 25#include <linux/debug_locks.h>
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 8d7b435806c9..2d5cc4ccff7f 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -1,6 +1,6 @@
1#include <linux/kdebug.h> 1#include <linux/kdebug.h>
2#include <linux/kprobes.h> 2#include <linux/kprobes.h>
3#include <linux/module.h> 3#include <linux/export.h>
4#include <linux/notifier.h> 4#include <linux/notifier.h>
5#include <linux/rcupdate.h> 5#include <linux/rcupdate.h>
6#include <linux/vmalloc.h> 6#include <linux/vmalloc.h>
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 9aeab4b98c64..b576f7f14bc6 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,7 +14,7 @@
14 */ 14 */
15 15
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
19#include <linux/init_task.h> 19#include <linux/init_task.h>
20#include <linux/mnt_namespace.h> 20#include <linux/mnt_namespace.h>
diff --git a/kernel/padata.c b/kernel/padata.c
index b91941df5e63..b45259931512 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -18,7 +18,7 @@
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 */ 19 */
20 20
21#include <linux/module.h> 21#include <linux/export.h>
22#include <linux/cpumask.h> 22#include <linux/cpumask.h>
23#include <linux/err.h> 23#include <linux/err.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
diff --git a/kernel/panic.c b/kernel/panic.c
index d7bb6974efb5..80aed44e345a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -49,6 +49,15 @@ static long no_blink(int state)
49long (*panic_blink)(int state); 49long (*panic_blink)(int state);
50EXPORT_SYMBOL(panic_blink); 50EXPORT_SYMBOL(panic_blink);
51 51
52/*
 53 * Stop ourselves in panic -- architecture code may override this
54 */
55void __weak panic_smp_self_stop(void)
56{
57 while (1)
58 cpu_relax();
59}
60
52/** 61/**
53 * panic - halt the system 62 * panic - halt the system
54 * @fmt: The text string to print 63 * @fmt: The text string to print
@@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink);
57 * 66 *
58 * This function never returns. 67 * This function never returns.
59 */ 68 */
60NORET_TYPE void panic(const char * fmt, ...) 69void panic(const char *fmt, ...)
61{ 70{
71 static DEFINE_SPINLOCK(panic_lock);
62 static char buf[1024]; 72 static char buf[1024];
63 va_list args; 73 va_list args;
64 long i, i_next = 0; 74 long i, i_next = 0;
@@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...)
68 * It's possible to come here directly from a panic-assertion and 78 * It's possible to come here directly from a panic-assertion and
69 * not have preempt disabled. Some functions called from here want 79 * not have preempt disabled. Some functions called from here want
70 * preempt to be disabled. No point enabling it later though... 80 * preempt to be disabled. No point enabling it later though...
81 *
82 * Only one CPU is allowed to execute the panic code from here. For
83 * multiple parallel invocations of panic, all other CPUs either
 84 * stop themselves or wait until they are stopped by the 1st CPU
85 * with smp_send_stop().
71 */ 86 */
72 preempt_disable(); 87 if (!spin_trylock(&panic_lock))
88 panic_smp_self_stop();
73 89
74 console_verbose(); 90 console_verbose();
75 bust_spinlocks(1); 91 bust_spinlocks(1);
@@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...)
78 va_end(args); 94 va_end(args);
79 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 95 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
80#ifdef CONFIG_DEBUG_BUGVERBOSE 96#ifdef CONFIG_DEBUG_BUGVERBOSE
81 dump_stack(); 97 /*
98 * Avoid nested stack-dumping if a panic occurs during oops processing
99 */
100 if (!oops_in_progress)
101 dump_stack();
82#endif 102#endif
83 103
84 /* 104 /*
@@ -177,6 +197,7 @@ static const struct tnt tnts[] = {
177 { TAINT_WARN, 'W', ' ' }, 197 { TAINT_WARN, 'W', ' ' },
178 { TAINT_CRAP, 'C', ' ' }, 198 { TAINT_CRAP, 'C', ' ' },
179 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, 199 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
200 { TAINT_OOT_MODULE, 'O', ' ' },
180}; 201};
181 202
182/** 203/**
@@ -194,6 +215,7 @@ static const struct tnt tnts[] = {
194 * 'W' - Taint on warning. 215 * 'W' - Taint on warning.
195 * 'C' - modules from drivers/staging are loaded. 216 * 'C' - modules from drivers/staging are loaded.
196 * 'I' - Working around severe firmware bug. 217 * 'I' - Working around severe firmware bug.
218 * 'O' - Out-of-tree module has been loaded.
197 * 219 *
198 * The string is overwritten by the next call to print_tainted(). 220 * The string is overwritten by the next call to print_tainted().
199 */ 221 */
@@ -235,11 +257,20 @@ void add_taint(unsigned flag)
235 * Can't trust the integrity of the kernel anymore. 257 * Can't trust the integrity of the kernel anymore.
236 * We don't call directly debug_locks_off() because the issue 258 * We don't call directly debug_locks_off() because the issue
237 * is not necessarily serious enough to set oops_in_progress to 1 259 * is not necessarily serious enough to set oops_in_progress to 1
238 * Also we want to keep up lockdep for staging development and 260 * Also we want to keep up lockdep for staging/out-of-tree
239 * post-warning case. 261 * development and post-warning case.
240 */ 262 */
241 if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off()) 263 switch (flag) {
242 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); 264 case TAINT_CRAP:
265 case TAINT_OOT_MODULE:
266 case TAINT_WARN:
267 case TAINT_FIRMWARE_WORKAROUND:
268 break;
269
270 default:
271 if (__debug_locks_off())
272 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
273 }
243 274
244 set_bit(flag, &tainted_mask); 275 set_bit(flag, &tainted_mask);
245} 276}
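panic() above now serialises on a local panic_lock: the first CPU to win the trylock runs the panic path, and every other caller parks itself in panic_smp_self_stop(). A minimal sketch of that first-one-in-wins convention; once_lock, example_halt_self() and example_do_recovery() are placeholder names for the illustration:

#include <linux/spinlock.h>
#include <asm/processor.h>

static DEFINE_SPINLOCK(once_lock);

static void example_do_recovery(void)	/* placeholder body */
{
}

static void example_halt_self(void)	/* placeholder: spin forever */
{
	while (1)
		cpu_relax();
}

static void run_once_on_first_cpu(void)
{
	/*
	 * Whoever wins the trylock owns the recovery path; anyone who
	 * races in afterwards parks itself, mirroring panic_smp_self_stop().
	 * The lock is deliberately never released.
	 */
	if (!spin_trylock(&once_lock))
		example_halt_self();

	example_do_recovery();
}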
diff --git a/kernel/params.c b/kernel/params.c
index 22df3e0d142a..32ee04308285 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,7 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/moduleparam.h> 18#include <linux/module.h>
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/string.h> 20#include <linux/string.h>
21#include <linux/errno.h> 21#include <linux/errno.h>
@@ -25,12 +25,6 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27 27
28#if 0
29#define DEBUGP printk
30#else
31#define DEBUGP(fmt, a...)
32#endif
33
34/* Protects all parameters, and incidentally kmalloced_param list. */ 28/* Protects all parameters, and incidentally kmalloced_param list. */
35static DEFINE_MUTEX(param_lock); 29static DEFINE_MUTEX(param_lock);
36 30
@@ -67,20 +61,27 @@ static void maybe_kfree_parameter(void *param)
67 } 61 }
68} 62}
69 63
70static inline char dash2underscore(char c) 64static char dash2underscore(char c)
71{ 65{
72 if (c == '-') 66 if (c == '-')
73 return '_'; 67 return '_';
74 return c; 68 return c;
75} 69}
76 70
77static inline int parameq(const char *input, const char *paramname) 71bool parameqn(const char *a, const char *b, size_t n)
78{ 72{
79 unsigned int i; 73 size_t i;
80 for (i = 0; dash2underscore(input[i]) == paramname[i]; i++) 74
81 if (input[i] == '\0') 75 for (i = 0; i < n; i++) {
82 return 1; 76 if (dash2underscore(a[i]) != dash2underscore(b[i]))
83 return 0; 77 return false;
78 }
79 return true;
80}
81
82bool parameq(const char *a, const char *b)
83{
84 return parameqn(a, b, strlen(a)+1);
84} 85}
85 86
86static int parse_one(char *param, 87static int parse_one(char *param,
@@ -98,7 +99,7 @@ static int parse_one(char *param,
98 /* No one handled NULL, so do it here. */ 99 /* No one handled NULL, so do it here. */
99 if (!val && params[i].ops->set != param_set_bool) 100 if (!val && params[i].ops->set != param_set_bool)
100 return -EINVAL; 101 return -EINVAL;
101 DEBUGP("They are equal! Calling %p\n", 102 pr_debug("They are equal! Calling %p\n",
102 params[i].ops->set); 103 params[i].ops->set);
103 mutex_lock(&param_lock); 104 mutex_lock(&param_lock);
104 err = params[i].ops->set(val, &params[i]); 105 err = params[i].ops->set(val, &params[i]);
@@ -108,11 +109,11 @@ static int parse_one(char *param,
108 } 109 }
109 110
110 if (handle_unknown) { 111 if (handle_unknown) {
111 DEBUGP("Unknown argument: calling %p\n", handle_unknown); 112 pr_debug("Unknown argument: calling %p\n", handle_unknown);
112 return handle_unknown(param, val); 113 return handle_unknown(param, val);
113 } 114 }
114 115
115 DEBUGP("Unknown argument `%s'\n", param); 116 pr_debug("Unknown argument `%s'\n", param);
116 return -ENOENT; 117 return -ENOENT;
117} 118}
118 119
@@ -177,7 +178,7 @@ int parse_args(const char *name,
177{ 178{
178 char *param, *val; 179 char *param, *val;
179 180
180 DEBUGP("Parsing ARGS: %s\n", args); 181 pr_debug("Parsing ARGS: %s\n", args);
181 182
182 /* Chew leading spaces */ 183 /* Chew leading spaces */
183 args = skip_spaces(args); 184 args = skip_spaces(args);
@@ -362,6 +363,30 @@ struct kernel_param_ops param_ops_invbool = {
362}; 363};
363EXPORT_SYMBOL(param_ops_invbool); 364EXPORT_SYMBOL(param_ops_invbool);
364 365
366int param_set_bint(const char *val, const struct kernel_param *kp)
367{
368 struct kernel_param boolkp;
369 bool v;
370 int ret;
371
372 /* Match bool exactly, by re-using it. */
373 boolkp = *kp;
374 boolkp.arg = &v;
375 boolkp.flags |= KPARAM_ISBOOL;
376
377 ret = param_set_bool(val, &boolkp);
378 if (ret == 0)
379 *(int *)kp->arg = v;
380 return ret;
381}
382EXPORT_SYMBOL(param_set_bint);
383
384struct kernel_param_ops param_ops_bint = {
385 .set = param_set_bint,
386 .get = param_get_int,
387};
388EXPORT_SYMBOL(param_ops_bint);
389
365/* We break the rule and mangle the string. */ 390/* We break the rule and mangle the string. */
366static int param_array(const char *name, 391static int param_array(const char *name,
367 const char *val, 392 const char *val,
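parameq() and parameqn() above treat '-' and '_' as interchangeable when matching parameter names, so "foo-bar" and "foo_bar" select the same parameter. A standalone user-space demo of that matching rule; parameq_demo() is a reimplementation written only for this illustration:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static char dash2underscore(char c)
{
	return c == '-' ? '_' : c;
}

static bool parameq_demo(const char *a, const char *b)
{
	size_t i, n = strlen(a) + 1;	/* compare the terminating NUL too */

	for (i = 0; i < n; i++)
		if (dash2underscore(a[i]) != dash2underscore(b[i]))
			return false;
	return true;
}

int main(void)
{
	printf("%d\n", parameq_demo("foo-bar", "foo_bar"));	/* prints 1 */
	printf("%d\n", parameq_demo("foo-bar", "foo_baz"));	/* prints 0 */
	return 0;
}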
diff --git a/kernel/pid.c b/kernel/pid.c
index e432057f3b21..ce8e00deaccb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -27,7 +27,7 @@
27 */ 27 */
28 28
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/module.h> 30#include <linux/export.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/rculist.h> 33#include <linux/rculist.h>
@@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b)
137} 137}
138 138
139/* 139/*
140 * We might be racing with someone else trying to set pid_ns->last_pid. 140 * We might be racing with someone else trying to set pid_ns->last_pid
141 * at the pid allocation time (there's also a sysctl for this, but racing
142 * with this one is OK, see comment in kernel/pid_namespace.c about it).
141 * We want the winner to have the "later" value, because if the 143 * We want the winner to have the "later" value, because if the
142 * "earlier" value prevails, then a pid may get reused immediately. 144 * "earlier" value prevails, then a pid may get reused immediately.
143 * 145 *
@@ -418,7 +420,9 @@ EXPORT_SYMBOL(pid_task);
418 */ 420 */
419struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 421struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
420{ 422{
421 rcu_lockdep_assert(rcu_read_lock_held()); 423 rcu_lockdep_assert(rcu_read_lock_held(),
424 "find_task_by_pid_ns() needs rcu_read_lock()"
425 " protection");
422 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 426 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
423} 427}
424 428
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index e9c9adc84ca6..a8968396046d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
191 return; 191 return;
192} 192}
193 193
194static int pid_ns_ctl_handler(struct ctl_table *table, int write,
195 void __user *buffer, size_t *lenp, loff_t *ppos)
196{
197 struct ctl_table tmp = *table;
198
199 if (write && !capable(CAP_SYS_ADMIN))
200 return -EPERM;
201
202 /*
203 * Writing directly to ns' last_pid field is OK, since this field
 204 * is volatile in a living namespace anyway, and code writing to
205 * it should synchronize its usage with external means.
206 */
207
208 tmp.data = &current->nsproxy->pid_ns->last_pid;
209 return proc_dointvec(&tmp, write, buffer, lenp, ppos);
210}
211
212static struct ctl_table pid_ns_ctl_table[] = {
213 {
214 .procname = "ns_last_pid",
215 .maxlen = sizeof(int),
216 .mode = 0666, /* permissions are checked in the handler */
217 .proc_handler = pid_ns_ctl_handler,
218 },
219 { }
220};
221
222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
223
194static __init int pid_namespaces_init(void) 224static __init int pid_namespaces_init(void)
195{ 225{
196 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 226 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
227 register_sysctl_paths(kern_path, pid_ns_ctl_table);
197 return 0; 228 return 0;
198} 229}
199 230
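The ns_last_pid sysctl registered above lets a privileged process seed PID allocation in its namespace (the checkpoint/restore use case). A user-space usage sketch, assuming CAP_SYS_ADMIN: after writing N to /proc/sys/kernel/ns_last_pid, the next task forked in that namespace should normally receive PID N+1, barring races with concurrent forks:

#include <fcntl.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/kernel/ns_last_pid", O_WRONLY);

	if (fd < 0) {
		perror("open ns_last_pid");
		return 1;
	}
	if (write(fd, "9999", 4) != 4)	/* next child should get pid 10000 */
		perror("write ns_last_pid");
	close(fd);

	if (fork() == 0) {
		printf("child pid: %d\n", getpid());
		_exit(0);
	}
	wait(NULL);
	return 0;
}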
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 640ded8f5c48..125cb67daa21 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -78,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock,
78 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 78 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
79 return now.sched < then.sched; 79 return now.sched < then.sched;
80 } else { 80 } else {
81 return cputime_lt(now.cpu, then.cpu); 81 return now.cpu < then.cpu;
82 } 82 }
83} 83}
84static inline void cpu_time_add(const clockid_t which_clock, 84static inline void cpu_time_add(const clockid_t which_clock,
@@ -88,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock,
88 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 88 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
89 acc->sched += val.sched; 89 acc->sched += val.sched;
90 } else { 90 } else {
91 acc->cpu = cputime_add(acc->cpu, val.cpu); 91 acc->cpu += val.cpu;
92 } 92 }
93} 93}
94static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, 94static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
@@ -98,25 +98,12 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
98 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 98 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
99 a.sched -= b.sched; 99 a.sched -= b.sched;
100 } else { 100 } else {
101 a.cpu = cputime_sub(a.cpu, b.cpu); 101 a.cpu -= b.cpu;
102 } 102 }
103 return a; 103 return a;
104} 104}
105 105
106/* 106/*
107 * Divide and limit the result to res >= 1
108 *
109 * This is necessary to prevent signal delivery starvation, when the result of
110 * the division would be rounded down to 0.
111 */
112static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
113{
114 cputime_t res = cputime_div(time, div);
115
116 return max_t(cputime_t, res, 1);
117}
118
119/*
120 * Update expiry time from increment, and increase overrun count, 107 * Update expiry time from increment, and increase overrun count,
121 * given the current clock sample. 108 * given the current clock sample.
122 */ 109 */
@@ -148,28 +135,26 @@ static void bump_cpu_timer(struct k_itimer *timer,
148 } else { 135 } else {
149 cputime_t delta, incr; 136 cputime_t delta, incr;
150 137
151 if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) 138 if (now.cpu < timer->it.cpu.expires.cpu)
152 return; 139 return;
153 incr = timer->it.cpu.incr.cpu; 140 incr = timer->it.cpu.incr.cpu;
154 delta = cputime_sub(cputime_add(now.cpu, incr), 141 delta = now.cpu + incr - timer->it.cpu.expires.cpu;
155 timer->it.cpu.expires.cpu);
156 /* Don't use (incr*2 < delta), incr*2 might overflow. */ 142 /* Don't use (incr*2 < delta), incr*2 might overflow. */
157 for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) 143 for (i = 0; incr < delta - incr; i++)
158 incr = cputime_add(incr, incr); 144 incr += incr;
159 for (; i >= 0; incr = cputime_halve(incr), i--) { 145 for (; i >= 0; incr = incr >> 1, i--) {
160 if (cputime_lt(delta, incr)) 146 if (delta < incr)
161 continue; 147 continue;
162 timer->it.cpu.expires.cpu = 148 timer->it.cpu.expires.cpu += incr;
163 cputime_add(timer->it.cpu.expires.cpu, incr);
164 timer->it_overrun += 1 << i; 149 timer->it_overrun += 1 << i;
165 delta = cputime_sub(delta, incr); 150 delta -= incr;
166 } 151 }
167 } 152 }
168} 153}
169 154
170static inline cputime_t prof_ticks(struct task_struct *p) 155static inline cputime_t prof_ticks(struct task_struct *p)
171{ 156{
172 return cputime_add(p->utime, p->stime); 157 return p->utime + p->stime;
173} 158}
174static inline cputime_t virt_ticks(struct task_struct *p) 159static inline cputime_t virt_ticks(struct task_struct *p)
175{ 160{
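With cputime_t handled through plain arithmetic, the loop in bump_cpu_timer() above reads more directly: it grows the increment by repeated doubling (avoiding both a step-per-period walk and an incr*2 overflow) and then walks back down, accumulating the overrun count. A standalone sketch of the same technique; bump() and the sample numbers are illustrative only:

#include <stdio.h>

static unsigned long long bump(unsigned long long *expires,
			       unsigned long long now,
			       unsigned long long incr)
{
	unsigned long long delta, step = incr, overrun = 0;
	int i;

	if (!incr || now < *expires)
		return 0;

	delta = now + incr - *expires;
	/* Don't test (step * 2 < delta): step * 2 might overflow. */
	for (i = 0; step < delta - step; i++)
		step += step;
	for (; i >= 0; step >>= 1, i--) {
		if (delta < step)
			continue;
		*expires += step;
		overrun += 1ULL << i;
		delta -= step;
	}
	return overrun;
}

int main(void)
{
	unsigned long long expires = 100;
	unsigned long long overruns = bump(&expires, 1000, 30);

	printf("%llu overruns, new expiry %llu\n", overruns, expires);
	return 0;
}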
@@ -248,8 +233,8 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
248 233
249 t = tsk; 234 t = tsk;
250 do { 235 do {
251 times->utime = cputime_add(times->utime, t->utime); 236 times->utime += t->utime;
252 times->stime = cputime_add(times->stime, t->stime); 237 times->stime += t->stime;
253 times->sum_exec_runtime += task_sched_runtime(t); 238 times->sum_exec_runtime += task_sched_runtime(t);
254 } while_each_thread(tsk, t); 239 } while_each_thread(tsk, t);
255out: 240out:
@@ -258,10 +243,10 @@ out:
258 243
259static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 244static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
260{ 245{
261 if (cputime_gt(b->utime, a->utime)) 246 if (b->utime > a->utime)
262 a->utime = b->utime; 247 a->utime = b->utime;
263 248
264 if (cputime_gt(b->stime, a->stime)) 249 if (b->stime > a->stime)
265 a->stime = b->stime; 250 a->stime = b->stime;
266 251
267 if (b->sum_exec_runtime > a->sum_exec_runtime) 252 if (b->sum_exec_runtime > a->sum_exec_runtime)
@@ -282,13 +267,13 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
282 * it. 267 * it.
283 */ 268 */
284 thread_group_cputime(tsk, &sum); 269 thread_group_cputime(tsk, &sum);
285 spin_lock_irqsave(&cputimer->lock, flags); 270 raw_spin_lock_irqsave(&cputimer->lock, flags);
286 cputimer->running = 1; 271 cputimer->running = 1;
287 update_gt_cputime(&cputimer->cputime, &sum); 272 update_gt_cputime(&cputimer->cputime, &sum);
288 } else 273 } else
289 spin_lock_irqsave(&cputimer->lock, flags); 274 raw_spin_lock_irqsave(&cputimer->lock, flags);
290 *times = cputimer->cputime; 275 *times = cputimer->cputime;
291 spin_unlock_irqrestore(&cputimer->lock, flags); 276 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
292} 277}
293 278
294/* 279/*
@@ -306,7 +291,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
306 return -EINVAL; 291 return -EINVAL;
307 case CPUCLOCK_PROF: 292 case CPUCLOCK_PROF:
308 thread_group_cputime(p, &cputime); 293 thread_group_cputime(p, &cputime);
309 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 294 cpu->cpu = cputime.utime + cputime.stime;
310 break; 295 break;
311 case CPUCLOCK_VIRT: 296 case CPUCLOCK_VIRT:
312 thread_group_cputime(p, &cputime); 297 thread_group_cputime(p, &cputime);
@@ -470,26 +455,24 @@ static void cleanup_timers(struct list_head *head,
470 unsigned long long sum_exec_runtime) 455 unsigned long long sum_exec_runtime)
471{ 456{
472 struct cpu_timer_list *timer, *next; 457 struct cpu_timer_list *timer, *next;
473 cputime_t ptime = cputime_add(utime, stime); 458 cputime_t ptime = utime + stime;
474 459
475 list_for_each_entry_safe(timer, next, head, entry) { 460 list_for_each_entry_safe(timer, next, head, entry) {
476 list_del_init(&timer->entry); 461 list_del_init(&timer->entry);
477 if (cputime_lt(timer->expires.cpu, ptime)) { 462 if (timer->expires.cpu < ptime) {
478 timer->expires.cpu = cputime_zero; 463 timer->expires.cpu = 0;
479 } else { 464 } else {
480 timer->expires.cpu = cputime_sub(timer->expires.cpu, 465 timer->expires.cpu -= ptime;
481 ptime);
482 } 466 }
483 } 467 }
484 468
485 ++head; 469 ++head;
486 list_for_each_entry_safe(timer, next, head, entry) { 470 list_for_each_entry_safe(timer, next, head, entry) {
487 list_del_init(&timer->entry); 471 list_del_init(&timer->entry);
488 if (cputime_lt(timer->expires.cpu, utime)) { 472 if (timer->expires.cpu < utime) {
489 timer->expires.cpu = cputime_zero; 473 timer->expires.cpu = 0;
490 } else { 474 } else {
491 timer->expires.cpu = cputime_sub(timer->expires.cpu, 475 timer->expires.cpu -= utime;
492 utime);
493 } 476 }
494 } 477 }
495 478
@@ -520,8 +503,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
520 struct signal_struct *const sig = tsk->signal; 503 struct signal_struct *const sig = tsk->signal;
521 504
522 cleanup_timers(tsk->signal->cpu_timers, 505 cleanup_timers(tsk->signal->cpu_timers,
523 cputime_add(tsk->utime, sig->utime), 506 tsk->utime + sig->utime, tsk->stime + sig->stime,
524 cputime_add(tsk->stime, sig->stime),
525 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 507 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
526} 508}
527 509
@@ -540,8 +522,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
540 522
541static inline int expires_gt(cputime_t expires, cputime_t new_exp) 523static inline int expires_gt(cputime_t expires, cputime_t new_exp)
542{ 524{
543 return cputime_eq(expires, cputime_zero) || 525 return expires == 0 || expires > new_exp;
544 cputime_gt(expires, new_exp);
545} 526}
546 527
547/* 528/*
@@ -651,7 +632,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
651 default: 632 default:
652 return -EINVAL; 633 return -EINVAL;
653 case CPUCLOCK_PROF: 634 case CPUCLOCK_PROF:
654 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 635 cpu->cpu = cputime.utime + cputime.stime;
655 break; 636 break;
656 case CPUCLOCK_VIRT: 637 case CPUCLOCK_VIRT:
657 cpu->cpu = cputime.utime; 638 cpu->cpu = cputime.utime;
@@ -918,12 +899,12 @@ static void check_thread_timers(struct task_struct *tsk,
918 unsigned long soft; 899 unsigned long soft;
919 900
920 maxfire = 20; 901 maxfire = 20;
921 tsk->cputime_expires.prof_exp = cputime_zero; 902 tsk->cputime_expires.prof_exp = 0;
922 while (!list_empty(timers)) { 903 while (!list_empty(timers)) {
923 struct cpu_timer_list *t = list_first_entry(timers, 904 struct cpu_timer_list *t = list_first_entry(timers,
924 struct cpu_timer_list, 905 struct cpu_timer_list,
925 entry); 906 entry);
926 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { 907 if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
927 tsk->cputime_expires.prof_exp = t->expires.cpu; 908 tsk->cputime_expires.prof_exp = t->expires.cpu;
928 break; 909 break;
929 } 910 }
@@ -933,12 +914,12 @@ static void check_thread_timers(struct task_struct *tsk,
933 914
934 ++timers; 915 ++timers;
935 maxfire = 20; 916 maxfire = 20;
936 tsk->cputime_expires.virt_exp = cputime_zero; 917 tsk->cputime_expires.virt_exp = 0;
937 while (!list_empty(timers)) { 918 while (!list_empty(timers)) {
938 struct cpu_timer_list *t = list_first_entry(timers, 919 struct cpu_timer_list *t = list_first_entry(timers,
939 struct cpu_timer_list, 920 struct cpu_timer_list,
940 entry); 921 entry);
941 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { 922 if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
942 tsk->cputime_expires.virt_exp = t->expires.cpu; 923 tsk->cputime_expires.virt_exp = t->expires.cpu;
943 break; 924 break;
944 } 925 }
@@ -999,9 +980,9 @@ static void stop_process_timers(struct signal_struct *sig)
999 struct thread_group_cputimer *cputimer = &sig->cputimer; 980 struct thread_group_cputimer *cputimer = &sig->cputimer;
1000 unsigned long flags; 981 unsigned long flags;
1001 982
1002 spin_lock_irqsave(&cputimer->lock, flags); 983 raw_spin_lock_irqsave(&cputimer->lock, flags);
1003 cputimer->running = 0; 984 cputimer->running = 0;
1004 spin_unlock_irqrestore(&cputimer->lock, flags); 985 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
1005} 986}
1006 987
1007static u32 onecputick; 988static u32 onecputick;
@@ -1009,20 +990,19 @@ static u32 onecputick;
1009static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, 990static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1010 cputime_t *expires, cputime_t cur_time, int signo) 991 cputime_t *expires, cputime_t cur_time, int signo)
1011{ 992{
1012 if (cputime_eq(it->expires, cputime_zero)) 993 if (!it->expires)
1013 return; 994 return;
1014 995
1015 if (cputime_ge(cur_time, it->expires)) { 996 if (cur_time >= it->expires) {
1016 if (!cputime_eq(it->incr, cputime_zero)) { 997 if (it->incr) {
1017 it->expires = cputime_add(it->expires, it->incr); 998 it->expires += it->incr;
1018 it->error += it->incr_error; 999 it->error += it->incr_error;
1019 if (it->error >= onecputick) { 1000 if (it->error >= onecputick) {
1020 it->expires = cputime_sub(it->expires, 1001 it->expires -= cputime_one_jiffy;
1021 cputime_one_jiffy);
1022 it->error -= onecputick; 1002 it->error -= onecputick;
1023 } 1003 }
1024 } else { 1004 } else {
1025 it->expires = cputime_zero; 1005 it->expires = 0;
1026 } 1006 }
1027 1007
1028 trace_itimer_expire(signo == SIGPROF ? 1008 trace_itimer_expire(signo == SIGPROF ?
@@ -1031,9 +1011,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1031 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); 1011 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
1032 } 1012 }
1033 1013
1034 if (!cputime_eq(it->expires, cputime_zero) && 1014 if (it->expires && (!*expires || it->expires < *expires)) {
1035 (cputime_eq(*expires, cputime_zero) ||
1036 cputime_lt(it->expires, *expires))) {
1037 *expires = it->expires; 1015 *expires = it->expires;
1038 } 1016 }
1039} 1017}
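
The check_cpu_itimer() rework above keeps the same error-accumulation scheme it had before, just spelled with plain arithmetic: each re-arm advances expires by incr and adds the per-interval rounding error, and once the accumulated error reaches a full tick, one jiffy is taken back out. A standalone sketch of that accumulation; the ONECPUTICK and CPUTIME_ONE_JIFFY values below are made up for illustration, only the arithmetic mirrors the hunk.

#include <stdio.h>

#define ONECPUTICK        1000000	/* illustrative: ns per accounting tick */
#define CPUTIME_ONE_JIFFY 10		/* illustrative: one jiffy in cputime units */

int main(void)
{
	unsigned long long expires = 100;	/* next expiry, cputime units */
	unsigned long long incr = 25;		/* requested interval */
	unsigned int incr_error = 400000;	/* rounding error per interval, ns */
	unsigned int error = 0;
	int i;

	for (i = 0; i < 5; i++) {
		/* Re-arm as in the patch: advance, then accumulate the error. */
		expires += incr;
		error += incr_error;
		if (error >= ONECPUTICK) {
			expires -= CPUTIME_ONE_JIFFY;
			error -= ONECPUTICK;
		}
		printf("rearm %d: expires=%llu error=%u\n", i, expires, error);
	}
	return 0;
}
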
@@ -1048,9 +1026,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1048 */ 1026 */
1049static inline int task_cputime_zero(const struct task_cputime *cputime) 1027static inline int task_cputime_zero(const struct task_cputime *cputime)
1050{ 1028{
1051 if (cputime_eq(cputime->utime, cputime_zero) && 1029 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
1052 cputime_eq(cputime->stime, cputime_zero) &&
1053 cputime->sum_exec_runtime == 0)
1054 return 1; 1030 return 1;
1055 return 0; 1031 return 0;
1056} 1032}
@@ -1076,15 +1052,15 @@ static void check_process_timers(struct task_struct *tsk,
1076 */ 1052 */
1077 thread_group_cputimer(tsk, &cputime); 1053 thread_group_cputimer(tsk, &cputime);
1078 utime = cputime.utime; 1054 utime = cputime.utime;
1079 ptime = cputime_add(utime, cputime.stime); 1055 ptime = utime + cputime.stime;
1080 sum_sched_runtime = cputime.sum_exec_runtime; 1056 sum_sched_runtime = cputime.sum_exec_runtime;
1081 maxfire = 20; 1057 maxfire = 20;
1082 prof_expires = cputime_zero; 1058 prof_expires = 0;
1083 while (!list_empty(timers)) { 1059 while (!list_empty(timers)) {
1084 struct cpu_timer_list *tl = list_first_entry(timers, 1060 struct cpu_timer_list *tl = list_first_entry(timers,
1085 struct cpu_timer_list, 1061 struct cpu_timer_list,
1086 entry); 1062 entry);
1087 if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { 1063 if (!--maxfire || ptime < tl->expires.cpu) {
1088 prof_expires = tl->expires.cpu; 1064 prof_expires = tl->expires.cpu;
1089 break; 1065 break;
1090 } 1066 }
@@ -1094,12 +1070,12 @@ static void check_process_timers(struct task_struct *tsk,
1094 1070
1095 ++timers; 1071 ++timers;
1096 maxfire = 20; 1072 maxfire = 20;
1097 virt_expires = cputime_zero; 1073 virt_expires = 0;
1098 while (!list_empty(timers)) { 1074 while (!list_empty(timers)) {
1099 struct cpu_timer_list *tl = list_first_entry(timers, 1075 struct cpu_timer_list *tl = list_first_entry(timers,
1100 struct cpu_timer_list, 1076 struct cpu_timer_list,
1101 entry); 1077 entry);
1102 if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { 1078 if (!--maxfire || utime < tl->expires.cpu) {
1103 virt_expires = tl->expires.cpu; 1079 virt_expires = tl->expires.cpu;
1104 break; 1080 break;
1105 } 1081 }
@@ -1154,8 +1130,7 @@ static void check_process_timers(struct task_struct *tsk,
1154 } 1130 }
1155 } 1131 }
1156 x = secs_to_cputime(soft); 1132 x = secs_to_cputime(soft);
1157 if (cputime_eq(prof_expires, cputime_zero) || 1133 if (!prof_expires || x < prof_expires) {
1158 cputime_lt(x, prof_expires)) {
1159 prof_expires = x; 1134 prof_expires = x;
1160 } 1135 }
1161 } 1136 }
@@ -1249,12 +1224,9 @@ out:
1249static inline int task_cputime_expired(const struct task_cputime *sample, 1224static inline int task_cputime_expired(const struct task_cputime *sample,
1250 const struct task_cputime *expires) 1225 const struct task_cputime *expires)
1251{ 1226{
1252 if (!cputime_eq(expires->utime, cputime_zero) && 1227 if (expires->utime && sample->utime >= expires->utime)
1253 cputime_ge(sample->utime, expires->utime))
1254 return 1; 1228 return 1;
1255 if (!cputime_eq(expires->stime, cputime_zero) && 1229 if (expires->stime && sample->utime + sample->stime >= expires->stime)
1256 cputime_ge(cputime_add(sample->utime, sample->stime),
1257 expires->stime))
1258 return 1; 1230 return 1;
1259 if (expires->sum_exec_runtime != 0 && 1231 if (expires->sum_exec_runtime != 0 &&
1260 sample->sum_exec_runtime >= expires->sum_exec_runtime) 1232 sample->sum_exec_runtime >= expires->sum_exec_runtime)
@@ -1291,9 +1263,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1291 if (sig->cputimer.running) { 1263 if (sig->cputimer.running) {
1292 struct task_cputime group_sample; 1264 struct task_cputime group_sample;
1293 1265
1294 spin_lock(&sig->cputimer.lock); 1266 raw_spin_lock(&sig->cputimer.lock);
1295 group_sample = sig->cputimer.cputime; 1267 group_sample = sig->cputimer.cputime;
1296 spin_unlock(&sig->cputimer.lock); 1268 raw_spin_unlock(&sig->cputimer.lock);
1297 1269
1298 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1270 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1299 return 1; 1271 return 1;
@@ -1389,18 +1361,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1389 * it to be relative, *newval argument is relative and we update 1361 * it to be relative, *newval argument is relative and we update
1390 * it to be absolute. 1362 * it to be absolute.
1391 */ 1363 */
1392 if (!cputime_eq(*oldval, cputime_zero)) { 1364 if (*oldval) {
1393 if (cputime_le(*oldval, now.cpu)) { 1365 if (*oldval <= now.cpu) {
1394 /* Just about to fire. */ 1366 /* Just about to fire. */
1395 *oldval = cputime_one_jiffy; 1367 *oldval = cputime_one_jiffy;
1396 } else { 1368 } else {
1397 *oldval = cputime_sub(*oldval, now.cpu); 1369 *oldval -= now.cpu;
1398 } 1370 }
1399 } 1371 }
1400 1372
1401 if (cputime_eq(*newval, cputime_zero)) 1373 if (!*newval)
1402 return; 1374 return;
1403 *newval = cputime_add(*newval, now.cpu); 1375 *newval += now.cpu;
1404 } 1376 }
1405 1377
1406 /* 1378 /*
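
All of the posix-cpu-timers.c hunks above follow one pattern: the cputime_eq()/cputime_lt()/cputime_add() helpers are replaced by ordinary integer comparisons and arithmetic, which presumes cputime_t behaves as a plain scalar. A minimal standalone sketch of the before and after forms of expires_gt(), modelling cputime_t as an unsigned 64-bit integer (an assumption made here purely for illustration):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the kernel's cputime_t. */
typedef uint64_t cputime_t;

/* Old style: opaque helpers, modelled here as plain functions. */
static int cputime_eq(cputime_t a, cputime_t b) { return a == b; }
static int cputime_gt(cputime_t a, cputime_t b) { return a > b; }
#define cputime_zero ((cputime_t)0)

static int expires_gt_old(cputime_t expires, cputime_t new_exp)
{
	return cputime_eq(expires, cputime_zero) ||
	       cputime_gt(expires, new_exp);
}

/* New style from the patch: plain integer operators. */
static int expires_gt_new(cputime_t expires, cputime_t new_exp)
{
	return expires == 0 || expires > new_exp;
}

int main(void)
{
	/* Both forms agree: 0 means "no expiry armed", otherwise compare. */
	assert(expires_gt_old(0, 5)  == expires_gt_new(0, 5));
	assert(expires_gt_old(10, 5) == expires_gt_new(10, 5));
	assert(expires_gt_old(3, 5)  == expires_gt_new(3, 5));
	puts("old and new expires_gt() agree");
	return 0;
}
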
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 4556182527f3..69185ae6b701 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -46,7 +46,7 @@
46#include <linux/syscalls.h> 46#include <linux/syscalls.h>
47#include <linux/wait.h> 47#include <linux/wait.h>
48#include <linux/workqueue.h> 48#include <linux/workqueue.h>
49#include <linux/module.h> 49#include <linux/export.h>
50 50
51/* 51/*
52 * Management arrays for POSIX timers. Timers are kept in slab memory 52 * Management arrays for POSIX timers. Timers are kept in slab memory
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 3744c594b19b..deb5461e3216 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,7 @@ config HIBERNATION
27 select HIBERNATE_CALLBACKS 27 select HIBERNATE_CALLBACKS
28 select LZO_COMPRESS 28 select LZO_COMPRESS
29 select LZO_DECOMPRESS 29 select LZO_DECOMPRESS
30 select CRC32
30 ---help--- 31 ---help---
31 Enable the suspend to disk (STD) functionality, which is usually 32 Enable the suspend to disk (STD) functionality, which is usually
32 called "hibernation" in user interfaces. STD checkpoints the 33 called "hibernation" in user interfaces. STD checkpoints the
@@ -65,6 +66,9 @@ config HIBERNATION
65 66
66 For more information take a look at <file:Documentation/power/swsusp.txt>. 67 For more information take a look at <file:Documentation/power/swsusp.txt>.
67 68
69config ARCH_SAVE_PAGE_KEYS
70 bool
71
68config PM_STD_PARTITION 72config PM_STD_PARTITION
69 string "Default resume partition" 73 string "Default resume partition"
70 depends on HIBERNATION 74 depends on HIBERNATION
@@ -235,3 +239,7 @@ config PM_GENERIC_DOMAINS
235config PM_GENERIC_DOMAINS_RUNTIME 239config PM_GENERIC_DOMAINS_RUNTIME
236 def_bool y 240 def_bool y
237 depends on PM_RUNTIME && PM_GENERIC_DOMAINS 241 depends on PM_RUNTIME && PM_GENERIC_DOMAINS
242
243config CPU_PM
244 bool
245 depends on SUSPEND || CPU_IDLE
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c5ebc6a90643..07e0e28ffba7 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,8 +1,8 @@
1 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3 3
4obj-$(CONFIG_PM) += main.o 4obj-$(CONFIG_PM) += main.o qos.o
5obj-$(CONFIG_PM_SLEEP) += console.o 5obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
6obj-$(CONFIG_FREEZER) += process.o 6obj-$(CONFIG_FREEZER) += process.o
7obj-$(CONFIG_SUSPEND) += suspend.o 7obj-$(CONFIG_SUSPEND) += suspend.o
8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 218e5af90156..b1dc456474b5 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * drivers/power/process.c - Functions for saving/restoring console. 2 * Functions for saving/restoring console.
3 * 3 *
4 * Originally from swsusp. 4 * Originally from swsusp.
5 */ 5 */
@@ -10,7 +10,6 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include "power.h" 11#include "power.h"
12 12
13#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 13#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
15 14
16static int orig_fgconsole, orig_kmsg; 15static int orig_fgconsole, orig_kmsg;
@@ -32,4 +31,3 @@ void pm_restore_console(void)
32 vt_kmsg_redirect(orig_kmsg); 31 vt_kmsg_redirect(orig_kmsg);
33 } 32 }
34} 33}
35#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8f7b1db1ece1..6d6d28870335 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -9,11 +9,13 @@
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
10 */ 10 */
11 11
12#include <linux/export.h>
12#include <linux/suspend.h> 13#include <linux/suspend.h>
13#include <linux/syscalls.h> 14#include <linux/syscalls.h>
14#include <linux/reboot.h> 15#include <linux/reboot.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/device.h> 17#include <linux/device.h>
18#include <linux/async.h>
17#include <linux/kmod.h> 19#include <linux/kmod.h>
18#include <linux/delay.h> 20#include <linux/delay.h>
19#include <linux/fs.h> 21#include <linux/fs.h>
@@ -29,18 +31,18 @@
29#include "power.h" 31#include "power.h"
30 32
31 33
32static int nocompress = 0; 34static int nocompress;
33static int noresume = 0; 35static int noresume;
36static int resume_wait;
37static int resume_delay;
34static char resume_file[256] = CONFIG_PM_STD_PARTITION; 38static char resume_file[256] = CONFIG_PM_STD_PARTITION;
35dev_t swsusp_resume_device; 39dev_t swsusp_resume_device;
36sector_t swsusp_resume_block; 40sector_t swsusp_resume_block;
37int in_suspend __nosavedata = 0; 41int in_suspend __nosavedata;
38 42
39enum { 43enum {
40 HIBERNATION_INVALID, 44 HIBERNATION_INVALID,
41 HIBERNATION_PLATFORM, 45 HIBERNATION_PLATFORM,
42 HIBERNATION_TEST,
43 HIBERNATION_TESTPROC,
44 HIBERNATION_SHUTDOWN, 46 HIBERNATION_SHUTDOWN,
45 HIBERNATION_REBOOT, 47 HIBERNATION_REBOOT,
46 /* keep last */ 48 /* keep last */
@@ -51,6 +53,8 @@ enum {
51 53
52static int hibernation_mode = HIBERNATION_SHUTDOWN; 54static int hibernation_mode = HIBERNATION_SHUTDOWN;
53 55
56bool freezer_test_done;
57
54static const struct platform_hibernation_ops *hibernation_ops; 58static const struct platform_hibernation_ops *hibernation_ops;
55 59
56/** 60/**
@@ -65,14 +69,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops)
65 WARN_ON(1); 69 WARN_ON(1);
66 return; 70 return;
67 } 71 }
68 mutex_lock(&pm_mutex); 72 lock_system_sleep();
69 hibernation_ops = ops; 73 hibernation_ops = ops;
70 if (ops) 74 if (ops)
71 hibernation_mode = HIBERNATION_PLATFORM; 75 hibernation_mode = HIBERNATION_PLATFORM;
72 else if (hibernation_mode == HIBERNATION_PLATFORM) 76 else if (hibernation_mode == HIBERNATION_PLATFORM)
73 hibernation_mode = HIBERNATION_SHUTDOWN; 77 hibernation_mode = HIBERNATION_SHUTDOWN;
74 78
75 mutex_unlock(&pm_mutex); 79 unlock_system_sleep();
76} 80}
77 81
78static bool entering_platform_hibernation; 82static bool entering_platform_hibernation;
@@ -90,15 +94,6 @@ static void hibernation_debug_sleep(void)
90 mdelay(5000); 94 mdelay(5000);
91} 95}
92 96
93static int hibernation_testmode(int mode)
94{
95 if (hibernation_mode == mode) {
96 hibernation_debug_sleep();
97 return 1;
98 }
99 return 0;
100}
101
102static int hibernation_test(int level) 97static int hibernation_test(int level)
103{ 98{
104 if (pm_test_level == level) { 99 if (pm_test_level == level) {
@@ -108,7 +103,6 @@ static int hibernation_test(int level)
108 return 0; 103 return 0;
109} 104}
110#else /* !CONFIG_PM_DEBUG */ 105#else /* !CONFIG_PM_DEBUG */
111static int hibernation_testmode(int mode) { return 0; }
112static int hibernation_test(int level) { return 0; } 106static int hibernation_test(int level) { return 0; }
113#endif /* !CONFIG_PM_DEBUG */ 107#endif /* !CONFIG_PM_DEBUG */
114 108
@@ -272,8 +266,7 @@ static int create_image(int platform_mode)
272 goto Platform_finish; 266 goto Platform_finish;
273 267
274 error = disable_nonboot_cpus(); 268 error = disable_nonboot_cpus();
275 if (error || hibernation_test(TEST_CPUS) 269 if (error || hibernation_test(TEST_CPUS))
276 || hibernation_testmode(HIBERNATION_TEST))
277 goto Enable_cpus; 270 goto Enable_cpus;
278 271
279 local_irq_disable(); 272 local_irq_disable();
@@ -327,38 +320,54 @@ static int create_image(int platform_mode)
327 */ 320 */
328int hibernation_snapshot(int platform_mode) 321int hibernation_snapshot(int platform_mode)
329{ 322{
330 pm_message_t msg = PMSG_RECOVER; 323 pm_message_t msg;
331 int error; 324 int error;
332 325
333 error = platform_begin(platform_mode); 326 error = platform_begin(platform_mode);
334 if (error) 327 if (error)
335 goto Close; 328 goto Close;
336 329
337 error = dpm_prepare(PMSG_FREEZE);
338 if (error)
339 goto Complete_devices;
340
341 /* Preallocate image memory before shutting down devices. */ 330 /* Preallocate image memory before shutting down devices. */
342 error = hibernate_preallocate_memory(); 331 error = hibernate_preallocate_memory();
343 if (error) 332 if (error)
344 goto Complete_devices; 333 goto Close;
334
335 error = freeze_kernel_threads();
336 if (error)
337 goto Cleanup;
338
339 if (hibernation_test(TEST_FREEZER)) {
340
341 /*
342 * Indicate to the caller that we are returning due to a
343 * successful freezer test.
344 */
345 freezer_test_done = true;
346 goto Cleanup;
347 }
348
349 error = dpm_prepare(PMSG_FREEZE);
350 if (error) {
351 dpm_complete(PMSG_RECOVER);
352 goto Cleanup;
353 }
345 354
346 suspend_console(); 355 suspend_console();
347 pm_restrict_gfp_mask(); 356 pm_restrict_gfp_mask();
357
348 error = dpm_suspend(PMSG_FREEZE); 358 error = dpm_suspend(PMSG_FREEZE);
349 if (error)
350 goto Recover_platform;
351 359
352 if (hibernation_test(TEST_DEVICES)) 360 if (error || hibernation_test(TEST_DEVICES))
353 goto Recover_platform; 361 platform_recover(platform_mode);
362 else
363 error = create_image(platform_mode);
354 364
355 error = create_image(platform_mode);
356 /* 365 /*
357 * Control returns here (1) after the image has been created or the 366 * In the case that we call create_image() above, the control
367 * returns here (1) after the image has been created or the
358 * image creation has failed and (2) after a successful restore. 368 * image creation has failed and (2) after a successful restore.
359 */ 369 */
360 370
361 Resume_devices:
362 /* We may need to release the preallocated image pages here. */ 371 /* We may need to release the preallocated image pages here. */
363 if (error || !in_suspend) 372 if (error || !in_suspend)
364 swsusp_free(); 373 swsusp_free();
@@ -370,17 +379,15 @@ int hibernation_snapshot(int platform_mode)
370 pm_restore_gfp_mask(); 379 pm_restore_gfp_mask();
371 380
372 resume_console(); 381 resume_console();
373
374 Complete_devices:
375 dpm_complete(msg); 382 dpm_complete(msg);
376 383
377 Close: 384 Close:
378 platform_end(platform_mode); 385 platform_end(platform_mode);
379 return error; 386 return error;
380 387
381 Recover_platform: 388 Cleanup:
382 platform_recover(platform_mode); 389 swsusp_free();
383 goto Resume_devices; 390 goto Close;
384} 391}
385 392
386/** 393/**
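
The rewritten hibernation_snapshot() above preallocates the image and freezes kernel threads before touching the devices, reports a successful freezer test back through freezer_test_done instead of a dedicated test mode, and funnels every failure through the Cleanup label. A compressed standalone model of that control flow, with the console, GFP-mask and dpm_suspend() steps omitted and every kernel helper replaced by a local stub (all stub bodies below are invented for the sketch):

#include <stdbool.h>
#include <stdio.h>

static bool freezer_test_done;		/* set on a successful freezer test */

/* Local stand-ins for the kernel helpers used in the hunk (all stubs). */
static int  platform_begin(int m)		{ (void)m; return 0; }
static void platform_end(int m)			{ (void)m; }
static int  hibernate_preallocate_memory(void)	{ return 0; }
static int  freeze_kernel_threads(void)		{ return 0; }
static bool hibernation_test_freezer(void)	{ return false; }
static int  dpm_prepare(void)			{ return 0; }
static void dpm_complete(void)			{ }
static void swsusp_free(void)			{ }
static int  create_image(int m)			{ (void)m; return 0; }

static int hibernation_snapshot_model(int platform_mode)
{
	int error;

	error = platform_begin(platform_mode);
	if (error)
		goto Close;

	/* Preallocate image memory before shutting down devices. */
	error = hibernate_preallocate_memory();
	if (error)
		goto Close;

	error = freeze_kernel_threads();
	if (error)
		goto Cleanup;

	if (hibernation_test_freezer()) {
		/* Tell the caller we only ran the freezer test. */
		freezer_test_done = true;
		goto Cleanup;
	}

	error = dpm_prepare();
	if (error) {
		dpm_complete();
		goto Cleanup;
	}

	error = create_image(platform_mode);
	dpm_complete();
 Close:
	platform_end(platform_mode);
	return error;

 Cleanup:
	swsusp_free();
	goto Close;
}

int main(void)
{
	printf("snapshot model returned %d, freezer_test_done=%d\n",
	       hibernation_snapshot_model(0), freezer_test_done);
	return 0;
}
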
@@ -463,7 +470,7 @@ static int resume_target_kernel(bool platform_mode)
463 * @platform_mode: If set, use platform driver to prepare for the transition. 470 * @platform_mode: If set, use platform driver to prepare for the transition.
464 * 471 *
465 * This routine must be called with pm_mutex held. If it is successful, control 472 * This routine must be called with pm_mutex held. If it is successful, control
466 * reappears in the restored target kernel in hibernation_snaphot(). 473 * reappears in the restored target kernel in hibernation_snapshot().
467 */ 474 */
468int hibernation_restore(int platform_mode) 475int hibernation_restore(int platform_mode)
469{ 476{
@@ -565,9 +572,6 @@ int hibernation_platform_enter(void)
565static void power_down(void) 572static void power_down(void)
566{ 573{
567 switch (hibernation_mode) { 574 switch (hibernation_mode) {
568 case HIBERNATION_TEST:
569 case HIBERNATION_TESTPROC:
570 break;
571 case HIBERNATION_REBOOT: 575 case HIBERNATION_REBOOT:
572 kernel_restart(NULL); 576 kernel_restart(NULL);
573 break; 577 break;
@@ -586,17 +590,6 @@ static void power_down(void)
586 while(1); 590 while(1);
587} 591}
588 592
589static int prepare_processes(void)
590{
591 int error = 0;
592
593 if (freeze_processes()) {
594 error = -EBUSY;
595 thaw_processes();
596 }
597 return error;
598}
599
600/** 593/**
601 * hibernate - Carry out system hibernation, including saving the image. 594 * hibernate - Carry out system hibernation, including saving the image.
602 */ 595 */
@@ -604,7 +597,7 @@ int hibernate(void)
604{ 597{
605 int error; 598 int error;
606 599
607 mutex_lock(&pm_mutex); 600 lock_system_sleep();
608 /* The snapshot device should not be opened while we're running */ 601 /* The snapshot device should not be opened while we're running */
609 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 602 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
610 error = -EBUSY; 603 error = -EBUSY;
@@ -629,19 +622,17 @@ int hibernate(void)
629 sys_sync(); 622 sys_sync();
630 printk("done.\n"); 623 printk("done.\n");
631 624
632 error = prepare_processes(); 625 error = freeze_processes();
633 if (error) 626 if (error)
634 goto Finish; 627 goto Finish;
635 628
636 if (hibernation_test(TEST_FREEZER))
637 goto Thaw;
638
639 if (hibernation_testmode(HIBERNATION_TESTPROC))
640 goto Thaw;
641
642 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 629 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
643 if (error) 630 if (error)
644 goto Thaw; 631 goto Thaw;
632 if (freezer_test_done) {
633 freezer_test_done = false;
634 goto Thaw;
635 }
645 636
646 if (in_suspend) { 637 if (in_suspend) {
647 unsigned int flags = 0; 638 unsigned int flags = 0;
@@ -650,6 +641,9 @@ int hibernate(void)
650 flags |= SF_PLATFORM_MODE; 641 flags |= SF_PLATFORM_MODE;
651 if (nocompress) 642 if (nocompress)
652 flags |= SF_NOCOMPRESS_MODE; 643 flags |= SF_NOCOMPRESS_MODE;
644 else
645 flags |= SF_CRC32_MODE;
646
653 pr_debug("PM: writing image.\n"); 647 pr_debug("PM: writing image.\n");
654 error = swsusp_write(flags); 648 error = swsusp_write(flags);
655 swsusp_free(); 649 swsusp_free();
@@ -671,7 +665,7 @@ int hibernate(void)
671 pm_restore_console(); 665 pm_restore_console();
672 atomic_inc(&snapshot_device_available); 666 atomic_inc(&snapshot_device_available);
673 Unlock: 667 Unlock:
674 mutex_unlock(&pm_mutex); 668 unlock_system_sleep();
675 return error; 669 return error;
676} 670}
677 671
@@ -724,6 +718,12 @@ static int software_resume(void)
724 718
725 pr_debug("PM: Checking hibernation image partition %s\n", resume_file); 719 pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
726 720
721 if (resume_delay) {
722 printk(KERN_INFO "Waiting %dsec before reading resume device...\n",
723 resume_delay);
724 ssleep(resume_delay);
725 }
726
727 /* Check if the device is there */ 727 /* Check if the device is there */
728 swsusp_resume_device = name_to_dev_t(resume_file); 728 swsusp_resume_device = name_to_dev_t(resume_file);
729 if (!swsusp_resume_device) { 729 if (!swsusp_resume_device) {
@@ -732,6 +732,13 @@ static int software_resume(void)
732 * to wait for this to finish. 732 * to wait for this to finish.
733 */ 733 */
734 wait_for_device_probe(); 734 wait_for_device_probe();
735
736 if (resume_wait) {
737 while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0)
738 msleep(10);
739 async_synchronize_full();
740 }
741
735 /* 742 /*
736 * We can't depend on SCSI devices being available after loading 743 * We can't depend on SCSI devices being available after loading
737 * one of their modules until scsi_complete_async_scans() is 744 * one of their modules until scsi_complete_async_scans() is
@@ -772,11 +779,13 @@ static int software_resume(void)
772 goto close_finish; 779 goto close_finish;
773 780
774 error = create_basic_memory_bitmaps(); 781 error = create_basic_memory_bitmaps();
775 if (error) 782 if (error) {
783 usermodehelper_enable();
776 goto close_finish; 784 goto close_finish;
785 }
777 786
778 pr_debug("PM: Preparing processes for restore.\n"); 787 pr_debug("PM: Preparing processes for restore.\n");
779 error = prepare_processes(); 788 error = freeze_processes();
780 if (error) { 789 if (error) {
781 swsusp_close(FMODE_READ); 790 swsusp_close(FMODE_READ);
782 goto Done; 791 goto Done;
@@ -816,8 +825,6 @@ static const char * const hibernation_modes[] = {
816 [HIBERNATION_PLATFORM] = "platform", 825 [HIBERNATION_PLATFORM] = "platform",
817 [HIBERNATION_SHUTDOWN] = "shutdown", 826 [HIBERNATION_SHUTDOWN] = "shutdown",
818 [HIBERNATION_REBOOT] = "reboot", 827 [HIBERNATION_REBOOT] = "reboot",
819 [HIBERNATION_TEST] = "test",
820 [HIBERNATION_TESTPROC] = "testproc",
821}; 828};
822 829
823/* 830/*
@@ -826,17 +833,15 @@ static const char * const hibernation_modes[] = {
826 * Hibernation can be handled in several ways. There are a few different ways 833 * Hibernation can be handled in several ways. There are a few different ways
827 * to put the system into the sleep state: using the platform driver (e.g. ACPI 834 * to put the system into the sleep state: using the platform driver (e.g. ACPI
828 * or other hibernation_ops), powering it off or rebooting it (for testing 835 * or other hibernation_ops), powering it off or rebooting it (for testing
829 * mostly), or using one of the two available test modes. 836 * mostly).
830 * 837 *
831 * The sysfs file /sys/power/disk provides an interface for selecting the 838 * The sysfs file /sys/power/disk provides an interface for selecting the
832 * hibernation mode to use. Reading from this file causes the available modes 839 * hibernation mode to use. Reading from this file causes the available modes
833 * to be printed. There are 5 modes that can be supported: 840 * to be printed. There are 3 modes that can be supported:
834 * 841 *
835 * 'platform' 842 * 'platform'
836 * 'shutdown' 843 * 'shutdown'
837 * 'reboot' 844 * 'reboot'
838 * 'test'
839 * 'testproc'
840 * 845 *
841 * If a platform hibernation driver is in use, 'platform' will be supported 846 * If a platform hibernation driver is in use, 'platform' will be supported
842 * and will be used by default. Otherwise, 'shutdown' will be used by default. 847 * and will be used by default. Otherwise, 'shutdown' will be used by default.
@@ -860,8 +865,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
860 switch (i) { 865 switch (i) {
861 case HIBERNATION_SHUTDOWN: 866 case HIBERNATION_SHUTDOWN:
862 case HIBERNATION_REBOOT: 867 case HIBERNATION_REBOOT:
863 case HIBERNATION_TEST:
864 case HIBERNATION_TESTPROC:
865 break; 868 break;
866 case HIBERNATION_PLATFORM: 869 case HIBERNATION_PLATFORM:
867 if (hibernation_ops) 870 if (hibernation_ops)
@@ -890,7 +893,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
890 p = memchr(buf, '\n', n); 893 p = memchr(buf, '\n', n);
891 len = p ? p - buf : n; 894 len = p ? p - buf : n;
892 895
893 mutex_lock(&pm_mutex); 896 lock_system_sleep();
894 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { 897 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
895 if (len == strlen(hibernation_modes[i]) 898 if (len == strlen(hibernation_modes[i])
896 && !strncmp(buf, hibernation_modes[i], len)) { 899 && !strncmp(buf, hibernation_modes[i], len)) {
@@ -902,8 +905,6 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
902 switch (mode) { 905 switch (mode) {
903 case HIBERNATION_SHUTDOWN: 906 case HIBERNATION_SHUTDOWN:
904 case HIBERNATION_REBOOT: 907 case HIBERNATION_REBOOT:
905 case HIBERNATION_TEST:
906 case HIBERNATION_TESTPROC:
907 hibernation_mode = mode; 908 hibernation_mode = mode;
908 break; 909 break;
909 case HIBERNATION_PLATFORM: 910 case HIBERNATION_PLATFORM:
@@ -918,7 +919,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
918 if (!error) 919 if (!error)
919 pr_debug("PM: Hibernation mode set to '%s'\n", 920 pr_debug("PM: Hibernation mode set to '%s'\n",
920 hibernation_modes[mode]); 921 hibernation_modes[mode]);
921 mutex_unlock(&pm_mutex); 922 unlock_system_sleep();
922 return error ? error : n; 923 return error ? error : n;
923} 924}
924 925
@@ -945,9 +946,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
945 if (maj != MAJOR(res) || min != MINOR(res)) 946 if (maj != MAJOR(res) || min != MINOR(res))
946 goto out; 947 goto out;
947 948
948 mutex_lock(&pm_mutex); 949 lock_system_sleep();
949 swsusp_resume_device = res; 950 swsusp_resume_device = res;
950 mutex_unlock(&pm_mutex); 951 unlock_system_sleep();
951 printk(KERN_INFO "PM: Starting manual resume from disk\n"); 952 printk(KERN_INFO "PM: Starting manual resume from disk\n");
952 noresume = 0; 953 noresume = 0;
953 software_resume(); 954 software_resume();
@@ -1060,7 +1061,21 @@ static int __init noresume_setup(char *str)
1060 return 1; 1061 return 1;
1061} 1062}
1062 1063
1064static int __init resumewait_setup(char *str)
1065{
1066 resume_wait = 1;
1067 return 1;
1068}
1069
1070static int __init resumedelay_setup(char *str)
1071{
1072 resume_delay = simple_strtoul(str, NULL, 0);
1073 return 1;
1074}
1075
1063__setup("noresume", noresume_setup); 1076__setup("noresume", noresume_setup);
1064__setup("resume_offset=", resume_offset_setup); 1077__setup("resume_offset=", resume_offset_setup);
1065__setup("resume=", resume_setup); 1078__setup("resume=", resume_setup);
1066__setup("hibernate=", hibernate_setup); 1079__setup("hibernate=", hibernate_setup);
1080__setup("resumewait", resumewait_setup);
1081__setup("resumedelay=", resumedelay_setup);
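
hibernate.c also gains two boot parameters: resumedelay=<seconds> sleeps before the resume device is looked up, and resumewait polls name_to_dev_t() until the device finally appears. A standalone sketch of that wait-or-delay decision; lookup_resume_device() below is a hypothetical stand-in for name_to_dev_t(), and the sleep calls stand in for ssleep()/msleep():

#include <stdio.h>
#include <unistd.h>

/* Hypothetical stand-in for name_to_dev_t(): 0 means "not there yet". */
static unsigned long lookup_resume_device(const char *name, int attempt)
{
	(void)name;
	return attempt >= 3 ? 0x803 : 0;	/* pretend it shows up on try 3 */
}

int main(void)
{
	int resume_wait = 1;		/* "resumewait" on the command line */
	int resume_delay = 0;		/* "resumedelay=<seconds>" */
	const char *resume_file = "/dev/sda3";
	unsigned long dev;
	int attempt = 0;

	if (resume_delay) {
		printf("Waiting %dsec before reading resume device...\n",
		       resume_delay);
		sleep(resume_delay);		/* ssleep() in the kernel */
	}

	dev = lookup_resume_device(resume_file, attempt);
	if (!dev && resume_wait) {
		/* Poll until the device node appears, as the patch does. */
		while ((dev = lookup_resume_device(resume_file, ++attempt)) == 0)
			usleep(10 * 1000);	/* msleep(10) in the kernel */
	}

	printf("resume device %s -> %#lx after %d retries\n",
	       resume_file, dev, attempt);
	return 0;
}
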
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6c601f871964..9824b41e5a18 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -3,15 +3,18 @@
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * 6 *
7 * This file is released under the GPLv2 7 * This file is released under the GPLv2
8 * 8 *
9 */ 9 */
10 10
11#include <linux/export.h>
11#include <linux/kobject.h> 12#include <linux/kobject.h>
12#include <linux/string.h> 13#include <linux/string.h>
13#include <linux/resume-trace.h> 14#include <linux/resume-trace.h>
14#include <linux/workqueue.h> 15#include <linux/workqueue.h>
16#include <linux/debugfs.h>
17#include <linux/seq_file.h>
15 18
16#include "power.h" 19#include "power.h"
17 20
@@ -113,7 +116,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
113 p = memchr(buf, '\n', n); 116 p = memchr(buf, '\n', n);
114 len = p ? p - buf : n; 117 len = p ? p - buf : n;
115 118
116 mutex_lock(&pm_mutex); 119 lock_system_sleep();
117 120
118 level = TEST_FIRST; 121 level = TEST_FIRST;
119 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) 122 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -123,7 +126,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
123 break; 126 break;
124 } 127 }
125 128
126 mutex_unlock(&pm_mutex); 129 unlock_system_sleep();
127 130
128 return error ? error : n; 131 return error ? error : n;
129} 132}
@@ -131,6 +134,101 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
131power_attr(pm_test); 134power_attr(pm_test);
132#endif /* CONFIG_PM_DEBUG */ 135#endif /* CONFIG_PM_DEBUG */
133 136
137#ifdef CONFIG_DEBUG_FS
138static char *suspend_step_name(enum suspend_stat_step step)
139{
140 switch (step) {
141 case SUSPEND_FREEZE:
142 return "freeze";
143 case SUSPEND_PREPARE:
144 return "prepare";
145 case SUSPEND_SUSPEND:
146 return "suspend";
147 case SUSPEND_SUSPEND_NOIRQ:
148 return "suspend_noirq";
149 case SUSPEND_RESUME_NOIRQ:
150 return "resume_noirq";
151 case SUSPEND_RESUME:
152 return "resume";
153 default:
154 return "";
155 }
156}
157
158static int suspend_stats_show(struct seq_file *s, void *unused)
159{
160 int i, index, last_dev, last_errno, last_step;
161
162 last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
163 last_dev %= REC_FAILED_NUM;
164 last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1;
165 last_errno %= REC_FAILED_NUM;
166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
167 last_step %= REC_FAILED_NUM;
168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
170 "success", suspend_stats.success,
171 "fail", suspend_stats.fail,
172 "failed_freeze", suspend_stats.failed_freeze,
173 "failed_prepare", suspend_stats.failed_prepare,
174 "failed_suspend", suspend_stats.failed_suspend,
175 "failed_suspend_noirq",
176 suspend_stats.failed_suspend_noirq,
177 "failed_resume", suspend_stats.failed_resume,
178 "failed_resume_noirq",
179 suspend_stats.failed_resume_noirq);
180 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
181 suspend_stats.failed_devs[last_dev]);
182 for (i = 1; i < REC_FAILED_NUM; i++) {
183 index = last_dev + REC_FAILED_NUM - i;
184 index %= REC_FAILED_NUM;
185 seq_printf(s, "\t\t\t%-s\n",
186 suspend_stats.failed_devs[index]);
187 }
188 seq_printf(s, " last_failed_errno:\t%-d\n",
189 suspend_stats.errno[last_errno]);
190 for (i = 1; i < REC_FAILED_NUM; i++) {
191 index = last_errno + REC_FAILED_NUM - i;
192 index %= REC_FAILED_NUM;
193 seq_printf(s, "\t\t\t%-d\n",
194 suspend_stats.errno[index]);
195 }
196 seq_printf(s, " last_failed_step:\t%-s\n",
197 suspend_step_name(
198 suspend_stats.failed_steps[last_step]));
199 for (i = 1; i < REC_FAILED_NUM; i++) {
200 index = last_step + REC_FAILED_NUM - i;
201 index %= REC_FAILED_NUM;
202 seq_printf(s, "\t\t\t%-s\n",
203 suspend_step_name(
204 suspend_stats.failed_steps[index]));
205 }
206
207 return 0;
208}
209
210static int suspend_stats_open(struct inode *inode, struct file *file)
211{
212 return single_open(file, suspend_stats_show, NULL);
213}
214
215static const struct file_operations suspend_stats_operations = {
216 .open = suspend_stats_open,
217 .read = seq_read,
218 .llseek = seq_lseek,
219 .release = single_release,
220};
221
222static int __init pm_debugfs_init(void)
223{
224 debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO,
225 NULL, NULL, &suspend_stats_operations);
226 return 0;
227}
228
229late_initcall(pm_debugfs_init);
230#endif /* CONFIG_DEBUG_FS */
231
134#endif /* CONFIG_PM_SLEEP */ 232#endif /* CONFIG_PM_SLEEP */
135 233
136struct kobject *power_kobj; 234struct kobject *power_kobj;
@@ -142,7 +240,7 @@ struct kobject *power_kobj;
142 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and 240 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
143 * 'disk' (Suspend-to-Disk). 241 * 'disk' (Suspend-to-Disk).
144 * 242 *
145 * store() accepts one of those strings, translates it into the 243 * store() accepts one of those strings, translates it into the
146 * proper enumerated value, and initiates a suspend transition. 244 * proper enumerated value, and initiates a suspend transition.
147 */ 245 */
148static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 246static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -184,7 +282,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
184 /* First, check if we are requested to hibernate */ 282 /* First, check if we are requested to hibernate */
185 if (len == 4 && !strncmp(buf, "disk", len)) { 283 if (len == 4 && !strncmp(buf, "disk", len)) {
186 error = hibernate(); 284 error = hibernate();
187 goto Exit; 285 goto Exit;
188 } 286 }
189 287
190#ifdef CONFIG_SUSPEND 288#ifdef CONFIG_SUSPEND
@@ -192,8 +290,14 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
192 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 290 if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
193 break; 291 break;
194 } 292 }
195 if (state < PM_SUSPEND_MAX && *s) 293 if (state < PM_SUSPEND_MAX && *s) {
196 error = enter_state(state); 294 error = enter_state(state);
295 if (error) {
296 suspend_stats.fail++;
297 dpm_save_failed_errno(error);
298 } else
299 suspend_stats.success++;
300 }
197#endif 301#endif
198 302
199 Exit: 303 Exit:
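
The new suspend_stats debugfs file walks each REC_FAILED_NUM-slot ring backwards so the most recent failure prints first; the index arithmetic is the subtle part. A standalone model of just that walk, assuming REC_FAILED_NUM is 2 (the value is an assumption here; the real definition lives in include/linux/suspend.h) and treating last_failed_dev as the next write position:

#include <stdio.h>

#define REC_FAILED_NUM 2	/* assumed value; see include/linux/suspend.h */

int main(void)
{
	const char *failed_devs[REC_FAILED_NUM] = { "usb1", "eth0" };
	int next = 0;		/* models suspend_stats.last_failed_dev */
	int last_dev, i, index;

	/* Most recent entry sits one slot behind the write cursor. */
	last_dev = (next + REC_FAILED_NUM - 1) % REC_FAILED_NUM;
	printf("failures:\n last_failed_dev:\t%s\n", failed_devs[last_dev]);

	/* Older entries, newest to oldest, as suspend_stats_show() does. */
	for (i = 1; i < REC_FAILED_NUM; i++) {
		index = (last_dev + REC_FAILED_NUM - i) % REC_FAILED_NUM;
		printf("\t\t\t%s\n", failed_devs[index]);
	}
	return 0;
}
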
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9a00a0a26280..0c4defe6d3b8 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -50,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
51 51
52/* kernel/power/hibernate.c */ 52/* kernel/power/hibernate.c */
53extern bool freezer_test_done;
54
53extern int hibernation_snapshot(int platform_mode); 55extern int hibernation_snapshot(int platform_mode);
54extern int hibernation_restore(int platform_mode); 56extern int hibernation_restore(int platform_mode);
55extern int hibernation_platform_enter(void); 57extern int hibernation_platform_enter(void);
@@ -146,6 +148,7 @@ extern int swsusp_swap_in_use(void);
146 */ 148 */
147#define SF_PLATFORM_MODE 1 149#define SF_PLATFORM_MODE 1
148#define SF_NOCOMPRESS_MODE 2 150#define SF_NOCOMPRESS_MODE 2
151#define SF_CRC32_MODE 4
149 152
150/* kernel/power/hibernate.c */ 153/* kernel/power/hibernate.c */
151extern int swsusp_check(void); 154extern int swsusp_check(void);
@@ -228,7 +231,8 @@ extern int pm_test_level;
228#ifdef CONFIG_SUSPEND_FREEZER 231#ifdef CONFIG_SUSPEND_FREEZER
229static inline int suspend_freeze_processes(void) 232static inline int suspend_freeze_processes(void)
230{ 233{
231 return freeze_processes(); 234 int error = freeze_processes();
235 return error ? : freeze_kernel_threads();
232} 236}
233 237
234static inline void suspend_thaw_processes(void) 238static inline void suspend_thaw_processes(void)
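
suspend_freeze_processes() above now chains the two freezing stages with the GNU "?:" shorthand: it returns the error from freeze_processes() if there is one, otherwise whatever freeze_kernel_threads() returns. The same logic written out in portable C with both stages stubbed locally:

#include <stdio.h>

static int freeze_processes(void)	{ return 0; }	/* stub: success */
static int freeze_kernel_threads(void)	{ return -16; }	/* stub: -EBUSY  */

static int suspend_freeze_processes(void)
{
	int error = freeze_processes();

	/* Equivalent to the patch's "return error ? : freeze_kernel_threads();" */
	return error ? error : freeze_kernel_threads();
}

int main(void)
{
	printf("suspend_freeze_processes() -> %d\n", suspend_freeze_processes());
	return 0;
}
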
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0cf3a27a6c9d..77274c9ba2f1 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,16 +22,7 @@
22 */ 22 */
23#define TIMEOUT (20 * HZ) 23#define TIMEOUT (20 * HZ)
24 24
25static inline int freezable(struct task_struct * p) 25static int try_to_freeze_tasks(bool user_only)
26{
27 if ((p == current) ||
28 (p->flags & PF_NOFREEZE) ||
29 (p->exit_state != 0))
30 return 0;
31 return 1;
32}
33
34static int try_to_freeze_tasks(bool sig_only)
35{ 26{
36 struct task_struct *g, *p; 27 struct task_struct *g, *p;
37 unsigned long end_time; 28 unsigned long end_time;
@@ -46,17 +37,14 @@ static int try_to_freeze_tasks(bool sig_only)
46 37
47 end_time = jiffies + TIMEOUT; 38 end_time = jiffies + TIMEOUT;
48 39
49 if (!sig_only) 40 if (!user_only)
50 freeze_workqueues_begin(); 41 freeze_workqueues_begin();
51 42
52 while (true) { 43 while (true) {
53 todo = 0; 44 todo = 0;
54 read_lock(&tasklist_lock); 45 read_lock(&tasklist_lock);
55 do_each_thread(g, p) { 46 do_each_thread(g, p) {
56 if (frozen(p) || !freezable(p)) 47 if (p == current || !freeze_task(p))
57 continue;
58
59 if (!freeze_task(p, sig_only))
60 continue; 48 continue;
61 49
62 /* 50 /*
@@ -77,7 +65,7 @@ static int try_to_freeze_tasks(bool sig_only)
77 } while_each_thread(g, p); 65 } while_each_thread(g, p);
78 read_unlock(&tasklist_lock); 66 read_unlock(&tasklist_lock);
79 67
80 if (!sig_only) { 68 if (!user_only) {
81 wq_busy = freeze_workqueues_busy(); 69 wq_busy = freeze_workqueues_busy();
82 todo += wq_busy; 70 todo += wq_busy;
83 } 71 }
@@ -103,11 +91,6 @@ static int try_to_freeze_tasks(bool sig_only)
103 elapsed_csecs = elapsed_csecs64; 91 elapsed_csecs = elapsed_csecs64;
104 92
105 if (todo) { 93 if (todo) {
106 /* This does not unfreeze processes that are already frozen
107 * (we have slightly ugly calling convention in that respect,
108 * and caller must call thaw_processes() if something fails),
109 * but it cleans up leftover PF_FREEZE requests.
110 */
111 printk("\n"); 94 printk("\n");
112 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " 95 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "
113 "(%d tasks refusing to freeze, wq_busy=%d):\n", 96 "(%d tasks refusing to freeze, wq_busy=%d):\n",
@@ -115,15 +98,11 @@ static int try_to_freeze_tasks(bool sig_only)
115 elapsed_csecs / 100, elapsed_csecs % 100, 98 elapsed_csecs / 100, elapsed_csecs % 100,
116 todo - wq_busy, wq_busy); 99 todo - wq_busy, wq_busy);
117 100
118 thaw_workqueues();
119
120 read_lock(&tasklist_lock); 101 read_lock(&tasklist_lock);
121 do_each_thread(g, p) { 102 do_each_thread(g, p) {
122 task_lock(p); 103 if (!wakeup && !freezer_should_skip(p) &&
123 if (!wakeup && freezing(p) && !freezer_should_skip(p)) 104 p != current && freezing(p) && !frozen(p))
124 sched_show_task(p); 105 sched_show_task(p);
125 cancel_freezing(p);
126 task_unlock(p);
127 } while_each_thread(g, p); 106 } while_each_thread(g, p);
128 read_unlock(&tasklist_lock); 107 read_unlock(&tasklist_lock);
129 } else { 108 } else {
@@ -135,60 +114,76 @@ static int try_to_freeze_tasks(bool sig_only)
135} 114}
136 115
137/** 116/**
138 * freeze_processes - tell processes to enter the refrigerator 117 * freeze_processes - Signal user space processes to enter the refrigerator.
118 *
119 * On success, returns 0. On failure, -errno and system is fully thawed.
139 */ 120 */
140int freeze_processes(void) 121int freeze_processes(void)
141{ 122{
142 int error; 123 int error;
143 124
125 if (!pm_freezing)
126 atomic_inc(&system_freezing_cnt);
127
144 printk("Freezing user space processes ... "); 128 printk("Freezing user space processes ... ");
129 pm_freezing = true;
145 error = try_to_freeze_tasks(true); 130 error = try_to_freeze_tasks(true);
131 if (!error) {
132 printk("done.");
133 oom_killer_disable();
134 }
135 printk("\n");
136 BUG_ON(in_atomic());
137
146 if (error) 138 if (error)
147 goto Exit; 139 thaw_processes();
148 printk("done.\n"); 140 return error;
141}
142
143/**
144 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
145 *
146 * On success, returns 0. On failure, -errno and system is fully thawed.
147 */
148int freeze_kernel_threads(void)
149{
150 int error;
149 151
150 printk("Freezing remaining freezable tasks ... "); 152 printk("Freezing remaining freezable tasks ... ");
153 pm_nosig_freezing = true;
151 error = try_to_freeze_tasks(false); 154 error = try_to_freeze_tasks(false);
152 if (error) 155 if (!error)
153 goto Exit; 156 printk("done.");
154 printk("done.");
155 157
156 oom_killer_disable();
157 Exit:
158 BUG_ON(in_atomic());
159 printk("\n"); 158 printk("\n");
159 BUG_ON(in_atomic());
160 160
161 if (error)
162 thaw_processes();
161 return error; 163 return error;
162} 164}
163 165
164static void thaw_tasks(bool nosig_only) 166void thaw_processes(void)
165{ 167{
166 struct task_struct *g, *p; 168 struct task_struct *g, *p;
167 169
168 read_lock(&tasklist_lock); 170 if (pm_freezing)
169 do_each_thread(g, p) { 171 atomic_dec(&system_freezing_cnt);
170 if (!freezable(p)) 172 pm_freezing = false;
171 continue; 173 pm_nosig_freezing = false;
172 174
173 if (nosig_only && should_send_signal(p)) 175 oom_killer_enable();
174 continue;
175 176
176 if (cgroup_freezing_or_frozen(p)) 177 printk("Restarting tasks ... ");
177 continue;
178 178
179 thaw_process(p); 179 thaw_workqueues();
180
181 read_lock(&tasklist_lock);
182 do_each_thread(g, p) {
183 __thaw_task(p);
180 } while_each_thread(g, p); 184 } while_each_thread(g, p);
181 read_unlock(&tasklist_lock); 185 read_unlock(&tasklist_lock);
182}
183
184void thaw_processes(void)
185{
186 oom_killer_enable();
187 186
188 printk("Restarting tasks ... ");
189 thaw_workqueues();
190 thaw_tasks(true);
191 thaw_tasks(false);
192 schedule(); 187 schedule();
193 printk("done.\n"); 188 printk("done.\n");
194} 189}
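
After the process.c rework above, freezing is split into freeze_processes() for user space and freeze_kernel_threads() for the remaining freezable tasks, and each of them thaws the whole system itself if it fails, so callers no longer need the old unwind dance. A standalone model of that contract as a caller sees it; every function below is a local stub standing in for the kernel one of the same name:

#include <stdio.h>

static int tasks_frozen;

static void thaw_processes(void)	{ tasks_frozen = 0; puts("thawed"); }

static int try_to_freeze_tasks(int user_only)
{
	tasks_frozen += user_only ? 1 : 2;
	return 0;				/* pretend everything froze */
}

static int freeze_processes(void)
{
	int error = try_to_freeze_tasks(1);	/* user space only */

	if (error)
		thaw_processes();		/* failure thaws everything */
	return error;
}

static int freeze_kernel_threads(void)
{
	int error = try_to_freeze_tasks(0);	/* remaining freezable tasks */

	if (error)
		thaw_processes();
	return error;
}

int main(void)
{
	if (freeze_processes() == 0 && freeze_kernel_threads() == 0)
		printf("frozen (%d stages), caller must thaw explicitly\n",
		       tasks_frozen);
	thaw_processes();
	return 0;
}
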
diff --git a/kernel/pm_qos_params.c b/kernel/power/qos.c
index 37f05d0f0793..995e3bd3417b 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/power/qos.c
@@ -29,7 +29,7 @@
29 29
30/*#define DEBUG*/ 30/*#define DEBUG*/
31 31
32#include <linux/pm_qos_params.h> 32#include <linux/pm_qos.h>
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <linux/spinlock.h> 34#include <linux/spinlock.h>
35#include <linux/slab.h> 35#include <linux/slab.h>
@@ -43,64 +43,61 @@
43#include <linux/kernel.h> 43#include <linux/kernel.h>
44 44
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/export.h>
46 47
47/* 48/*
48 * locking rule: all changes to requests or notifiers lists 49 * locking rule: all changes to constraints or notifiers lists
49 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 50 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
50 * held, taken with _irqsave. One lock to rule them all 51 * held, taken with _irqsave. One lock to rule them all
51 */ 52 */
52enum pm_qos_type {
53 PM_QOS_MAX, /* return the largest value */
54 PM_QOS_MIN /* return the smallest value */
55};
56
57/*
58 * Note: The lockless read path depends on the CPU accessing
59 * target_value atomically. Atomic access is only guaranteed on all CPU
60 * types linux supports for 32 bit quantites
61 */
62struct pm_qos_object { 53struct pm_qos_object {
63 struct plist_head requests; 54 struct pm_qos_constraints *constraints;
64 struct blocking_notifier_head *notifiers;
65 struct miscdevice pm_qos_power_miscdev; 55 struct miscdevice pm_qos_power_miscdev;
66 char *name; 56 char *name;
67 s32 target_value; /* Do not change to 64 bit */
68 s32 default_value;
69 enum pm_qos_type type;
70}; 57};
71 58
72static DEFINE_SPINLOCK(pm_qos_lock); 59static DEFINE_SPINLOCK(pm_qos_lock);
73 60
74static struct pm_qos_object null_pm_qos; 61static struct pm_qos_object null_pm_qos;
62
75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 63static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
76static struct pm_qos_object cpu_dma_pm_qos = { 64static struct pm_qos_constraints cpu_dma_constraints = {
77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests), 65 .list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
78 .notifiers = &cpu_dma_lat_notifier,
79 .name = "cpu_dma_latency",
80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, 66 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
81 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, 67 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
82 .type = PM_QOS_MIN, 68 .type = PM_QOS_MIN,
69 .notifiers = &cpu_dma_lat_notifier,
70};
71static struct pm_qos_object cpu_dma_pm_qos = {
72 .constraints = &cpu_dma_constraints,
73 .name = "cpu_dma_latency",
83}; 74};
84 75
85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 76static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
86static struct pm_qos_object network_lat_pm_qos = { 77static struct pm_qos_constraints network_lat_constraints = {
87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests), 78 .list = PLIST_HEAD_INIT(network_lat_constraints.list),
88 .notifiers = &network_lat_notifier,
89 .name = "network_latency",
90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, 79 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
91 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, 80 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
92 .type = PM_QOS_MIN 81 .type = PM_QOS_MIN,
82 .notifiers = &network_lat_notifier,
83};
84static struct pm_qos_object network_lat_pm_qos = {
85 .constraints = &network_lat_constraints,
86 .name = "network_latency",
93}; 87};
94 88
95 89
96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 90static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
97static struct pm_qos_object network_throughput_pm_qos = { 91static struct pm_qos_constraints network_tput_constraints = {
98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests), 92 .list = PLIST_HEAD_INIT(network_tput_constraints.list),
99 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput",
101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, 93 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
102 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, 94 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
103 .type = PM_QOS_MAX, 95 .type = PM_QOS_MAX,
96 .notifiers = &network_throughput_notifier,
97};
98static struct pm_qos_object network_throughput_pm_qos = {
99 .constraints = &network_tput_constraints,
100 .name = "network_throughput",
104}; 101};
105 102
106 103
@@ -127,17 +124,17 @@ static const struct file_operations pm_qos_power_fops = {
127}; 124};
128 125
129/* unlocked internal variant */ 126/* unlocked internal variant */
130static inline int pm_qos_get_value(struct pm_qos_object *o) 127static inline int pm_qos_get_value(struct pm_qos_constraints *c)
131{ 128{
132 if (plist_head_empty(&o->requests)) 129 if (plist_head_empty(&c->list))
133 return o->default_value; 130 return c->default_value;
134 131
135 switch (o->type) { 132 switch (c->type) {
136 case PM_QOS_MIN: 133 case PM_QOS_MIN:
137 return plist_first(&o->requests)->prio; 134 return plist_first(&c->list)->prio;
138 135
139 case PM_QOS_MAX: 136 case PM_QOS_MAX:
140 return plist_last(&o->requests)->prio; 137 return plist_last(&c->list)->prio;
141 138
142 default: 139 default:
143 /* runtime check for not using enum */ 140 /* runtime check for not using enum */
@@ -145,69 +142,73 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
145 } 142 }
146} 143}
147 144
148static inline s32 pm_qos_read_value(struct pm_qos_object *o) 145s32 pm_qos_read_value(struct pm_qos_constraints *c)
149{ 146{
150 return o->target_value; 147 return c->target_value;
151} 148}
152 149
153static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value) 150static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
154{ 151{
155 o->target_value = value; 152 c->target_value = value;
156} 153}
157 154
158static void update_target(struct pm_qos_object *o, struct plist_node *node, 155/**
159 int del, int value) 156 * pm_qos_update_target - manages the constraints list and calls the notifiers
157 * if needed
158 * @c: constraints data struct
159 * @node: request to add to the list, to update or to remove
160 * @action: action to take on the constraints list
161 * @value: value of the request to add or update
162 *
163 * This function returns 1 if the aggregated constraint value has changed, 0
164 * otherwise.
165 */
166int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
167 enum pm_qos_req_action action, int value)
160{ 168{
161 unsigned long flags; 169 unsigned long flags;
162 int prev_value, curr_value; 170 int prev_value, curr_value, new_value;
163 171
164 spin_lock_irqsave(&pm_qos_lock, flags); 172 spin_lock_irqsave(&pm_qos_lock, flags);
165 prev_value = pm_qos_get_value(o); 173 prev_value = pm_qos_get_value(c);
166 /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ 174 if (value == PM_QOS_DEFAULT_VALUE)
167 if (value != PM_QOS_DEFAULT_VALUE) { 175 new_value = c->default_value;
176 else
177 new_value = value;
178
179 switch (action) {
180 case PM_QOS_REMOVE_REQ:
181 plist_del(node, &c->list);
182 break;
183 case PM_QOS_UPDATE_REQ:
168 /* 184 /*
169 * to change the list, we atomically remove, reinit 185 * to change the list, we atomically remove, reinit
170 * with new value and add, then see if the extremal 186 * with new value and add, then see if the extremal
171 * changed 187 * changed
172 */ 188 */
173 plist_del(node, &o->requests); 189 plist_del(node, &c->list);
174 plist_node_init(node, value); 190 case PM_QOS_ADD_REQ:
175 plist_add(node, &o->requests); 191 plist_node_init(node, new_value);
176 } else if (del) { 192 plist_add(node, &c->list);
177 plist_del(node, &o->requests); 193 break;
178 } else { 194 default:
179 plist_add(node, &o->requests); 195 /* no action */
196 ;
180 } 197 }
181 curr_value = pm_qos_get_value(o); 198
182 pm_qos_set_value(o, curr_value); 199 curr_value = pm_qos_get_value(c);
200 pm_qos_set_value(c, curr_value);
201
183 spin_unlock_irqrestore(&pm_qos_lock, flags); 202 spin_unlock_irqrestore(&pm_qos_lock, flags);
184 203
185 if (prev_value != curr_value) 204 if (prev_value != curr_value) {
186 blocking_notifier_call_chain(o->notifiers, 205 blocking_notifier_call_chain(c->notifiers,
187 (unsigned long)curr_value, 206 (unsigned long)curr_value,
188 NULL); 207 NULL);
189} 208 return 1;
190 209 } else {
191static int register_pm_qos_misc(struct pm_qos_object *qos) 210 return 0;
192{
193 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
194 qos->pm_qos_power_miscdev.name = qos->name;
195 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
196
197 return misc_register(&qos->pm_qos_power_miscdev);
198}
199
200static int find_pm_qos_object_by_minor(int minor)
201{
202 int pm_qos_class;
203
204 for (pm_qos_class = 0;
205 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
206 if (minor ==
207 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
208 return pm_qos_class;
209 } 211 }
210 return -1;
211} 212}
212 213
213/** 214/**
@@ -218,11 +219,11 @@ static int find_pm_qos_object_by_minor(int minor)
218 */ 219 */
219int pm_qos_request(int pm_qos_class) 220int pm_qos_request(int pm_qos_class)
220{ 221{
221 return pm_qos_read_value(pm_qos_array[pm_qos_class]); 222 return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints);
222} 223}
223EXPORT_SYMBOL_GPL(pm_qos_request); 224EXPORT_SYMBOL_GPL(pm_qos_request);
224 225
225int pm_qos_request_active(struct pm_qos_request_list *req) 226int pm_qos_request_active(struct pm_qos_request *req)
226{ 227{
227 return req->pm_qos_class != 0; 228 return req->pm_qos_class != 0;
228} 229}
@@ -230,40 +231,36 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active);
230 231
231/** 232/**
232 * pm_qos_add_request - inserts new qos request into the list 233 * pm_qos_add_request - inserts new qos request into the list
233 * @dep: pointer to a preallocated handle 234 * @req: pointer to a preallocated handle
234 * @pm_qos_class: identifies which list of qos request to use 235 * @pm_qos_class: identifies which list of qos request to use
235 * @value: defines the qos request 236 * @value: defines the qos request
236 * 237 *
237 * This function inserts a new entry in the pm_qos_class list of requested qos 238 * This function inserts a new entry in the pm_qos_class list of requested qos
238 * performance characteristics. It recomputes the aggregate QoS expectations 239 * performance characteristics. It recomputes the aggregate QoS expectations
239 * for the pm_qos_class of parameters and initializes the pm_qos_request_list 240 * for the pm_qos_class of parameters and initializes the pm_qos_request
240 * handle. Caller needs to save this handle for later use in updates and 241 * handle. Caller needs to save this handle for later use in updates and
241 * removal. 242 * removal.
242 */ 243 */
243 244
244void pm_qos_add_request(struct pm_qos_request_list *dep, 245void pm_qos_add_request(struct pm_qos_request *req,
245 int pm_qos_class, s32 value) 246 int pm_qos_class, s32 value)
246{ 247{
247 struct pm_qos_object *o = pm_qos_array[pm_qos_class]; 248 if (!req) /*guard against callers passing in null */
248 int new_value; 249 return;
249 250
250 if (pm_qos_request_active(dep)) { 251 if (pm_qos_request_active(req)) {
251 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); 252 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
252 return; 253 return;
253 } 254 }
254 if (value == PM_QOS_DEFAULT_VALUE) 255 req->pm_qos_class = pm_qos_class;
255 new_value = o->default_value; 256 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
256 else 257 &req->node, PM_QOS_ADD_REQ, value);
257 new_value = value;
258 plist_node_init(&dep->list, new_value);
259 dep->pm_qos_class = pm_qos_class;
260 update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
261} 258}
262EXPORT_SYMBOL_GPL(pm_qos_add_request); 259EXPORT_SYMBOL_GPL(pm_qos_add_request);
263 260
264/** 261/**
265 * pm_qos_update_request - modifies an existing qos request 262 * pm_qos_update_request - modifies an existing qos request
266 * @pm_qos_req : handle to list element holding a pm_qos request to use 263 * @req : handle to list element holding a pm_qos request to use
267 * @value: defines the qos request 264 * @value: defines the qos request
268 * 265 *
269 * Updates an existing qos request for the pm_qos_class of parameters along 266 * Updates an existing qos request for the pm_qos_class of parameters along
@@ -271,56 +268,47 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request);
271 * 268 *
272 * Attempts are made to make this code callable on hot code paths. 269 * Attempts are made to make this code callable on hot code paths.
273 */ 270 */
274void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, 271void pm_qos_update_request(struct pm_qos_request *req,
275 s32 new_value) 272 s32 new_value)
276{ 273{
277 s32 temp; 274 if (!req) /*guard against callers passing in null */
278 struct pm_qos_object *o;
279
280 if (!pm_qos_req) /*guard against callers passing in null */
281 return; 275 return;
282 276
283 if (!pm_qos_request_active(pm_qos_req)) { 277 if (!pm_qos_request_active(req)) {
284 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); 278 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
285 return; 279 return;
286 } 280 }
287 281
288 o = pm_qos_array[pm_qos_req->pm_qos_class]; 282 if (new_value != req->node.prio)
289 283 pm_qos_update_target(
290 if (new_value == PM_QOS_DEFAULT_VALUE) 284 pm_qos_array[req->pm_qos_class]->constraints,
291 temp = o->default_value; 285 &req->node, PM_QOS_UPDATE_REQ, new_value);
292 else
293 temp = new_value;
294
295 if (temp != pm_qos_req->list.prio)
296 update_target(o, &pm_qos_req->list, 0, temp);
297} 286}
298EXPORT_SYMBOL_GPL(pm_qos_update_request); 287EXPORT_SYMBOL_GPL(pm_qos_update_request);
299 288
300/** 289/**
301 * pm_qos_remove_request - modifies an existing qos request 290 * pm_qos_remove_request - modifies an existing qos request
302 * @pm_qos_req: handle to request list element 291 * @req: handle to request list element
303 * 292 *
304 * Will remove pm qos request from the list of requests and 293 * Will remove pm qos request from the list of constraints and
305 * recompute the current target value for the pm_qos_class. Call this 294 * recompute the current target value for the pm_qos_class. Call this
306 * on slow code paths. 295 * on slow code paths.
307 */ 296 */
308void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) 297void pm_qos_remove_request(struct pm_qos_request *req)
309{ 298{
310 struct pm_qos_object *o; 299 if (!req) /*guard against callers passing in null */
311
312 if (pm_qos_req == NULL)
313 return; 300 return;
314 /* silent return to keep pcm code cleaner */ 301 /* silent return to keep pcm code cleaner */
315 302
316 if (!pm_qos_request_active(pm_qos_req)) { 303 if (!pm_qos_request_active(req)) {
317 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); 304 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
318 return; 305 return;
319 } 306 }
320 307
321 o = pm_qos_array[pm_qos_req->pm_qos_class]; 308 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
322 update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); 309 &req->node, PM_QOS_REMOVE_REQ,
323 memset(pm_qos_req, 0, sizeof(*pm_qos_req)); 310 PM_QOS_DEFAULT_VALUE);
311 memset(req, 0, sizeof(*req));
324} 312}
325EXPORT_SYMBOL_GPL(pm_qos_remove_request); 313EXPORT_SYMBOL_GPL(pm_qos_remove_request);
326 314
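For reference, a minimal sketch of how a kernel-side user of this API looks after the rework (hedged: the PM_QOS_CPU_DMA_LATENCY class, the 100 us value and the <linux/pm_qos.h> header name are illustrative assumptions, not part of this hunk):

    #include <linux/pm_qos.h>

    static struct pm_qos_request my_qos_req;

    static void my_driver_go_lowlatency(void)
    {
            /* Cap CPU DMA latency at 100 us while the device is active. */
            pm_qos_add_request(&my_qos_req, PM_QOS_CPU_DMA_LATENCY, 100);
    }

    static void my_driver_relax(void)
    {
            /* Keep the request but fall back to the class default. */
            pm_qos_update_request(&my_qos_req, PM_QOS_DEFAULT_VALUE);
    }

    static void my_driver_stop(void)
    {
            /* Drop the request; the class target value is recomputed. */
            pm_qos_remove_request(&my_qos_req);
    }

The request object is now a plain struct pm_qos_request embedded by the caller; the bookkeeping lives in the per-class constraints object updated through pm_qos_update_target().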
@@ -337,7 +325,8 @@ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
337 int retval; 325 int retval;
338 326
339 retval = blocking_notifier_chain_register( 327 retval = blocking_notifier_chain_register(
340 pm_qos_array[pm_qos_class]->notifiers, notifier); 328 pm_qos_array[pm_qos_class]->constraints->notifiers,
329 notifier);
341 330
342 return retval; 331 return retval;
343} 332}
@@ -356,34 +345,57 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
356 int retval; 345 int retval;
357 346
358 retval = blocking_notifier_chain_unregister( 347 retval = blocking_notifier_chain_unregister(
359 pm_qos_array[pm_qos_class]->notifiers, notifier); 348 pm_qos_array[pm_qos_class]->constraints->notifiers,
349 notifier);
360 350
361 return retval; 351 return retval;
362} 352}
363EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 353EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
364 354
355/* User space interface to PM QoS classes via misc devices */
356static int register_pm_qos_misc(struct pm_qos_object *qos)
357{
358 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
359 qos->pm_qos_power_miscdev.name = qos->name;
360 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
361
362 return misc_register(&qos->pm_qos_power_miscdev);
363}
364
365static int find_pm_qos_object_by_minor(int minor)
366{
367 int pm_qos_class;
368
369 for (pm_qos_class = 0;
370 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
371 if (minor ==
372 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
373 return pm_qos_class;
374 }
375 return -1;
376}
377
365static int pm_qos_power_open(struct inode *inode, struct file *filp) 378static int pm_qos_power_open(struct inode *inode, struct file *filp)
366{ 379{
367 long pm_qos_class; 380 long pm_qos_class;
368 381
369 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 382 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
370 if (pm_qos_class >= 0) { 383 if (pm_qos_class >= 0) {
371 struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); 384 struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
372 if (!req) 385 if (!req)
373 return -ENOMEM; 386 return -ENOMEM;
374 387
375 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); 388 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
376 filp->private_data = req; 389 filp->private_data = req;
377 390
378 if (filp->private_data) 391 return 0;
379 return 0;
380 } 392 }
381 return -EPERM; 393 return -EPERM;
382} 394}
383 395
384static int pm_qos_power_release(struct inode *inode, struct file *filp) 396static int pm_qos_power_release(struct inode *inode, struct file *filp)
385{ 397{
386 struct pm_qos_request_list *req; 398 struct pm_qos_request *req;
387 399
388 req = filp->private_data; 400 req = filp->private_data;
389 pm_qos_remove_request(req); 401 pm_qos_remove_request(req);
@@ -398,17 +410,15 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
398{ 410{
399 s32 value; 411 s32 value;
400 unsigned long flags; 412 unsigned long flags;
401 struct pm_qos_object *o; 413 struct pm_qos_request *req = filp->private_data;
402 struct pm_qos_request_list *pm_qos_req = filp->private_data;
403 414
404 if (!pm_qos_req) 415 if (!req)
405 return -EINVAL; 416 return -EINVAL;
406 if (!pm_qos_request_active(pm_qos_req)) 417 if (!pm_qos_request_active(req))
407 return -EINVAL; 418 return -EINVAL;
408 419
409 o = pm_qos_array[pm_qos_req->pm_qos_class];
410 spin_lock_irqsave(&pm_qos_lock, flags); 420 spin_lock_irqsave(&pm_qos_lock, flags);
411 value = pm_qos_get_value(o); 421 value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints);
412 spin_unlock_irqrestore(&pm_qos_lock, flags); 422 spin_unlock_irqrestore(&pm_qos_lock, flags);
413 423
414 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); 424 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
@@ -418,7 +428,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
418 size_t count, loff_t *f_pos) 428 size_t count, loff_t *f_pos)
419{ 429{
420 s32 value; 430 s32 value;
421 struct pm_qos_request_list *pm_qos_req; 431 struct pm_qos_request *req;
422 432
423 if (count == sizeof(s32)) { 433 if (count == sizeof(s32)) {
424 if (copy_from_user(&value, buf, sizeof(s32))) 434 if (copy_from_user(&value, buf, sizeof(s32)))
@@ -449,8 +459,8 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
449 return -EINVAL; 459 return -EINVAL;
450 } 460 }
451 461
452 pm_qos_req = filp->private_data; 462 req = filp->private_data;
453 pm_qos_update_request(pm_qos_req, value); 463 pm_qos_update_request(req, value);
454 464
455 return count; 465 return count;
456} 466}
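The misc-device interface keeps its shape: each class is exposed as a character device that accepts a binary s32, and the request lives for as long as the file descriptor stays open. A user-space sketch (the /dev/cpu_dma_latency node for the CPU DMA latency class is assumed here, not shown in this hunk):

    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>

    int main(void)
    {
            int32_t us = 100;       /* request no more than 100 us of latency */
            int fd = open("/dev/cpu_dma_latency", O_RDWR);

            if (fd < 0)
                    return 1;
            if (write(fd, &us, sizeof(us)) != sizeof(us)) {
                    close(fd);
                    return 1;
            }
            pause();                /* hold the request; it is dropped on close() */
            close(fd);
            return 0;
    }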
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 06efa54f93d6..1cf88900ec4f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -858,6 +858,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
858 PageReserved(page)) 858 PageReserved(page))
859 return NULL; 859 return NULL;
860 860
861 if (page_is_guard(page))
862 return NULL;
863
861 return page; 864 return page;
862} 865}
863 866
@@ -920,6 +923,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
920 && (!kernel_page_present(page) || pfn_is_nosave(pfn))) 923 && (!kernel_page_present(page) || pfn_is_nosave(pfn)))
921 return NULL; 924 return NULL;
922 925
926 if (page_is_guard(page))
927 return NULL;
928
923 return page; 929 return page;
924} 930}
925 931
@@ -1339,6 +1345,9 @@ int hibernate_preallocate_memory(void)
1339 count += highmem; 1345 count += highmem;
1340 count -= totalreserve_pages; 1346 count -= totalreserve_pages;
1341 1347
1348 /* Add number of pages required for page keys (s390 only). */
1349 size += page_key_additional_pages(saveable);
1350
1342 /* Compute the maximum number of saveable pages to leave in memory. */ 1351 /* Compute the maximum number of saveable pages to leave in memory. */
1343 max_size = (count - (size + PAGES_FOR_IO)) / 2 1352 max_size = (count - (size + PAGES_FOR_IO)) / 2
1344 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); 1353 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
@@ -1662,6 +1671,8 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1662 buf[j] = memory_bm_next_pfn(bm); 1671 buf[j] = memory_bm_next_pfn(bm);
1663 if (unlikely(buf[j] == BM_END_OF_MAP)) 1672 if (unlikely(buf[j] == BM_END_OF_MAP))
1664 break; 1673 break;
1674 /* Save page key for data page (s390 only). */
1675 page_key_read(buf + j);
1665 } 1676 }
1666} 1677}
1667 1678
@@ -1821,6 +1832,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1821 if (unlikely(buf[j] == BM_END_OF_MAP)) 1832 if (unlikely(buf[j] == BM_END_OF_MAP))
1822 break; 1833 break;
1823 1834
1835 /* Extract and buffer page key for data page (s390 only). */
1836 page_key_memorize(buf + j);
1837
1824 if (memory_bm_pfn_present(bm, buf[j])) 1838 if (memory_bm_pfn_present(bm, buf[j]))
1825 memory_bm_set_bit(bm, buf[j]); 1839 memory_bm_set_bit(bm, buf[j]);
1826 else 1840 else
@@ -2223,6 +2237,11 @@ int snapshot_write_next(struct snapshot_handle *handle)
2223 if (error) 2237 if (error)
2224 return error; 2238 return error;
2225 2239
2240 /* Allocate buffer for page keys. */
2241 error = page_key_alloc(nr_copy_pages);
2242 if (error)
2243 return error;
2244
2226 } else if (handle->cur <= nr_meta_pages + 1) { 2245 } else if (handle->cur <= nr_meta_pages + 1) {
2227 error = unpack_orig_pfns(buffer, &copy_bm); 2246 error = unpack_orig_pfns(buffer, &copy_bm);
2228 if (error) 2247 if (error)
@@ -2243,6 +2262,8 @@ int snapshot_write_next(struct snapshot_handle *handle)
2243 } 2262 }
2244 } else { 2263 } else {
2245 copy_last_highmem_page(); 2264 copy_last_highmem_page();
2265 /* Restore page key for data page (s390 only). */
2266 page_key_write(handle->buffer);
2246 handle->buffer = get_buffer(&orig_bm, &ca); 2267 handle->buffer = get_buffer(&orig_bm, &ca);
2247 if (IS_ERR(handle->buffer)) 2268 if (IS_ERR(handle->buffer))
2248 return PTR_ERR(handle->buffer); 2269 return PTR_ERR(handle->buffer);
@@ -2264,6 +2285,9 @@ int snapshot_write_next(struct snapshot_handle *handle)
2264void snapshot_write_finalize(struct snapshot_handle *handle) 2285void snapshot_write_finalize(struct snapshot_handle *handle)
2265{ 2286{
2266 copy_last_highmem_page(); 2287 copy_last_highmem_page();
2288 /* Restore page key for data page (s390 only). */
2289 page_key_write(handle->buffer);
2290 page_key_free();
2267 /* Free only if we have loaded the image entirely */ 2291 /* Free only if we have loaded the image entirely */
2268 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { 2292 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2269 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2293 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index b6b71ad2208f..4fd51beed879 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -12,6 +12,7 @@
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/errno.h> 13#include <linux/errno.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/kmod.h>
15#include <linux/console.h> 16#include <linux/console.h>
16#include <linux/cpu.h> 17#include <linux/cpu.h>
17#include <linux/syscalls.h> 18#include <linux/syscalls.h>
@@ -21,6 +22,7 @@
21#include <linux/list.h> 22#include <linux/list.h>
22#include <linux/mm.h> 23#include <linux/mm.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/export.h>
24#include <linux/suspend.h> 26#include <linux/suspend.h>
25#include <linux/syscore_ops.h> 27#include <linux/syscore_ops.h>
26#include <trace/events/power.h> 28#include <trace/events/power.h>
@@ -40,9 +42,9 @@ static const struct platform_suspend_ops *suspend_ops;
40 */ 42 */
41void suspend_set_ops(const struct platform_suspend_ops *ops) 43void suspend_set_ops(const struct platform_suspend_ops *ops)
42{ 44{
43 mutex_lock(&pm_mutex); 45 lock_system_sleep();
44 suspend_ops = ops; 46 suspend_ops = ops;
45 mutex_unlock(&pm_mutex); 47 unlock_system_sleep();
46} 48}
47EXPORT_SYMBOL_GPL(suspend_set_ops); 49EXPORT_SYMBOL_GPL(suspend_set_ops);
48 50
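suspend_set_ops() is how a platform plugs in its low-level suspend callbacks; only the locking around the assignment changes here. A minimal registration sketch (the board_* names are placeholders, and only the callbacks needed for suspend-to-RAM are shown):

    #include <linux/init.h>
    #include <linux/suspend.h>

    static int board_suspend_enter(suspend_state_t state)
    {
            /* Program the SoC/firmware to enter the requested sleep state. */
            return 0;
    }

    static const struct platform_suspend_ops board_suspend_ops = {
            .valid = suspend_valid_only_mem,
            .enter = board_suspend_enter,
    };

    static int __init board_pm_init(void)
    {
            suspend_set_ops(&board_suspend_ops);
            return 0;
    }
    late_initcall(board_pm_init);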
@@ -107,7 +109,8 @@ static int suspend_prepare(void)
107 if (!error) 109 if (!error)
108 return 0; 110 return 0;
109 111
110 suspend_thaw_processes(); 112 suspend_stats.failed_freeze++;
113 dpm_save_failed_step(SUSPEND_FREEZE);
111 usermodehelper_enable(); 114 usermodehelper_enable();
112 Finish: 115 Finish:
113 pm_notifier_call_chain(PM_POST_SUSPEND); 116 pm_notifier_call_chain(PM_POST_SUSPEND);
@@ -315,8 +318,16 @@ int enter_state(suspend_state_t state)
315 */ 318 */
316int pm_suspend(suspend_state_t state) 319int pm_suspend(suspend_state_t state)
317{ 320{
318 if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) 321 int ret;
319 return enter_state(state); 322 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) {
323 ret = enter_state(state);
324 if (ret) {
325 suspend_stats.fail++;
326 dpm_save_failed_errno(ret);
327 } else
328 suspend_stats.success++;
329 return ret;
330 }
320 return -EINVAL; 331 return -EINVAL;
321} 332}
322EXPORT_SYMBOL(pm_suspend); 333EXPORT_SYMBOL(pm_suspend);
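pm_suspend() now also feeds the suspend_stats counters, and note that the range check tightens from <= PM_SUSPEND_MAX to < PM_SUSPEND_MAX. Callers can keep relying on the return value alone; a hedged sketch of an in-kernel caller (the usual entry point is the sysfs 'state' file, so a direct call like this is purely illustrative):

    #include <linux/kernel.h>
    #include <linux/suspend.h>

    static int try_suspend_to_ram(void)
    {
            int error = pm_suspend(PM_SUSPEND_MEM);

            if (error)
                    pr_err("PM: suspend to RAM failed: %d\n", error);
            return error;
    }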
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7c97c3a0eee3..3739ecced085 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -18,7 +18,6 @@
18#include <linux/bitops.h> 18#include <linux/bitops.h>
19#include <linux/genhd.h> 19#include <linux/genhd.h>
20#include <linux/device.h> 20#include <linux/device.h>
21#include <linux/buffer_head.h>
22#include <linux/bio.h> 21#include <linux/bio.h>
23#include <linux/blkdev.h> 22#include <linux/blkdev.h>
24#include <linux/swap.h> 23#include <linux/swap.h>
@@ -27,6 +26,10 @@
27#include <linux/slab.h> 26#include <linux/slab.h>
28#include <linux/lzo.h> 27#include <linux/lzo.h>
29#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
29#include <linux/cpumask.h>
30#include <linux/atomic.h>
31#include <linux/kthread.h>
32#include <linux/crc32.h>
30 33
31#include "power.h" 34#include "power.h"
32 35
@@ -43,8 +46,7 @@
43 * allocated and populated one at a time, so we only need one memory 46 * allocated and populated one at a time, so we only need one memory
44 * page to set up the entire structure. 47 * page to set up the entire structure.
45 * 48 *
46 * During resume we also only need to use one swap_map_page structure 49 * During resume we pick up all swap_map_page structures into a list.
47 * at a time.
48 */ 50 */
49 51
50#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) 52#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
@@ -54,6 +56,11 @@ struct swap_map_page {
54 sector_t next_swap; 56 sector_t next_swap;
55}; 57};
56 58
59struct swap_map_page_list {
60 struct swap_map_page *map;
61 struct swap_map_page_list *next;
62};
63
57/** 64/**
58 * The swap_map_handle structure is used for handling swap in 65 * The swap_map_handle structure is used for handling swap in
59 * a file-like way 66 * a file-like way
@@ -61,13 +68,18 @@ struct swap_map_page {
61 68
62struct swap_map_handle { 69struct swap_map_handle {
63 struct swap_map_page *cur; 70 struct swap_map_page *cur;
71 struct swap_map_page_list *maps;
64 sector_t cur_swap; 72 sector_t cur_swap;
65 sector_t first_sector; 73 sector_t first_sector;
66 unsigned int k; 74 unsigned int k;
75 unsigned long nr_free_pages, written;
76 u32 crc32;
67}; 77};
68 78
69struct swsusp_header { 79struct swsusp_header {
70 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; 80 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) -
81 sizeof(u32)];
82 u32 crc32;
71 sector_t image; 83 sector_t image;
72 unsigned int flags; /* Flags to pass to the "boot" kernel */ 84 unsigned int flags; /* Flags to pass to the "boot" kernel */
73 char orig_sig[10]; 85 char orig_sig[10];
@@ -199,6 +211,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
199 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); 211 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
200 swsusp_header->image = handle->first_sector; 212 swsusp_header->image = handle->first_sector;
201 swsusp_header->flags = flags; 213 swsusp_header->flags = flags;
214 if (flags & SF_CRC32_MODE)
215 swsusp_header->crc32 = handle->crc32;
202 error = hib_bio_write_page(swsusp_resume_block, 216 error = hib_bio_write_page(swsusp_resume_block,
203 swsusp_header, NULL); 217 swsusp_header, NULL);
204 } else { 218 } else {
@@ -245,6 +259,7 @@ static int swsusp_swap_check(void)
245static int write_page(void *buf, sector_t offset, struct bio **bio_chain) 259static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
246{ 260{
247 void *src; 261 void *src;
262 int ret;
248 263
249 if (!offset) 264 if (!offset)
250 return -ENOSPC; 265 return -ENOSPC;
@@ -254,9 +269,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
254 if (src) { 269 if (src) {
255 copy_page(src, buf); 270 copy_page(src, buf);
256 } else { 271 } else {
257 WARN_ON_ONCE(1); 272 ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
258 bio_chain = NULL; /* Go synchronous */ 273 if (ret)
259 src = buf; 274 return ret;
275 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
276 if (src) {
277 copy_page(src, buf);
278 } else {
279 WARN_ON_ONCE(1);
280 bio_chain = NULL; /* Go synchronous */
281 src = buf;
282 }
260 } 283 }
261 } else { 284 } else {
262 src = buf; 285 src = buf;
@@ -293,6 +316,8 @@ static int get_swap_writer(struct swap_map_handle *handle)
293 goto err_rel; 316 goto err_rel;
294 } 317 }
295 handle->k = 0; 318 handle->k = 0;
319 handle->nr_free_pages = nr_free_pages() >> 1;
320 handle->written = 0;
296 handle->first_sector = handle->cur_swap; 321 handle->first_sector = handle->cur_swap;
297 return 0; 322 return 0;
298err_rel: 323err_rel:
@@ -316,20 +341,23 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
316 return error; 341 return error;
317 handle->cur->entries[handle->k++] = offset; 342 handle->cur->entries[handle->k++] = offset;
318 if (handle->k >= MAP_PAGE_ENTRIES) { 343 if (handle->k >= MAP_PAGE_ENTRIES) {
319 error = hib_wait_on_bio_chain(bio_chain);
320 if (error)
321 goto out;
322 offset = alloc_swapdev_block(root_swap); 344 offset = alloc_swapdev_block(root_swap);
323 if (!offset) 345 if (!offset)
324 return -ENOSPC; 346 return -ENOSPC;
325 handle->cur->next_swap = offset; 347 handle->cur->next_swap = offset;
326 error = write_page(handle->cur, handle->cur_swap, NULL); 348 error = write_page(handle->cur, handle->cur_swap, bio_chain);
327 if (error) 349 if (error)
328 goto out; 350 goto out;
329 clear_page(handle->cur); 351 clear_page(handle->cur);
330 handle->cur_swap = offset; 352 handle->cur_swap = offset;
331 handle->k = 0; 353 handle->k = 0;
332 } 354 }
355 if (bio_chain && ++handle->written > handle->nr_free_pages) {
356 error = hib_wait_on_bio_chain(bio_chain);
357 if (error)
358 goto out;
359 handle->written = 0;
360 }
333 out: 361 out:
334 return error; 362 return error;
335} 363}
@@ -372,6 +400,13 @@ static int swap_writer_finish(struct swap_map_handle *handle,
372 LZO_HEADER, PAGE_SIZE) 400 LZO_HEADER, PAGE_SIZE)
373#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) 401#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
374 402
403/* Maximum number of threads for compression/decompression. */
404#define LZO_THREADS 3
405
406/* Maximum number of pages for read buffering. */
407#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8)
408
409
375/** 410/**
376 * save_image - save the suspend image data 411 * save_image - save the suspend image data
377 */ 412 */
@@ -419,6 +454,92 @@ static int save_image(struct swap_map_handle *handle,
419 return ret; 454 return ret;
420} 455}
421 456
457/**
458 * Structure used for CRC32.
459 */
460struct crc_data {
461 struct task_struct *thr; /* thread */
462 atomic_t ready; /* ready to start flag */
463 atomic_t stop; /* ready to stop flag */
464 unsigned run_threads; /* nr current threads */
465 wait_queue_head_t go; /* start crc update */
466 wait_queue_head_t done; /* crc update done */
467 u32 *crc32; /* points to handle's crc32 */
468 size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */
469 unsigned char *unc[LZO_THREADS]; /* uncompressed data */
470};
471
472/**
473 * CRC32 update function that runs in its own thread.
474 */
475static int crc32_threadfn(void *data)
476{
477 struct crc_data *d = data;
478 unsigned i;
479
480 while (1) {
481 wait_event(d->go, atomic_read(&d->ready) ||
482 kthread_should_stop());
483 if (kthread_should_stop()) {
484 d->thr = NULL;
485 atomic_set(&d->stop, 1);
486 wake_up(&d->done);
487 break;
488 }
489 atomic_set(&d->ready, 0);
490
491 for (i = 0; i < d->run_threads; i++)
492 *d->crc32 = crc32_le(*d->crc32,
493 d->unc[i], *d->unc_len[i]);
494 atomic_set(&d->stop, 1);
495 wake_up(&d->done);
496 }
497 return 0;
498}
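crc32_threadfn() folds each worker's uncompressed buffer into one running checksum with crc32_le() from <linux/crc32.h>, newly included above. The accumulation pattern in isolation, as a sketch:

    #include <linux/crc32.h>

    /* Fold a set of buffers into a single running CRC32 (seed 0, as above). */
    static u32 image_crc32(unsigned char **bufs, size_t *lens, unsigned int n)
    {
            u32 crc = 0;
            unsigned int i;

            for (i = 0; i < n; i++)
                    crc = crc32_le(crc, bufs[i], lens[i]);
            return crc;
    }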
499/**
500 * Structure used for LZO data compression.
501 */
502struct cmp_data {
503 struct task_struct *thr; /* thread */
504 atomic_t ready; /* ready to start flag */
505 atomic_t stop; /* ready to stop flag */
506 int ret; /* return code */
507 wait_queue_head_t go; /* start compression */
508 wait_queue_head_t done; /* compression done */
509 size_t unc_len; /* uncompressed length */
510 size_t cmp_len; /* compressed length */
511 unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
512 unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
513 unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
514};
515
516/**
517 * Compression function that runs in its own thread.
518 */
519static int lzo_compress_threadfn(void *data)
520{
521 struct cmp_data *d = data;
522
523 while (1) {
524 wait_event(d->go, atomic_read(&d->ready) ||
525 kthread_should_stop());
526 if (kthread_should_stop()) {
527 d->thr = NULL;
528 d->ret = -1;
529 atomic_set(&d->stop, 1);
530 wake_up(&d->done);
531 break;
532 }
533 atomic_set(&d->ready, 0);
534
535 d->ret = lzo1x_1_compress(d->unc, d->unc_len,
536 d->cmp + LZO_HEADER, &d->cmp_len,
537 d->wrk);
538 atomic_set(&d->stop, 1);
539 wake_up(&d->done);
540 }
541 return 0;
542}
422 543
423/** 544/**
424 * save_image_lzo - Save the suspend image data compressed with LZO. 545 * save_image_lzo - Save the suspend image data compressed with LZO.
@@ -437,42 +558,93 @@ static int save_image_lzo(struct swap_map_handle *handle,
437 struct bio *bio; 558 struct bio *bio;
438 struct timeval start; 559 struct timeval start;
439 struct timeval stop; 560 struct timeval stop;
440 size_t off, unc_len, cmp_len; 561 size_t off;
441 unsigned char *unc, *cmp, *wrk, *page; 562 unsigned thr, run_threads, nr_threads;
563 unsigned char *page = NULL;
564 struct cmp_data *data = NULL;
565 struct crc_data *crc = NULL;
566
567 /*
568 * We'll limit the number of threads for compression to limit memory
569 * footprint.
570 */
571 nr_threads = num_online_cpus() - 1;
572 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
442 573
443 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 574 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
444 if (!page) { 575 if (!page) {
445 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 576 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
446 return -ENOMEM; 577 ret = -ENOMEM;
578 goto out_clean;
447 } 579 }
448 580
449 wrk = vmalloc(LZO1X_1_MEM_COMPRESS); 581 data = vmalloc(sizeof(*data) * nr_threads);
450 if (!wrk) { 582 if (!data) {
451 printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); 583 printk(KERN_ERR "PM: Failed to allocate LZO data\n");
452 free_page((unsigned long)page); 584 ret = -ENOMEM;
453 return -ENOMEM; 585 goto out_clean;
454 } 586 }
587 for (thr = 0; thr < nr_threads; thr++)
588 memset(&data[thr], 0, offsetof(struct cmp_data, go));
455 589
456 unc = vmalloc(LZO_UNC_SIZE); 590 crc = kmalloc(sizeof(*crc), GFP_KERNEL);
457 if (!unc) { 591 if (!crc) {
458 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); 592 printk(KERN_ERR "PM: Failed to allocate crc\n");
459 vfree(wrk); 593 ret = -ENOMEM;
460 free_page((unsigned long)page); 594 goto out_clean;
461 return -ENOMEM; 595 }
596 memset(crc, 0, offsetof(struct crc_data, go));
597
598 /*
599 * Start the compression threads.
600 */
601 for (thr = 0; thr < nr_threads; thr++) {
602 init_waitqueue_head(&data[thr].go);
603 init_waitqueue_head(&data[thr].done);
604
605 data[thr].thr = kthread_run(lzo_compress_threadfn,
606 &data[thr],
607 "image_compress/%u", thr);
608 if (IS_ERR(data[thr].thr)) {
609 data[thr].thr = NULL;
610 printk(KERN_ERR
611 "PM: Cannot start compression threads\n");
612 ret = -ENOMEM;
613 goto out_clean;
614 }
462 } 615 }
463 616
464 cmp = vmalloc(LZO_CMP_SIZE); 617 /*
465 if (!cmp) { 618 * Adjust number of free pages after all allocations have been done.
466 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); 619 * We don't want to run out of pages when writing.
467 vfree(unc); 620 */
468 vfree(wrk); 621 handle->nr_free_pages = nr_free_pages() >> 1;
469 free_page((unsigned long)page); 622
470 return -ENOMEM; 623 /*
624 * Start the CRC32 thread.
625 */
626 init_waitqueue_head(&crc->go);
627 init_waitqueue_head(&crc->done);
628
629 handle->crc32 = 0;
630 crc->crc32 = &handle->crc32;
631 for (thr = 0; thr < nr_threads; thr++) {
632 crc->unc[thr] = data[thr].unc;
633 crc->unc_len[thr] = &data[thr].unc_len;
634 }
635
636 crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
637 if (IS_ERR(crc->thr)) {
638 crc->thr = NULL;
639 printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
640 ret = -ENOMEM;
641 goto out_clean;
471 } 642 }
472 643
473 printk(KERN_INFO 644 printk(KERN_INFO
645 "PM: Using %u thread(s) for compression.\n"
474 "PM: Compressing and saving image data (%u pages) ... ", 646 "PM: Compressing and saving image data (%u pages) ... ",
475 nr_to_write); 647 nr_threads, nr_to_write);
476 m = nr_to_write / 100; 648 m = nr_to_write / 100;
477 if (!m) 649 if (!m)
478 m = 1; 650 m = 1;
@@ -480,55 +652,83 @@ static int save_image_lzo(struct swap_map_handle *handle,
480 bio = NULL; 652 bio = NULL;
481 do_gettimeofday(&start); 653 do_gettimeofday(&start);
482 for (;;) { 654 for (;;) {
483 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { 655 for (thr = 0; thr < nr_threads; thr++) {
484 ret = snapshot_read_next(snapshot); 656 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
485 if (ret < 0) 657 ret = snapshot_read_next(snapshot);
486 goto out_finish; 658 if (ret < 0)
487 659 goto out_finish;
488 if (!ret) 660
661 if (!ret)
662 break;
663
664 memcpy(data[thr].unc + off,
665 data_of(*snapshot), PAGE_SIZE);
666
667 if (!(nr_pages % m))
668 printk(KERN_CONT "\b\b\b\b%3d%%",
669 nr_pages / m);
670 nr_pages++;
671 }
672 if (!off)
489 break; 673 break;
490 674
491 memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); 675 data[thr].unc_len = off;
492 676
493 if (!(nr_pages % m)) 677 atomic_set(&data[thr].ready, 1);
494 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 678 wake_up(&data[thr].go);
495 nr_pages++;
496 } 679 }
497 680
498 if (!off) 681 if (!thr)
499 break; 682 break;
500 683
501 unc_len = off; 684 crc->run_threads = thr;
502 ret = lzo1x_1_compress(unc, unc_len, 685 atomic_set(&crc->ready, 1);
503 cmp + LZO_HEADER, &cmp_len, wrk); 686 wake_up(&crc->go);
504 if (ret < 0) {
505 printk(KERN_ERR "PM: LZO compression failed\n");
506 break;
507 }
508 687
509 if (unlikely(!cmp_len || 688 for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
510 cmp_len > lzo1x_worst_compress(unc_len))) { 689 wait_event(data[thr].done,
511 printk(KERN_ERR "PM: Invalid LZO compressed length\n"); 690 atomic_read(&data[thr].stop));
512 ret = -1; 691 atomic_set(&data[thr].stop, 0);
513 break;
514 }
515 692
516 *(size_t *)cmp = cmp_len; 693 ret = data[thr].ret;
517 694
518 /* 695 if (ret < 0) {
519 * Given we are writing one page at a time to disk, we copy 696 printk(KERN_ERR "PM: LZO compression failed\n");
520 * that much from the buffer, although the last bit will likely 697 goto out_finish;
521 * be smaller than full page. This is OK - we saved the length 698 }
522 * of the compressed data, so any garbage at the end will be
523 * discarded when we read it.
524 */
525 for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
526 memcpy(page, cmp + off, PAGE_SIZE);
527 699
528 ret = swap_write_page(handle, page, &bio); 700 if (unlikely(!data[thr].cmp_len ||
529 if (ret) 701 data[thr].cmp_len >
702 lzo1x_worst_compress(data[thr].unc_len))) {
703 printk(KERN_ERR
704 "PM: Invalid LZO compressed length\n");
705 ret = -1;
530 goto out_finish; 706 goto out_finish;
707 }
708
709 *(size_t *)data[thr].cmp = data[thr].cmp_len;
710
711 /*
712 * Given we are writing one page at a time to disk, we
713 * copy that much from the buffer, although the last
714 * bit will likely be smaller than full page. This is
715 * OK - we saved the length of the compressed data, so
716 * any garbage at the end will be discarded when we
717 * read it.
718 */
719 for (off = 0;
720 off < LZO_HEADER + data[thr].cmp_len;
721 off += PAGE_SIZE) {
722 memcpy(page, data[thr].cmp + off, PAGE_SIZE);
723
724 ret = swap_write_page(handle, page, &bio);
725 if (ret)
726 goto out_finish;
727 }
531 } 728 }
729
730 wait_event(crc->done, atomic_read(&crc->stop));
731 atomic_set(&crc->stop, 0);
532 } 732 }
533 733
534out_finish: 734out_finish:
@@ -536,16 +736,25 @@ out_finish:
536 do_gettimeofday(&stop); 736 do_gettimeofday(&stop);
537 if (!ret) 737 if (!ret)
538 ret = err2; 738 ret = err2;
539 if (!ret) 739 if (!ret) {
540 printk(KERN_CONT "\b\b\b\bdone\n"); 740 printk(KERN_CONT "\b\b\b\bdone\n");
541 else 741 } else {
542 printk(KERN_CONT "\n"); 742 printk(KERN_CONT "\n");
743 }
543 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 744 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
544 745out_clean:
545 vfree(cmp); 746 if (crc) {
546 vfree(unc); 747 if (crc->thr)
547 vfree(wrk); 748 kthread_stop(crc->thr);
548 free_page((unsigned long)page); 749 kfree(crc);
750 }
751 if (data) {
752 for (thr = 0; thr < nr_threads; thr++)
753 if (data[thr].thr)
754 kthread_stop(data[thr].thr);
755 vfree(data);
756 }
757 if (page) free_page((unsigned long)page);
549 758
550 return ret; 759 return ret;
551} 760}
@@ -625,8 +834,15 @@ out_finish:
625 834
626static void release_swap_reader(struct swap_map_handle *handle) 835static void release_swap_reader(struct swap_map_handle *handle)
627{ 836{
628 if (handle->cur) 837 struct swap_map_page_list *tmp;
629 free_page((unsigned long)handle->cur); 838
839 while (handle->maps) {
840 if (handle->maps->map)
841 free_page((unsigned long)handle->maps->map);
842 tmp = handle->maps;
843 handle->maps = handle->maps->next;
844 kfree(tmp);
845 }
630 handle->cur = NULL; 846 handle->cur = NULL;
631} 847}
632 848
@@ -634,22 +850,46 @@ static int get_swap_reader(struct swap_map_handle *handle,
634 unsigned int *flags_p) 850 unsigned int *flags_p)
635{ 851{
636 int error; 852 int error;
853 struct swap_map_page_list *tmp, *last;
854 sector_t offset;
637 855
638 *flags_p = swsusp_header->flags; 856 *flags_p = swsusp_header->flags;
639 857
640 if (!swsusp_header->image) /* how can this happen? */ 858 if (!swsusp_header->image) /* how can this happen? */
641 return -EINVAL; 859 return -EINVAL;
642 860
643 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); 861 handle->cur = NULL;
644 if (!handle->cur) 862 last = handle->maps = NULL;
645 return -ENOMEM; 863 offset = swsusp_header->image;
864 while (offset) {
865 tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL);
866 if (!tmp) {
867 release_swap_reader(handle);
868 return -ENOMEM;
869 }
870 memset(tmp, 0, sizeof(*tmp));
871 if (!handle->maps)
872 handle->maps = tmp;
873 if (last)
874 last->next = tmp;
875 last = tmp;
876
877 tmp->map = (struct swap_map_page *)
878 __get_free_page(__GFP_WAIT | __GFP_HIGH);
879 if (!tmp->map) {
880 release_swap_reader(handle);
881 return -ENOMEM;
882 }
646 883
647 error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL); 884 error = hib_bio_read_page(offset, tmp->map, NULL);
648 if (error) { 885 if (error) {
649 release_swap_reader(handle); 886 release_swap_reader(handle);
650 return error; 887 return error;
888 }
889 offset = tmp->map->next_swap;
651 } 890 }
652 handle->k = 0; 891 handle->k = 0;
892 handle->cur = handle->maps->map;
653 return 0; 893 return 0;
654} 894}
655 895
@@ -658,6 +898,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
658{ 898{
659 sector_t offset; 899 sector_t offset;
660 int error; 900 int error;
901 struct swap_map_page_list *tmp;
661 902
662 if (!handle->cur) 903 if (!handle->cur)
663 return -EINVAL; 904 return -EINVAL;
@@ -668,13 +909,15 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
668 if (error) 909 if (error)
669 return error; 910 return error;
670 if (++handle->k >= MAP_PAGE_ENTRIES) { 911 if (++handle->k >= MAP_PAGE_ENTRIES) {
671 error = hib_wait_on_bio_chain(bio_chain);
672 handle->k = 0; 912 handle->k = 0;
673 offset = handle->cur->next_swap; 913 free_page((unsigned long)handle->maps->map);
674 if (!offset) 914 tmp = handle->maps;
915 handle->maps = handle->maps->next;
916 kfree(tmp);
917 if (!handle->maps)
675 release_swap_reader(handle); 918 release_swap_reader(handle);
676 else if (!error) 919 else
677 error = hib_bio_read_page(offset, handle->cur, NULL); 920 handle->cur = handle->maps->map;
678 } 921 }
679 return error; 922 return error;
680} 923}
@@ -697,7 +940,7 @@ static int load_image(struct swap_map_handle *handle,
697 unsigned int nr_to_read) 940 unsigned int nr_to_read)
698{ 941{
699 unsigned int m; 942 unsigned int m;
700 int error = 0; 943 int ret = 0;
701 struct timeval start; 944 struct timeval start;
702 struct timeval stop; 945 struct timeval stop;
703 struct bio *bio; 946 struct bio *bio;
@@ -713,15 +956,15 @@ static int load_image(struct swap_map_handle *handle,
713 bio = NULL; 956 bio = NULL;
714 do_gettimeofday(&start); 957 do_gettimeofday(&start);
715 for ( ; ; ) { 958 for ( ; ; ) {
716 error = snapshot_write_next(snapshot); 959 ret = snapshot_write_next(snapshot);
717 if (error <= 0) 960 if (ret <= 0)
718 break; 961 break;
719 error = swap_read_page(handle, data_of(*snapshot), &bio); 962 ret = swap_read_page(handle, data_of(*snapshot), &bio);
720 if (error) 963 if (ret)
721 break; 964 break;
722 if (snapshot->sync_read) 965 if (snapshot->sync_read)
723 error = hib_wait_on_bio_chain(&bio); 966 ret = hib_wait_on_bio_chain(&bio);
724 if (error) 967 if (ret)
725 break; 968 break;
726 if (!(nr_pages % m)) 969 if (!(nr_pages % m))
727 printk("\b\b\b\b%3d%%", nr_pages / m); 970 printk("\b\b\b\b%3d%%", nr_pages / m);
@@ -729,17 +972,61 @@ static int load_image(struct swap_map_handle *handle,
729 } 972 }
730 err2 = hib_wait_on_bio_chain(&bio); 973 err2 = hib_wait_on_bio_chain(&bio);
731 do_gettimeofday(&stop); 974 do_gettimeofday(&stop);
732 if (!error) 975 if (!ret)
733 error = err2; 976 ret = err2;
734 if (!error) { 977 if (!ret) {
735 printk("\b\b\b\bdone\n"); 978 printk("\b\b\b\bdone\n");
736 snapshot_write_finalize(snapshot); 979 snapshot_write_finalize(snapshot);
737 if (!snapshot_image_loaded(snapshot)) 980 if (!snapshot_image_loaded(snapshot))
738 error = -ENODATA; 981 ret = -ENODATA;
739 } else 982 } else
740 printk("\n"); 983 printk("\n");
741 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 984 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
742 return error; 985 return ret;
986}
987
988/**
989 * Structure used for LZO data decompression.
990 */
991struct dec_data {
992 struct task_struct *thr; /* thread */
993 atomic_t ready; /* ready to start flag */
994 atomic_t stop; /* ready to stop flag */
995 int ret; /* return code */
996 wait_queue_head_t go; /* start decompression */
997 wait_queue_head_t done; /* decompression done */
998 size_t unc_len; /* uncompressed length */
999 size_t cmp_len; /* compressed length */
1000 unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
1001 unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
1002};
1003
1004/**
1005 * Decompression function that runs in its own thread.
1006 */
1007static int lzo_decompress_threadfn(void *data)
1008{
1009 struct dec_data *d = data;
1010
1011 while (1) {
1012 wait_event(d->go, atomic_read(&d->ready) ||
1013 kthread_should_stop());
1014 if (kthread_should_stop()) {
1015 d->thr = NULL;
1016 d->ret = -1;
1017 atomic_set(&d->stop, 1);
1018 wake_up(&d->done);
1019 break;
1020 }
1021 atomic_set(&d->ready, 0);
1022
1023 d->unc_len = LZO_UNC_SIZE;
1024 d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
1025 d->unc, &d->unc_len);
1026 atomic_set(&d->stop, 1);
1027 wake_up(&d->done);
1028 }
1029 return 0;
743} 1030}
744 1031
745/** 1032/**
@@ -753,50 +1040,120 @@ static int load_image_lzo(struct swap_map_handle *handle,
753 unsigned int nr_to_read) 1040 unsigned int nr_to_read)
754{ 1041{
755 unsigned int m; 1042 unsigned int m;
756 int error = 0; 1043 int ret = 0;
1044 int eof = 0;
757 struct bio *bio; 1045 struct bio *bio;
758 struct timeval start; 1046 struct timeval start;
759 struct timeval stop; 1047 struct timeval stop;
760 unsigned nr_pages; 1048 unsigned nr_pages;
761 size_t i, off, unc_len, cmp_len; 1049 size_t off;
762 unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; 1050 unsigned i, thr, run_threads, nr_threads;
763 1051 unsigned ring = 0, pg = 0, ring_size = 0,
764 for (i = 0; i < LZO_CMP_PAGES; i++) { 1052 have = 0, want, need, asked = 0;
765 page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 1053 unsigned long read_pages;
766 if (!page[i]) { 1054 unsigned char **page = NULL;
767 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 1055 struct dec_data *data = NULL;
1056 struct crc_data *crc = NULL;
1057
1058 /*
1059 * We'll limit the number of threads for decompression to limit memory
1060 * footprint.
1061 */
1062 nr_threads = num_online_cpus() - 1;
1063 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
1064
1065 page = vmalloc(sizeof(*page) * LZO_READ_PAGES);
1066 if (!page) {
1067 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
1068 ret = -ENOMEM;
1069 goto out_clean;
1070 }
768 1071
769 while (i) 1072 data = vmalloc(sizeof(*data) * nr_threads);
770 free_page((unsigned long)page[--i]); 1073 if (!data) {
1074 printk(KERN_ERR "PM: Failed to allocate LZO data\n");
1075 ret = -ENOMEM;
1076 goto out_clean;
1077 }
1078 for (thr = 0; thr < nr_threads; thr++)
1079 memset(&data[thr], 0, offsetof(struct dec_data, go));
771 1080
772 return -ENOMEM; 1081 crc = kmalloc(sizeof(*crc), GFP_KERNEL);
1082 if (!crc) {
1083 printk(KERN_ERR "PM: Failed to allocate crc\n");
1084 ret = -ENOMEM;
1085 goto out_clean;
1086 }
1087 memset(crc, 0, offsetof(struct crc_data, go));
1088
1089 /*
1090 * Start the decompression threads.
1091 */
1092 for (thr = 0; thr < nr_threads; thr++) {
1093 init_waitqueue_head(&data[thr].go);
1094 init_waitqueue_head(&data[thr].done);
1095
1096 data[thr].thr = kthread_run(lzo_decompress_threadfn,
1097 &data[thr],
1098 "image_decompress/%u", thr);
1099 if (IS_ERR(data[thr].thr)) {
1100 data[thr].thr = NULL;
1101 printk(KERN_ERR
1102 "PM: Cannot start decompression threads\n");
1103 ret = -ENOMEM;
1104 goto out_clean;
773 } 1105 }
774 } 1106 }
775 1107
776 unc = vmalloc(LZO_UNC_SIZE); 1108 /*
777 if (!unc) { 1109 * Start the CRC32 thread.
778 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); 1110 */
779 1111 init_waitqueue_head(&crc->go);
780 for (i = 0; i < LZO_CMP_PAGES; i++) 1112 init_waitqueue_head(&crc->done);
781 free_page((unsigned long)page[i]); 1113
782 1114 handle->crc32 = 0;
783 return -ENOMEM; 1115 crc->crc32 = &handle->crc32;
1116 for (thr = 0; thr < nr_threads; thr++) {
1117 crc->unc[thr] = data[thr].unc;
1118 crc->unc_len[thr] = &data[thr].unc_len;
784 } 1119 }
785 1120
786 cmp = vmalloc(LZO_CMP_SIZE); 1121 crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
787 if (!cmp) { 1122 if (IS_ERR(crc->thr)) {
788 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); 1123 crc->thr = NULL;
1124 printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
1125 ret = -ENOMEM;
1126 goto out_clean;
1127 }
789 1128
790 vfree(unc); 1129 /*
791 for (i = 0; i < LZO_CMP_PAGES; i++) 1130 * Adjust number of pages for read buffering, in case we are short.
792 free_page((unsigned long)page[i]); 1131 */
1132 read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1;
1133 read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES);
793 1134
794 return -ENOMEM; 1135 for (i = 0; i < read_pages; i++) {
1136 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
1137 __GFP_WAIT | __GFP_HIGH :
1138 __GFP_WAIT);
1139 if (!page[i]) {
1140 if (i < LZO_CMP_PAGES) {
1141 ring_size = i;
1142 printk(KERN_ERR
1143 "PM: Failed to allocate LZO pages\n");
1144 ret = -ENOMEM;
1145 goto out_clean;
1146 } else {
1147 break;
1148 }
1149 }
795 } 1150 }
1151 want = ring_size = i;
796 1152
797 printk(KERN_INFO 1153 printk(KERN_INFO
1154 "PM: Using %u thread(s) for decompression.\n"
798 "PM: Loading and decompressing image data (%u pages) ... ", 1155 "PM: Loading and decompressing image data (%u pages) ... ",
799 nr_to_read); 1156 nr_threads, nr_to_read);
800 m = nr_to_read / 100; 1157 m = nr_to_read / 100;
801 if (!m) 1158 if (!m)
802 m = 1; 1159 m = 1;
@@ -804,85 +1161,189 @@ static int load_image_lzo(struct swap_map_handle *handle,
804 bio = NULL; 1161 bio = NULL;
805 do_gettimeofday(&start); 1162 do_gettimeofday(&start);
806 1163
807 error = snapshot_write_next(snapshot); 1164 ret = snapshot_write_next(snapshot);
808 if (error <= 0) 1165 if (ret <= 0)
809 goto out_finish; 1166 goto out_finish;
810 1167
811 for (;;) { 1168 for(;;) {
812 error = swap_read_page(handle, page[0], NULL); /* sync */ 1169 for (i = 0; !eof && i < want; i++) {
813 if (error) 1170 ret = swap_read_page(handle, page[ring], &bio);
814 break; 1171 if (ret) {
815 1172 /*
816 cmp_len = *(size_t *)page[0]; 1173 * On real read error, finish. On end of data,
817 if (unlikely(!cmp_len || 1174 * set EOF flag and just exit the read loop.
818 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { 1175 */
819 printk(KERN_ERR "PM: Invalid LZO compressed length\n"); 1176 if (handle->cur &&
820 error = -1; 1177 handle->cur->entries[handle->k]) {
821 break; 1178 goto out_finish;
1179 } else {
1180 eof = 1;
1181 break;
1182 }
1183 }
1184 if (++ring >= ring_size)
1185 ring = 0;
822 } 1186 }
1187 asked += i;
1188 want -= i;
823 1189
824 for (off = PAGE_SIZE, i = 1; 1190 /*
825 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { 1191 * We are out of data, wait for some more.
826 error = swap_read_page(handle, page[i], &bio); 1192 */
827 if (error) 1193 if (!have) {
1194 if (!asked)
1195 break;
1196
1197 ret = hib_wait_on_bio_chain(&bio);
1198 if (ret)
828 goto out_finish; 1199 goto out_finish;
1200 have += asked;
1201 asked = 0;
1202 if (eof)
1203 eof = 2;
829 } 1204 }
830 1205
831 error = hib_wait_on_bio_chain(&bio); /* need all data now */ 1206 if (crc->run_threads) {
832 if (error) 1207 wait_event(crc->done, atomic_read(&crc->stop));
833 goto out_finish; 1208 atomic_set(&crc->stop, 0);
834 1209 crc->run_threads = 0;
835 for (off = 0, i = 0;
836 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
837 memcpy(cmp + off, page[i], PAGE_SIZE);
838 } 1210 }
839 1211
840 unc_len = LZO_UNC_SIZE; 1212 for (thr = 0; have && thr < nr_threads; thr++) {
841 error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, 1213 data[thr].cmp_len = *(size_t *)page[pg];
842 unc, &unc_len); 1214 if (unlikely(!data[thr].cmp_len ||
843 if (error < 0) { 1215 data[thr].cmp_len >
844 printk(KERN_ERR "PM: LZO decompression failed\n"); 1216 lzo1x_worst_compress(LZO_UNC_SIZE))) {
845 break; 1217 printk(KERN_ERR
1218 "PM: Invalid LZO compressed length\n");
1219 ret = -1;
1220 goto out_finish;
1221 }
1222
1223 need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER,
1224 PAGE_SIZE);
1225 if (need > have) {
1226 if (eof > 1) {
1227 ret = -1;
1228 goto out_finish;
1229 }
1230 break;
1231 }
1232
1233 for (off = 0;
1234 off < LZO_HEADER + data[thr].cmp_len;
1235 off += PAGE_SIZE) {
1236 memcpy(data[thr].cmp + off,
1237 page[pg], PAGE_SIZE);
1238 have--;
1239 want++;
1240 if (++pg >= ring_size)
1241 pg = 0;
1242 }
1243
1244 atomic_set(&data[thr].ready, 1);
1245 wake_up(&data[thr].go);
846 } 1246 }
847 1247
848 if (unlikely(!unc_len || 1248 /*
849 unc_len > LZO_UNC_SIZE || 1249 * Wait for more data while we are decompressing.
850 unc_len & (PAGE_SIZE - 1))) { 1250 */
851 printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); 1251 if (have < LZO_CMP_PAGES && asked) {
852 error = -1; 1252 ret = hib_wait_on_bio_chain(&bio);
853 break; 1253 if (ret)
1254 goto out_finish;
1255 have += asked;
1256 asked = 0;
1257 if (eof)
1258 eof = 2;
854 } 1259 }
855 1260
856 for (off = 0; off < unc_len; off += PAGE_SIZE) { 1261 for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
857 memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); 1262 wait_event(data[thr].done,
1263 atomic_read(&data[thr].stop));
1264 atomic_set(&data[thr].stop, 0);
1265
1266 ret = data[thr].ret;
858 1267
859 if (!(nr_pages % m)) 1268 if (ret < 0) {
860 printk("\b\b\b\b%3d%%", nr_pages / m); 1269 printk(KERN_ERR
861 nr_pages++; 1270 "PM: LZO decompression failed\n");
1271 goto out_finish;
1272 }
862 1273
863 error = snapshot_write_next(snapshot); 1274 if (unlikely(!data[thr].unc_len ||
864 if (error <= 0) 1275 data[thr].unc_len > LZO_UNC_SIZE ||
1276 data[thr].unc_len & (PAGE_SIZE - 1))) {
1277 printk(KERN_ERR
1278 "PM: Invalid LZO uncompressed length\n");
1279 ret = -1;
865 goto out_finish; 1280 goto out_finish;
1281 }
1282
1283 for (off = 0;
1284 off < data[thr].unc_len; off += PAGE_SIZE) {
1285 memcpy(data_of(*snapshot),
1286 data[thr].unc + off, PAGE_SIZE);
1287
1288 if (!(nr_pages % m))
1289 printk("\b\b\b\b%3d%%", nr_pages / m);
1290 nr_pages++;
1291
1292 ret = snapshot_write_next(snapshot);
1293 if (ret <= 0) {
1294 crc->run_threads = thr + 1;
1295 atomic_set(&crc->ready, 1);
1296 wake_up(&crc->go);
1297 goto out_finish;
1298 }
1299 }
866 } 1300 }
1301
1302 crc->run_threads = thr;
1303 atomic_set(&crc->ready, 1);
1304 wake_up(&crc->go);
867 } 1305 }
868 1306
869out_finish: 1307out_finish:
1308 if (crc->run_threads) {
1309 wait_event(crc->done, atomic_read(&crc->stop));
1310 atomic_set(&crc->stop, 0);
1311 }
870 do_gettimeofday(&stop); 1312 do_gettimeofday(&stop);
871 if (!error) { 1313 if (!ret) {
872 printk("\b\b\b\bdone\n"); 1314 printk("\b\b\b\bdone\n");
873 snapshot_write_finalize(snapshot); 1315 snapshot_write_finalize(snapshot);
874 if (!snapshot_image_loaded(snapshot)) 1316 if (!snapshot_image_loaded(snapshot))
875 error = -ENODATA; 1317 ret = -ENODATA;
1318 if (!ret) {
1319 if (swsusp_header->flags & SF_CRC32_MODE) {
1320 if(handle->crc32 != swsusp_header->crc32) {
1321 printk(KERN_ERR
1322 "PM: Invalid image CRC32!\n");
1323 ret = -ENODATA;
1324 }
1325 }
1326 }
876 } else 1327 } else
877 printk("\n"); 1328 printk("\n");
878 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1329 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
879 1330out_clean:
880 vfree(cmp); 1331 for (i = 0; i < ring_size; i++)
881 vfree(unc);
882 for (i = 0; i < LZO_CMP_PAGES; i++)
883 free_page((unsigned long)page[i]); 1332 free_page((unsigned long)page[i]);
1333 if (crc) {
1334 if (crc->thr)
1335 kthread_stop(crc->thr);
1336 kfree(crc);
1337 }
1338 if (data) {
1339 for (thr = 0; thr < nr_threads; thr++)
1340 if (data[thr].thr)
1341 kthread_stop(data[thr].thr);
1342 vfree(data);
1343 }
1344 if (page) vfree(page);
884 1345
885 return error; 1346 return ret;
886} 1347}
887 1348
888/** 1349/**
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 42ddbc6f0de6..6b1ab7a88522 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -12,6 +12,7 @@
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/reboot.h> 14#include <linux/reboot.h>
15#include <linux/kmod.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/device.h> 17#include <linux/device.h>
17#include <linux/miscdevice.h> 18#include <linux/miscdevice.h>
@@ -20,6 +21,7 @@
20#include <linux/swapops.h> 21#include <linux/swapops.h>
21#include <linux/pm.h> 22#include <linux/pm.h>
22#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/compat.h>
23#include <linux/console.h> 25#include <linux/console.h>
24#include <linux/cpu.h> 26#include <linux/cpu.h>
25#include <linux/freezer.h> 27#include <linux/freezer.h>
@@ -29,28 +31,6 @@
29 31
30#include "power.h" 32#include "power.h"
31 33
32/*
33 * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and
34 * will be removed in the future. They are only preserved here for
35 * compatibility with existing userland utilities.
36 */
37#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
38#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
39
40#define PMOPS_PREPARE 1
41#define PMOPS_ENTER 2
42#define PMOPS_FINISH 3
43
44/*
45 * NOTE: The following ioctl definitions are wrong and have been replaced with
46 * correct ones. They are only preserved here for compatibility with existing
47 * userland utilities and will be removed in the future.
48 */
49#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
50#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
51#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
52#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
53
54 34
55#define SNAPSHOT_MINOR 231 35#define SNAPSHOT_MINOR 231
56 36
@@ -70,7 +50,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
70 struct snapshot_data *data; 50 struct snapshot_data *data;
71 int error; 51 int error;
72 52
73 mutex_lock(&pm_mutex); 53 lock_system_sleep();
74 54
75 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 55 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
76 error = -EBUSY; 56 error = -EBUSY;
@@ -122,7 +102,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
122 data->platform_support = 0; 102 data->platform_support = 0;
123 103
124 Unlock: 104 Unlock:
125 mutex_unlock(&pm_mutex); 105 unlock_system_sleep();
126 106
127 return error; 107 return error;
128} 108}
@@ -131,7 +111,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
131{ 111{
132 struct snapshot_data *data; 112 struct snapshot_data *data;
133 113
134 mutex_lock(&pm_mutex); 114 lock_system_sleep();
135 115
136 swsusp_free(); 116 swsusp_free();
137 free_basic_memory_bitmaps(); 117 free_basic_memory_bitmaps();
@@ -145,7 +125,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
145 PM_POST_HIBERNATION : PM_POST_RESTORE); 125 PM_POST_HIBERNATION : PM_POST_RESTORE);
146 atomic_inc(&snapshot_device_available); 126 atomic_inc(&snapshot_device_available);
147 127
148 mutex_unlock(&pm_mutex); 128 unlock_system_sleep();
149 129
150 return 0; 130 return 0;
151} 131}
@@ -157,7 +137,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
157 ssize_t res; 137 ssize_t res;
158 loff_t pg_offp = *offp & ~PAGE_MASK; 138 loff_t pg_offp = *offp & ~PAGE_MASK;
159 139
160 mutex_lock(&pm_mutex); 140 lock_system_sleep();
161 141
162 data = filp->private_data; 142 data = filp->private_data;
163 if (!data->ready) { 143 if (!data->ready) {
@@ -178,7 +158,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
178 *offp += res; 158 *offp += res;
179 159
180 Unlock: 160 Unlock:
181 mutex_unlock(&pm_mutex); 161 unlock_system_sleep();
182 162
183 return res; 163 return res;
184} 164}
@@ -190,7 +170,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
190 ssize_t res; 170 ssize_t res;
191 loff_t pg_offp = *offp & ~PAGE_MASK; 171 loff_t pg_offp = *offp & ~PAGE_MASK;
192 172
193 mutex_lock(&pm_mutex); 173 lock_system_sleep();
194 174
195 data = filp->private_data; 175 data = filp->private_data;
196 176
@@ -207,20 +187,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
207 if (res > 0) 187 if (res > 0)
208 *offp += res; 188 *offp += res;
209unlock: 189unlock:
210 mutex_unlock(&pm_mutex); 190 unlock_system_sleep();
211 191
212 return res; 192 return res;
213} 193}
214 194
215static void snapshot_deprecated_ioctl(unsigned int cmd)
216{
217 if (printk_ratelimit())
218 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
219 "be removed soon, update your suspend-to-disk "
220 "utilities\n",
221 __builtin_return_address(0), cmd);
222}
223
224static long snapshot_ioctl(struct file *filp, unsigned int cmd, 195static long snapshot_ioctl(struct file *filp, unsigned int cmd,
225 unsigned long arg) 196 unsigned long arg)
226{ 197{
@@ -256,11 +227,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
256 break; 227 break;
257 228
258 error = freeze_processes(); 229 error = freeze_processes();
259 if (error) { 230 if (error)
260 thaw_processes();
261 usermodehelper_enable(); 231 usermodehelper_enable();
262 } 232 else
263 if (!error)
264 data->frozen = 1; 233 data->frozen = 1;
265 break; 234 break;
266 235
@@ -273,8 +242,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
273 data->frozen = 0; 242 data->frozen = 0;
274 break; 243 break;
275 244
276 case SNAPSHOT_ATOMIC_SNAPSHOT:
277 snapshot_deprecated_ioctl(cmd);
278 case SNAPSHOT_CREATE_IMAGE: 245 case SNAPSHOT_CREATE_IMAGE:
279 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 246 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
280 error = -EPERM; 247 error = -EPERM;
@@ -282,10 +249,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
282 } 249 }
283 pm_restore_gfp_mask(); 250 pm_restore_gfp_mask();
284 error = hibernation_snapshot(data->platform_support); 251 error = hibernation_snapshot(data->platform_support);
285 if (!error) 252 if (!error) {
286 error = put_user(in_suspend, (int __user *)arg); 253 error = put_user(in_suspend, (int __user *)arg);
287 if (!error) 254 if (!error && !freezer_test_done)
288 data->ready = 1; 255 data->ready = 1;
256 if (freezer_test_done) {
257 freezer_test_done = false;
258 thaw_processes();
259 }
260 }
289 break; 261 break;
290 262
291 case SNAPSHOT_ATOMIC_RESTORE: 263 case SNAPSHOT_ATOMIC_RESTORE:
@@ -304,8 +276,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
304 data->ready = 0; 276 data->ready = 0;
305 break; 277 break;
306 278
307 case SNAPSHOT_SET_IMAGE_SIZE:
308 snapshot_deprecated_ioctl(cmd);
309 case SNAPSHOT_PREF_IMAGE_SIZE: 279 case SNAPSHOT_PREF_IMAGE_SIZE:
310 image_size = arg; 280 image_size = arg;
311 break; 281 break;
@@ -320,16 +290,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
320 error = put_user(size, (loff_t __user *)arg); 290 error = put_user(size, (loff_t __user *)arg);
321 break; 291 break;
322 292
323 case SNAPSHOT_AVAIL_SWAP:
324 snapshot_deprecated_ioctl(cmd);
325 case SNAPSHOT_AVAIL_SWAP_SIZE: 293 case SNAPSHOT_AVAIL_SWAP_SIZE:
326 size = count_swap_pages(data->swap, 1); 294 size = count_swap_pages(data->swap, 1);
327 size <<= PAGE_SHIFT; 295 size <<= PAGE_SHIFT;
328 error = put_user(size, (loff_t __user *)arg); 296 error = put_user(size, (loff_t __user *)arg);
329 break; 297 break;
330 298
331 case SNAPSHOT_GET_SWAP_PAGE:
332 snapshot_deprecated_ioctl(cmd);
333 case SNAPSHOT_ALLOC_SWAP_PAGE: 299 case SNAPSHOT_ALLOC_SWAP_PAGE:
334 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 300 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
335 error = -ENODEV; 301 error = -ENODEV;
@@ -352,27 +318,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
352 free_all_swap_pages(data->swap); 318 free_all_swap_pages(data->swap);
353 break; 319 break;
354 320
355 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
356 snapshot_deprecated_ioctl(cmd);
357 if (!swsusp_swap_in_use()) {
358 /*
359 * User space encodes device types as two-byte values,
360 * so we need to recode them
361 */
362 if (old_decode_dev(arg)) {
363 data->swap = swap_type_of(old_decode_dev(arg),
364 0, NULL);
365 if (data->swap < 0)
366 error = -ENODEV;
367 } else {
368 data->swap = -1;
369 error = -EINVAL;
370 }
371 } else {
372 error = -EPERM;
373 }
374 break;
375
376 case SNAPSHOT_S2RAM: 321 case SNAPSHOT_S2RAM:
377 if (!data->frozen) { 322 if (!data->frozen) {
378 error = -EPERM; 323 error = -EPERM;
@@ -395,33 +340,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
395 error = hibernation_platform_enter(); 340 error = hibernation_platform_enter();
396 break; 341 break;
397 342
398 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
399 snapshot_deprecated_ioctl(cmd);
400 error = -EINVAL;
401
402 switch (arg) {
403
404 case PMOPS_PREPARE:
405 data->platform_support = 1;
406 error = 0;
407 break;
408
409 case PMOPS_ENTER:
410 if (data->platform_support)
411 error = hibernation_platform_enter();
412 break;
413
414 case PMOPS_FINISH:
415 if (data->platform_support)
416 error = 0;
417 break;
418
419 default:
420 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
421
422 }
423 break;
424
425 case SNAPSHOT_SET_SWAP_AREA: 343 case SNAPSHOT_SET_SWAP_AREA:
426 if (swsusp_swap_in_use()) { 344 if (swsusp_swap_in_use()) {
427 error = -EPERM; 345 error = -EPERM;
@@ -463,6 +381,66 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
463 return error; 381 return error;
464} 382}
465 383
384#ifdef CONFIG_COMPAT
385
386struct compat_resume_swap_area {
387 compat_loff_t offset;
388 u32 dev;
389} __packed;
390
391static long
392snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
393{
394 BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t));
395
396 switch (cmd) {
397 case SNAPSHOT_GET_IMAGE_SIZE:
398 case SNAPSHOT_AVAIL_SWAP_SIZE:
399 case SNAPSHOT_ALLOC_SWAP_PAGE: {
400 compat_loff_t __user *uoffset = compat_ptr(arg);
401 loff_t offset;
402 mm_segment_t old_fs;
403 int err;
404
405 old_fs = get_fs();
406 set_fs(KERNEL_DS);
407 err = snapshot_ioctl(file, cmd, (unsigned long) &offset);
408 set_fs(old_fs);
409 if (!err && put_user(offset, uoffset))
410 err = -EFAULT;
411 return err;
412 }
413
414 case SNAPSHOT_CREATE_IMAGE:
415 return snapshot_ioctl(file, cmd,
416 (unsigned long) compat_ptr(arg));
417
418 case SNAPSHOT_SET_SWAP_AREA: {
419 struct compat_resume_swap_area __user *u_swap_area =
420 compat_ptr(arg);
421 struct resume_swap_area swap_area;
422 mm_segment_t old_fs;
423 int err;
424
425 err = get_user(swap_area.offset, &u_swap_area->offset);
426 err |= get_user(swap_area.dev, &u_swap_area->dev);
427 if (err)
428 return -EFAULT;
429 old_fs = get_fs();
430 set_fs(KERNEL_DS);
431 err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA,
432 (unsigned long) &swap_area);
433 set_fs(old_fs);
434 return err;
435 }
436
437 default:
438 return snapshot_ioctl(file, cmd, arg);
439 }
440}
441
442#endif /* CONFIG_COMPAT */
443
466static const struct file_operations snapshot_fops = { 444static const struct file_operations snapshot_fops = {
467 .open = snapshot_open, 445 .open = snapshot_open,
468 .release = snapshot_release, 446 .release = snapshot_release,
@@ -470,6 +448,9 @@ static const struct file_operations snapshot_fops = {
470 .write = snapshot_write, 448 .write = snapshot_write,
471 .llseek = no_llseek, 449 .llseek = no_llseek,
472 .unlocked_ioctl = snapshot_ioctl, 450 .unlocked_ioctl = snapshot_ioctl,
451#ifdef CONFIG_COMPAT
452 .compat_ioctl = snapshot_compat_ioctl,
453#endif
473}; 454};
474 455
475static struct miscdevice snapshot_device = { 456static struct miscdevice snapshot_device = {
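
The new compat_ioctl handler above exists because several SNAPSHOT_* commands return a loff_t through the pointer argument, which a 32-bit caller supplies as a compat_loff_t. A minimal userspace sketch of one such call follows; it is illustrative only and not part of the patch, and it assumes the usual /dev/snapshot device node (root only, normally used by hibernation tools such as s2disk) and the exported <linux/suspend_ioctls.h> header.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/suspend_ioctls.h>

int main(void)
{
        long long avail;        /* kernel writes a loff_t here */
        int fd = open("/dev/snapshot", O_RDONLY);

        if (fd < 0) {
                perror("open /dev/snapshot");
                return 1;
        }
        if (ioctl(fd, SNAPSHOT_AVAIL_SWAP_SIZE, &avail) < 0) {
                perror("SNAPSHOT_AVAIL_SWAP_SIZE");
                close(fd);
                return 1;
        }
        printf("available swap: %lld bytes\n", avail);
        close(fd);
        return 0;
}

Built as a 32-bit binary running on a 64-bit kernel, this request is routed through snapshot_compat_ioctl(), which performs the native ioctl into a kernel-side loff_t and copies the result back to the compat_loff_t pointer with put_user().
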
diff --git a/kernel/printk.c b/kernel/printk.c
index 28a40d8171b8..13c0a1143f49 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -100,7 +100,7 @@ static int console_locked, console_suspended;
100 * It is also used in interesting ways to provide interlocking in 100 * It is also used in interesting ways to provide interlocking in
101 * console_unlock();. 101 * console_unlock();.
102 */ 102 */
103static DEFINE_SPINLOCK(logbuf_lock); 103static DEFINE_RAW_SPINLOCK(logbuf_lock);
104 104
105#define LOG_BUF_MASK (log_buf_len-1) 105#define LOG_BUF_MASK (log_buf_len-1)
106#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) 106#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
@@ -199,7 +199,7 @@ void __init setup_log_buf(int early)
199 unsigned long mem; 199 unsigned long mem;
200 200
201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); 201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
202 if (mem == MEMBLOCK_ERROR) 202 if (!mem)
203 return; 203 return;
204 new_log_buf = __va(mem); 204 new_log_buf = __va(mem);
205 } else { 205 } else {
@@ -212,7 +212,7 @@ void __init setup_log_buf(int early)
212 return; 212 return;
213 } 213 }
214 214
215 spin_lock_irqsave(&logbuf_lock, flags); 215 raw_spin_lock_irqsave(&logbuf_lock, flags);
216 log_buf_len = new_log_buf_len; 216 log_buf_len = new_log_buf_len;
217 log_buf = new_log_buf; 217 log_buf = new_log_buf;
218 new_log_buf_len = 0; 218 new_log_buf_len = 0;
@@ -230,7 +230,7 @@ void __init setup_log_buf(int early)
230 log_start -= offset; 230 log_start -= offset;
231 con_start -= offset; 231 con_start -= offset;
232 log_end -= offset; 232 log_end -= offset;
233 spin_unlock_irqrestore(&logbuf_lock, flags); 233 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
234 234
235 pr_info("log_buf_len: %d\n", log_buf_len); 235 pr_info("log_buf_len: %d\n", log_buf_len);
236 pr_info("early log buf free: %d(%d%%)\n", 236 pr_info("early log buf free: %d(%d%%)\n",
@@ -365,18 +365,18 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
365 if (error) 365 if (error)
366 goto out; 366 goto out;
367 i = 0; 367 i = 0;
368 spin_lock_irq(&logbuf_lock); 368 raw_spin_lock_irq(&logbuf_lock);
369 while (!error && (log_start != log_end) && i < len) { 369 while (!error && (log_start != log_end) && i < len) {
370 c = LOG_BUF(log_start); 370 c = LOG_BUF(log_start);
371 log_start++; 371 log_start++;
372 spin_unlock_irq(&logbuf_lock); 372 raw_spin_unlock_irq(&logbuf_lock);
373 error = __put_user(c,buf); 373 error = __put_user(c,buf);
374 buf++; 374 buf++;
375 i++; 375 i++;
376 cond_resched(); 376 cond_resched();
377 spin_lock_irq(&logbuf_lock); 377 raw_spin_lock_irq(&logbuf_lock);
378 } 378 }
379 spin_unlock_irq(&logbuf_lock); 379 raw_spin_unlock_irq(&logbuf_lock);
380 if (!error) 380 if (!error)
381 error = i; 381 error = i;
382 break; 382 break;
@@ -399,7 +399,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
399 count = len; 399 count = len;
400 if (count > log_buf_len) 400 if (count > log_buf_len)
401 count = log_buf_len; 401 count = log_buf_len;
402 spin_lock_irq(&logbuf_lock); 402 raw_spin_lock_irq(&logbuf_lock);
403 if (count > logged_chars) 403 if (count > logged_chars)
404 count = logged_chars; 404 count = logged_chars;
405 if (do_clear) 405 if (do_clear)
@@ -416,12 +416,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
416 if (j + log_buf_len < log_end) 416 if (j + log_buf_len < log_end)
417 break; 417 break;
418 c = LOG_BUF(j); 418 c = LOG_BUF(j);
419 spin_unlock_irq(&logbuf_lock); 419 raw_spin_unlock_irq(&logbuf_lock);
420 error = __put_user(c,&buf[count-1-i]); 420 error = __put_user(c,&buf[count-1-i]);
421 cond_resched(); 421 cond_resched();
422 spin_lock_irq(&logbuf_lock); 422 raw_spin_lock_irq(&logbuf_lock);
423 } 423 }
424 spin_unlock_irq(&logbuf_lock); 424 raw_spin_unlock_irq(&logbuf_lock);
425 if (error) 425 if (error)
426 break; 426 break;
427 error = i; 427 error = i;
@@ -521,7 +521,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
521 } 521 }
522} 522}
523 523
524static int __read_mostly ignore_loglevel; 524static bool __read_mostly ignore_loglevel;
525 525
526static int __init ignore_loglevel_setup(char *str) 526static int __init ignore_loglevel_setup(char *str)
527{ 527{
@@ -532,6 +532,9 @@ static int __init ignore_loglevel_setup(char *str)
532} 532}
533 533
534early_param("ignore_loglevel", ignore_loglevel_setup); 534early_param("ignore_loglevel", ignore_loglevel_setup);
535module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
536MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to "
537 "print all kernel messages to the console.");
535 538
536/* 539/*
537 * Write out chars from start to end - 1 inclusive 540 * Write out chars from start to end - 1 inclusive
@@ -592,9 +595,6 @@ static size_t log_prefix(const char *p, unsigned int *level, char *special)
592 /* multi digit including the level and facility number */ 595 /* multi digit including the level and facility number */
593 char *endp = NULL; 596 char *endp = NULL;
594 597
595 if (p[1] < '0' && p[1] > '9')
596 return 0;
597
598 lev = (simple_strtoul(&p[1], &endp, 10) & 7); 598 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
599 if (endp == NULL || endp[0] != '>') 599 if (endp == NULL || endp[0] != '>')
600 return 0; 600 return 0;
@@ -688,16 +688,17 @@ static void zap_locks(void)
688 688
689 oops_timestamp = jiffies; 689 oops_timestamp = jiffies;
690 690
691 debug_locks_off();
691 /* If a crash is occurring, make sure we can't deadlock */ 692 /* If a crash is occurring, make sure we can't deadlock */
692 spin_lock_init(&logbuf_lock); 693 raw_spin_lock_init(&logbuf_lock);
693 /* And make sure that we print immediately */ 694 /* And make sure that we print immediately */
694 sema_init(&console_sem, 1); 695 sema_init(&console_sem, 1);
695} 696}
696 697
697#if defined(CONFIG_PRINTK_TIME) 698#if defined(CONFIG_PRINTK_TIME)
698static int printk_time = 1; 699static bool printk_time = 1;
699#else 700#else
700static int printk_time = 0; 701static bool printk_time = 0;
701#endif 702#endif
702module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 703module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
703 704
@@ -802,9 +803,9 @@ static int console_trylock_for_printk(unsigned int cpu)
802 } 803 }
803 } 804 }
804 printk_cpu = UINT_MAX; 805 printk_cpu = UINT_MAX;
805 spin_unlock(&logbuf_lock);
806 if (wake) 806 if (wake)
807 up(&console_sem); 807 up(&console_sem);
808 raw_spin_unlock(&logbuf_lock);
808 return retval; 809 return retval;
809} 810}
810static const char recursion_bug_msg [] = 811static const char recursion_bug_msg [] =
@@ -840,9 +841,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
840 boot_delay_msec(); 841 boot_delay_msec();
841 printk_delay(); 842 printk_delay();
842 843
843 preempt_disable();
844 /* This stops the holder of console_sem just where we want him */ 844 /* This stops the holder of console_sem just where we want him */
845 raw_local_irq_save(flags); 845 local_irq_save(flags);
846 this_cpu = smp_processor_id(); 846 this_cpu = smp_processor_id();
847 847
848 /* 848 /*
@@ -856,7 +856,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
856 * recursion and return - but flag the recursion so that 856 * recursion and return - but flag the recursion so that
857 * it can be printed at the next appropriate moment: 857 * it can be printed at the next appropriate moment:
858 */ 858 */
859 if (!oops_in_progress) { 859 if (!oops_in_progress && !lockdep_recursing(current)) {
860 recursion_bug = 1; 860 recursion_bug = 1;
861 goto out_restore_irqs; 861 goto out_restore_irqs;
862 } 862 }
@@ -864,7 +864,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
864 } 864 }
865 865
866 lockdep_off(); 866 lockdep_off();
867 spin_lock(&logbuf_lock); 867 raw_spin_lock(&logbuf_lock);
868 printk_cpu = this_cpu; 868 printk_cpu = this_cpu;
869 869
870 if (recursion_bug) { 870 if (recursion_bug) {
@@ -962,9 +962,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
962 962
963 lockdep_on(); 963 lockdep_on();
964out_restore_irqs: 964out_restore_irqs:
965 raw_local_irq_restore(flags); 965 local_irq_restore(flags);
966 966
967 preempt_enable();
968 return printed_len; 967 return printed_len;
969} 968}
970EXPORT_SYMBOL(printk); 969EXPORT_SYMBOL(printk);
@@ -1099,7 +1098,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1099 return -1; 1098 return -1;
1100} 1099}
1101 1100
1102int console_suspend_enabled = 1; 1101bool console_suspend_enabled = 1;
1103EXPORT_SYMBOL(console_suspend_enabled); 1102EXPORT_SYMBOL(console_suspend_enabled);
1104 1103
1105static int __init console_suspend_disable(char *str) 1104static int __init console_suspend_disable(char *str)
@@ -1108,6 +1107,10 @@ static int __init console_suspend_disable(char *str)
1108 return 1; 1107 return 1;
1109} 1108}
1110__setup("no_console_suspend", console_suspend_disable); 1109__setup("no_console_suspend", console_suspend_disable);
1110module_param_named(console_suspend, console_suspend_enabled,
1111 bool, S_IRUGO | S_IWUSR);
1112MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
1113 " and hibernate operations");
1111 1114
1112/** 1115/**
1113 * suspend_console - suspend the console subsystem 1116 * suspend_console - suspend the console subsystem
@@ -1257,14 +1260,14 @@ void console_unlock(void)
1257 1260
1258again: 1261again:
1259 for ( ; ; ) { 1262 for ( ; ; ) {
1260 spin_lock_irqsave(&logbuf_lock, flags); 1263 raw_spin_lock_irqsave(&logbuf_lock, flags);
1261 wake_klogd |= log_start - log_end; 1264 wake_klogd |= log_start - log_end;
1262 if (con_start == log_end) 1265 if (con_start == log_end)
1263 break; /* Nothing to print */ 1266 break; /* Nothing to print */
1264 _con_start = con_start; 1267 _con_start = con_start;
1265 _log_end = log_end; 1268 _log_end = log_end;
1266 con_start = log_end; /* Flush */ 1269 con_start = log_end; /* Flush */
1267 spin_unlock(&logbuf_lock); 1270 raw_spin_unlock(&logbuf_lock);
1268 stop_critical_timings(); /* don't trace print latency */ 1271 stop_critical_timings(); /* don't trace print latency */
1269 call_console_drivers(_con_start, _log_end); 1272 call_console_drivers(_con_start, _log_end);
1270 start_critical_timings(); 1273 start_critical_timings();
@@ -1276,7 +1279,7 @@ again:
1276 if (unlikely(exclusive_console)) 1279 if (unlikely(exclusive_console))
1277 exclusive_console = NULL; 1280 exclusive_console = NULL;
1278 1281
1279 spin_unlock(&logbuf_lock); 1282 raw_spin_unlock(&logbuf_lock);
1280 1283
1281 up(&console_sem); 1284 up(&console_sem);
1282 1285
@@ -1286,10 +1289,11 @@ again:
1286 * there's a new owner and the console_unlock() from them will do the 1289 * there's a new owner and the console_unlock() from them will do the
1287 * flush, no worries. 1290 * flush, no worries.
1288 */ 1291 */
1289 spin_lock(&logbuf_lock); 1292 raw_spin_lock(&logbuf_lock);
1290 if (con_start != log_end) 1293 if (con_start != log_end)
1291 retry = 1; 1294 retry = 1;
1292 spin_unlock_irqrestore(&logbuf_lock, flags); 1295 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1296
1293 if (retry && console_trylock()) 1297 if (retry && console_trylock())
1294 goto again; 1298 goto again;
1295 1299
@@ -1522,9 +1526,9 @@ void register_console(struct console *newcon)
1522 * console_unlock(); will print out the buffered messages 1526 * console_unlock(); will print out the buffered messages
1523 * for us. 1527 * for us.
1524 */ 1528 */
1525 spin_lock_irqsave(&logbuf_lock, flags); 1529 raw_spin_lock_irqsave(&logbuf_lock, flags);
1526 con_start = log_start; 1530 con_start = log_start;
1527 spin_unlock_irqrestore(&logbuf_lock, flags); 1531 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1528 /* 1532 /*
1529 * We're about to replay the log buffer. Only do this to the 1533 * We're about to replay the log buffer. Only do this to the
1530 * just-registered console to avoid excessive message spam to 1534 * just-registered console to avoid excessive message spam to
@@ -1731,10 +1735,10 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1731 /* Theoretically, the log could move on after we do this, but 1735 /* Theoretically, the log could move on after we do this, but
1732 there's not a lot we can do about that. The new messages 1736 there's not a lot we can do about that. The new messages
1733 will overwrite the start of what we dump. */ 1737 will overwrite the start of what we dump. */
1734 spin_lock_irqsave(&logbuf_lock, flags); 1738 raw_spin_lock_irqsave(&logbuf_lock, flags);
1735 end = log_end & LOG_BUF_MASK; 1739 end = log_end & LOG_BUF_MASK;
1736 chars = logged_chars; 1740 chars = logged_chars;
1737 spin_unlock_irqrestore(&logbuf_lock, flags); 1741 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1738 1742
1739 if (chars > end) { 1743 if (chars > end) {
1740 s1 = log_buf + log_buf_len - chars + end; 1744 s1 = log_buf + log_buf_len - chars + end;
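
The printk.c changes above do two independent things: logbuf_lock becomes a raw_spinlock_t (on PREEMPT_RT an ordinary spinlock_t turns into a sleeping lock, while raw_spinlock_t keeps spinning, so it remains usable in atomic printk paths), and ignore_loglevel and console_suspend gain runtime-writable module parameters. A minimal module sketch combining both idioms follows; the names are hypothetical and the code is not part of the patch.

#include <linux/module.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);
static unsigned long example_counter;

/* Readable/writable boot or runtime parameter, same style as above. */
static bool example_enabled = true;
module_param(example_enabled, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(example_enabled, "enable the example counter");

static int __init example_init(void)
{
        unsigned long flags;

        if (example_enabled) {
                raw_spin_lock_irqsave(&example_lock, flags);
                example_counter++;      /* critical section, IRQs off */
                raw_spin_unlock_irqrestore(&example_lock, flags);
        }
        pr_info("example_counter = %lu\n", example_counter);
        return 0;
}

static void __exit example_exit(void)
{
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

After loading, the parameter appears under /sys/module/<module>/parameters/ and can be flipped at runtime, just as ignore_loglevel and printk.console_suspend now can.
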
diff --git a/kernel/profile.c b/kernel/profile.c
index 961b389fe52f..76b8e77773ee 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -13,7 +13,7 @@
13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/export.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/bootmem.h> 18#include <linux/bootmem.h>
19#include <linux/notifier.h> 19#include <linux/notifier.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c890ac9a7962..00ab2ca5ed11 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -8,7 +8,7 @@
8 */ 8 */
9 9
10#include <linux/capability.h> 10#include <linux/capability.h>
11#include <linux/module.h> 11#include <linux/export.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/errno.h> 13#include <linux/errno.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
@@ -96,9 +96,20 @@ void __ptrace_unlink(struct task_struct *child)
96 */ 96 */
97 if (!(child->flags & PF_EXITING) && 97 if (!(child->flags & PF_EXITING) &&
98 (child->signal->flags & SIGNAL_STOP_STOPPED || 98 (child->signal->flags & SIGNAL_STOP_STOPPED ||
99 child->signal->group_stop_count)) 99 child->signal->group_stop_count)) {
100 child->jobctl |= JOBCTL_STOP_PENDING; 100 child->jobctl |= JOBCTL_STOP_PENDING;
101 101
102 /*
103 * This is only possible if this thread was cloned by the
104 * traced task running in the stopped group, set the signal
105 * for the future reports.
106 * FIXME: we should change ptrace_init_task() to handle this
107 * case.
108 */
109 if (!(child->jobctl & JOBCTL_STOP_SIGMASK))
110 child->jobctl |= SIGSTOP;
111 }
112
102 /* 113 /*
103 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick 114 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
104 * @child in the butt. Note that @resume should be used iff @child 115 * @child in the butt. Note that @resume should be used iff @child
diff --git a/kernel/range.c b/kernel/range.c
index 37fa9b99ad58..9b8ae2d6ed68 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Range add and subtract 2 * Range add and subtract
3 */ 3 */
4#include <linux/module.h> 4#include <linux/kernel.h>
5#include <linux/init.h> 5#include <linux/init.h>
6#include <linux/sort.h> 6#include <linux/sort.h>
7 7
diff --git a/kernel/rcu.h b/kernel/rcu.h
new file mode 100644
index 000000000000..aa88baab5f78
--- /dev/null
+++ b/kernel/rcu.h
@@ -0,0 +1,92 @@
1/*
2 * Read-Copy Update definitions shared among RCU implementations.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2011
19 *
20 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23#ifndef __LINUX_RCU_H
24#define __LINUX_RCU_H
25
26#ifdef CONFIG_RCU_TRACE
27#define RCU_TRACE(stmt) stmt
28#else /* #ifdef CONFIG_RCU_TRACE */
29#define RCU_TRACE(stmt)
30#endif /* #else #ifdef CONFIG_RCU_TRACE */
31
32/*
33 * Process-level increment to ->dynticks_nesting field. This allows for
34 * architectures that use half-interrupts and half-exceptions from
35 * process context.
36 */
37#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1)
38
39/*
40 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
41 * by call_rcu() and rcu callback execution, and are therefore not part of the
42 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
43 */
44
45#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
46# define STATE_RCU_HEAD_READY 0
47# define STATE_RCU_HEAD_QUEUED 1
48
49extern struct debug_obj_descr rcuhead_debug_descr;
50
51static inline void debug_rcu_head_queue(struct rcu_head *head)
52{
53 WARN_ON_ONCE((unsigned long)head & 0x3);
54 debug_object_activate(head, &rcuhead_debug_descr);
55 debug_object_active_state(head, &rcuhead_debug_descr,
56 STATE_RCU_HEAD_READY,
57 STATE_RCU_HEAD_QUEUED);
58}
59
60static inline void debug_rcu_head_unqueue(struct rcu_head *head)
61{
62 debug_object_active_state(head, &rcuhead_debug_descr,
63 STATE_RCU_HEAD_QUEUED,
64 STATE_RCU_HEAD_READY);
65 debug_object_deactivate(head, &rcuhead_debug_descr);
66}
67#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
68static inline void debug_rcu_head_queue(struct rcu_head *head)
69{
70}
71
72static inline void debug_rcu_head_unqueue(struct rcu_head *head)
73{
74}
75#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
76
77extern void kfree(const void *);
78
79static inline void __rcu_reclaim(char *rn, struct rcu_head *head)
80{
81 unsigned long offset = (unsigned long)head->func;
82
83 if (__is_kfree_rcu_offset(offset)) {
84 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
85 kfree((void *)head - offset);
86 } else {
87 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
88 head->func(head);
89 }
90}
91
92#endif /* __LINUX_RCU_H */
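
__rcu_reclaim() above relies on the kfree_rcu() convention: instead of a real callback, head->func may hold the small offset of the rcu_head within its enclosing object, which __is_kfree_rcu_offset() detects so the object can be kfree()d directly; otherwise the function pointer is invoked as usual. A minimal module sketch showing both callback flavors follows; the names are hypothetical and the code is not part of the patch.

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct example_obj {
        int payload;
        struct rcu_head rcu;    /* kfree_rcu() encodes this field's offset */
};

static void example_free_cb(struct rcu_head *head)
{
        /* Ordinary callback: __rcu_reclaim() calls it through head->func. */
        kfree(container_of(head, struct example_obj, rcu));
}

static int __init example_init(void)
{
        struct example_obj *a = kmalloc(sizeof(*a), GFP_KERNEL);
        struct example_obj *b = kmalloc(sizeof(*b), GFP_KERNEL);

        if (!a || !b) {
                kfree(a);
                kfree(b);
                return -ENOMEM;
        }
        kfree_rcu(a, rcu);                      /* offset-encoded: kfree()d directly */
        call_rcu(&b->rcu, example_free_cb);     /* function-pointer callback */
        return 0;
}

static void __exit example_exit(void)
{
        rcu_barrier();  /* make sure the callbacks have run before unload */
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
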
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index ddddb320be61..2bc4e135ff23 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -43,9 +43,14 @@
43#include <linux/notifier.h> 43#include <linux/notifier.h>
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/export.h>
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/rcu.h>
51
52#include "rcu.h"
53
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 54#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 55static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map = 56struct lockdep_map rcu_lock_map =
@@ -88,17 +93,24 @@ int rcu_read_lock_bh_held(void)
88{ 93{
89 if (!debug_lockdep_rcu_enabled()) 94 if (!debug_lockdep_rcu_enabled())
90 return 1; 95 return 1;
96 if (rcu_is_cpu_idle())
97 return 0;
91 return in_softirq() || irqs_disabled(); 98 return in_softirq() || irqs_disabled();
92} 99}
93EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 100EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
94 101
95#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 102#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
96 103
104struct rcu_synchronize {
105 struct rcu_head head;
106 struct completion completion;
107};
108
97/* 109/*
98 * Awaken the corresponding synchronize_rcu() instance now that a 110 * Awaken the corresponding synchronize_rcu() instance now that a
99 * grace period has elapsed. 111 * grace period has elapsed.
100 */ 112 */
101void wakeme_after_rcu(struct rcu_head *head) 113static void wakeme_after_rcu(struct rcu_head *head)
102{ 114{
103 struct rcu_synchronize *rcu; 115 struct rcu_synchronize *rcu;
104 116
@@ -106,6 +118,20 @@ void wakeme_after_rcu(struct rcu_head *head)
106 complete(&rcu->completion); 118 complete(&rcu->completion);
107} 119}
108 120
121void wait_rcu_gp(call_rcu_func_t crf)
122{
123 struct rcu_synchronize rcu;
124
125 init_rcu_head_on_stack(&rcu.head);
126 init_completion(&rcu.completion);
127 /* Will wake me after RCU finished. */
128 crf(&rcu.head, wakeme_after_rcu);
129 /* Wait for it. */
130 wait_for_completion(&rcu.completion);
131 destroy_rcu_head_on_stack(&rcu.head);
132}
133EXPORT_SYMBOL_GPL(wait_rcu_gp);
134
109#ifdef CONFIG_PROVE_RCU 135#ifdef CONFIG_PROVE_RCU
110/* 136/*
111 * wrapper function to avoid #include problems. 137 * wrapper function to avoid #include problems.
@@ -292,3 +318,13 @@ struct debug_obj_descr rcuhead_debug_descr = {
292}; 318};
293EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 319EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
294#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 320#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
321
322#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
323void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
324{
325 trace_rcu_torture_read(rcutorturename, rhp);
326}
327EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
328#else
329#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
330#endif
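
wait_rcu_gp() above factors out the open-coded pattern of posting a wakeme_after_rcu() callback and sleeping on a completion; callers just pass the call_rcu_*() variant whose grace period they want to wait for. A minimal fragment follows; the function name is hypothetical, and it assumes wait_rcu_gp() and call_rcu_func_t are declared in <linux/rcupdate.h> as in this series.

#include <linux/kernel.h>
#include <linux/rcupdate.h>

/* Wait for one full rcu_bh grace period, built on the new helper. */
static void example_wait_for_bh_grace_period(void)
{
        might_sleep();          /* wait_rcu_gp() blocks on a completion */
        wait_rcu_gp(call_rcu_bh);
}
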
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 7bbac7d0f5ab..977296dca0a4 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -22,13 +22,12 @@
22 * For detailed explanation of Read-Copy Update mechanism see - 22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU 23 * Documentation/RCU
24 */ 24 */
25#include <linux/moduleparam.h>
26#include <linux/completion.h> 25#include <linux/completion.h>
27#include <linux/interrupt.h> 26#include <linux/interrupt.h>
28#include <linux/notifier.h> 27#include <linux/notifier.h>
29#include <linux/rcupdate.h> 28#include <linux/rcupdate.h>
30#include <linux/kernel.h> 29#include <linux/kernel.h>
31#include <linux/module.h> 30#include <linux/export.h>
32#include <linux/mutex.h> 31#include <linux/mutex.h>
33#include <linux/sched.h> 32#include <linux/sched.h>
34#include <linux/types.h> 33#include <linux/types.h>
@@ -37,47 +36,154 @@
37#include <linux/cpu.h> 36#include <linux/cpu.h>
38#include <linux/prefetch.h> 37#include <linux/prefetch.h>
39 38
40/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ 39#ifdef CONFIG_RCU_TRACE
41static struct task_struct *rcu_kthread_task; 40#include <trace/events/rcu.h>
42static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); 41#endif /* #ifdef CONFIG_RCU_TRACE */
43static unsigned long have_rcu_kthread_work; 42
43#include "rcu.h"
44 44
45/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 46struct rcu_ctrlblk;
47static void invoke_rcu_kthread(void); 47static void invoke_rcu_callbacks(void);
48static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
49static int rcu_kthread(void *arg); 49static void rcu_process_callbacks(struct softirq_action *unused);
50static void __call_rcu(struct rcu_head *head, 50static void __call_rcu(struct rcu_head *head,
51 void (*func)(struct rcu_head *rcu), 51 void (*func)(struct rcu_head *rcu),
52 struct rcu_ctrlblk *rcp); 52 struct rcu_ctrlblk *rcp);
53 53
54#include "rcutiny_plugin.h" 54#include "rcutiny_plugin.h"
55 55
56#ifdef CONFIG_NO_HZ 56static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
57 57
58static long rcu_dynticks_nesting = 1; 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long oldval)
60{
61 if (rcu_dynticks_nesting) {
62 RCU_TRACE(trace_rcu_dyntick("--=",
63 oldval, rcu_dynticks_nesting));
64 return;
65 }
66 RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting));
67 if (!is_idle_task(current)) {
68 struct task_struct *idle = idle_task(smp_processor_id());
69
70 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
71 oldval, rcu_dynticks_nesting));
72 ftrace_dump(DUMP_ALL);
73 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
74 current->pid, current->comm,
75 idle->pid, idle->comm); /* must be idle task! */
76 }
77 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
78}
59 79
60/* 80/*
61 * Enter dynticks-idle mode, which is an extended quiescent state 81 * Enter idle, which is an extended quiescent state if we have fully
62 * if we have fully entered that mode (i.e., if the new value of 82 * entered that mode (i.e., if the new value of dynticks_nesting is zero).
63 * dynticks_nesting is zero).
64 */ 83 */
65void rcu_enter_nohz(void) 84void rcu_idle_enter(void)
66{ 85{
67 if (--rcu_dynticks_nesting == 0) 86 unsigned long flags;
68 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ 87 long long oldval;
88
89 local_irq_save(flags);
90 oldval = rcu_dynticks_nesting;
91 rcu_dynticks_nesting = 0;
92 rcu_idle_enter_common(oldval);
93 local_irq_restore(flags);
69} 94}
70 95
71/* 96/*
72 * Exit dynticks-idle mode, so that we are no longer in an extended 97 * Exit an interrupt handler towards idle.
73 * quiescent state. 98 */
99void rcu_irq_exit(void)
100{
101 unsigned long flags;
102 long long oldval;
103
104 local_irq_save(flags);
105 oldval = rcu_dynticks_nesting;
106 rcu_dynticks_nesting--;
107 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
108 rcu_idle_enter_common(oldval);
109 local_irq_restore(flags);
110}
111
112/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
113static void rcu_idle_exit_common(long long oldval)
114{
115 if (oldval) {
116 RCU_TRACE(trace_rcu_dyntick("++=",
117 oldval, rcu_dynticks_nesting));
118 return;
119 }
120 RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting));
121 if (!is_idle_task(current)) {
122 struct task_struct *idle = idle_task(smp_processor_id());
123
124 RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task",
125 oldval, rcu_dynticks_nesting));
126 ftrace_dump(DUMP_ALL);
127 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
128 current->pid, current->comm,
129 idle->pid, idle->comm); /* must be idle task! */
130 }
131}
132
133/*
134 * Exit idle, so that we are no longer in an extended quiescent state.
74 */ 135 */
75void rcu_exit_nohz(void) 136void rcu_idle_exit(void)
76{ 137{
138 unsigned long flags;
139 long long oldval;
140
141 local_irq_save(flags);
142 oldval = rcu_dynticks_nesting;
143 WARN_ON_ONCE(oldval != 0);
144 rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
145 rcu_idle_exit_common(oldval);
146 local_irq_restore(flags);
147}
148
149/*
150 * Enter an interrupt handler, moving away from idle.
151 */
152void rcu_irq_enter(void)
153{
154 unsigned long flags;
155 long long oldval;
156
157 local_irq_save(flags);
158 oldval = rcu_dynticks_nesting;
77 rcu_dynticks_nesting++; 159 rcu_dynticks_nesting++;
160 WARN_ON_ONCE(rcu_dynticks_nesting == 0);
161 rcu_idle_exit_common(oldval);
162 local_irq_restore(flags);
163}
164
165#ifdef CONFIG_PROVE_RCU
166
167/*
168 * Test whether RCU thinks that the current CPU is idle.
169 */
170int rcu_is_cpu_idle(void)
171{
172 return !rcu_dynticks_nesting;
78} 173}
174EXPORT_SYMBOL(rcu_is_cpu_idle);
79 175
80#endif /* #ifdef CONFIG_NO_HZ */ 176#endif /* #ifdef CONFIG_PROVE_RCU */
177
178/*
179 * Test whether the current CPU was interrupted from idle. Nested
180 * interrupts don't count, we must be running at the first interrupt
181 * level.
182 */
183int rcu_is_cpu_rrupt_from_idle(void)
184{
185 return rcu_dynticks_nesting <= 0;
186}
81 187
82/* 188/*
83 * Helper function for rcu_sched_qs() and rcu_bh_qs(). 189 * Helper function for rcu_sched_qs() and rcu_bh_qs().
@@ -96,16 +202,6 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
96} 202}
97 203
98/* 204/*
99 * Wake up rcu_kthread() to process callbacks now eligible for invocation
100 * or to boost readers.
101 */
102static void invoke_rcu_kthread(void)
103{
104 have_rcu_kthread_work = 1;
105 wake_up(&rcu_kthread_wq);
106}
107
108/*
109 * Record an rcu quiescent state. And an rcu_bh quiescent state while we 205 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
110 * are at it, given that any rcu quiescent state is also an rcu_bh 206 * are at it, given that any rcu quiescent state is also an rcu_bh
111 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 207 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
@@ -117,7 +213,7 @@ void rcu_sched_qs(int cpu)
117 local_irq_save(flags); 213 local_irq_save(flags);
118 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 214 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
119 rcu_qsctr_help(&rcu_bh_ctrlblk)) 215 rcu_qsctr_help(&rcu_bh_ctrlblk))
120 invoke_rcu_kthread(); 216 invoke_rcu_callbacks();
121 local_irq_restore(flags); 217 local_irq_restore(flags);
122} 218}
123 219
@@ -130,20 +226,19 @@ void rcu_bh_qs(int cpu)
130 226
131 local_irq_save(flags); 227 local_irq_save(flags);
132 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 228 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
133 invoke_rcu_kthread(); 229 invoke_rcu_callbacks();
134 local_irq_restore(flags); 230 local_irq_restore(flags);
135} 231}
136 232
137/* 233/*
138 * Check to see if the scheduling-clock interrupt came from an extended 234 * Check to see if the scheduling-clock interrupt came from an extended
139 * quiescent state, and, if so, tell RCU about it. 235 * quiescent state, and, if so, tell RCU about it. This function must
236 * be called from hardirq context. It is normally called from the
237 * scheduling-clock interrupt.
140 */ 238 */
141void rcu_check_callbacks(int cpu, int user) 239void rcu_check_callbacks(int cpu, int user)
142{ 240{
143 if (user || 241 if (user || rcu_is_cpu_rrupt_from_idle())
144 (idle_cpu(cpu) &&
145 !in_softirq() &&
146 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
147 rcu_sched_qs(cpu); 242 rcu_sched_qs(cpu);
148 else if (!in_softirq()) 243 else if (!in_softirq())
149 rcu_bh_qs(cpu); 244 rcu_bh_qs(cpu);
@@ -154,18 +249,27 @@ void rcu_check_callbacks(int cpu, int user)
154 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure 249 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
155 * whose grace period has elapsed. 250 * whose grace period has elapsed.
156 */ 251 */
157static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) 252static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
158{ 253{
254 char *rn = NULL;
159 struct rcu_head *next, *list; 255 struct rcu_head *next, *list;
160 unsigned long flags; 256 unsigned long flags;
161 RCU_TRACE(int cb_count = 0); 257 RCU_TRACE(int cb_count = 0);
162 258
163 /* If no RCU callbacks ready to invoke, just return. */ 259 /* If no RCU callbacks ready to invoke, just return. */
164 if (&rcp->rcucblist == rcp->donetail) 260 if (&rcp->rcucblist == rcp->donetail) {
261 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
262 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
263 ACCESS_ONCE(rcp->rcucblist),
264 need_resched(),
265 is_idle_task(current),
266 rcu_is_callbacks_kthread()));
165 return; 267 return;
268 }
166 269
167 /* Move the ready-to-invoke callbacks to a local list. */ 270 /* Move the ready-to-invoke callbacks to a local list. */
168 local_irq_save(flags); 271 local_irq_save(flags);
272 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
169 list = rcp->rcucblist; 273 list = rcp->rcucblist;
170 rcp->rcucblist = *rcp->donetail; 274 rcp->rcucblist = *rcp->donetail;
171 *rcp->donetail = NULL; 275 *rcp->donetail = NULL;
@@ -176,49 +280,28 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
176 local_irq_restore(flags); 280 local_irq_restore(flags);
177 281
178 /* Invoke the callbacks on the local list. */ 282 /* Invoke the callbacks on the local list. */
283 RCU_TRACE(rn = rcp->name);
179 while (list) { 284 while (list) {
180 next = list->next; 285 next = list->next;
181 prefetch(next); 286 prefetch(next);
182 debug_rcu_head_unqueue(list); 287 debug_rcu_head_unqueue(list);
183 local_bh_disable(); 288 local_bh_disable();
184 __rcu_reclaim(list); 289 __rcu_reclaim(rn, list);
185 local_bh_enable(); 290 local_bh_enable();
186 list = next; 291 list = next;
187 RCU_TRACE(cb_count++); 292 RCU_TRACE(cb_count++);
188 } 293 }
189 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 294 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
295 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
296 is_idle_task(current),
297 rcu_is_callbacks_kthread()));
190} 298}
191 299
192/* 300static void rcu_process_callbacks(struct softirq_action *unused)
193 * This kthread invokes RCU callbacks whose grace periods have
194 * elapsed. It is awakened as needed, and takes the place of the
195 * RCU_SOFTIRQ that was used previously for this purpose.
196 * This is a kthread, but it is never stopped, at least not until
197 * the system goes down.
198 */
199static int rcu_kthread(void *arg)
200{ 301{
201 unsigned long work; 302 __rcu_process_callbacks(&rcu_sched_ctrlblk);
202 unsigned long morework; 303 __rcu_process_callbacks(&rcu_bh_ctrlblk);
203 unsigned long flags; 304 rcu_preempt_process_callbacks();
204
205 for (;;) {
206 wait_event_interruptible(rcu_kthread_wq,
207 have_rcu_kthread_work != 0);
208 morework = rcu_boost();
209 local_irq_save(flags);
210 work = have_rcu_kthread_work;
211 have_rcu_kthread_work = morework;
212 local_irq_restore(flags);
213 if (work) {
214 rcu_process_callbacks(&rcu_sched_ctrlblk);
215 rcu_process_callbacks(&rcu_bh_ctrlblk);
216 rcu_preempt_process_callbacks();
217 }
218 schedule_timeout_interruptible(1); /* Leave CPU for others. */
219 }
220
221 return 0; /* Not reached, but needed to shut gcc up. */
222} 305}
223 306
224/* 307/*
@@ -280,45 +363,3 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
280 __call_rcu(head, func, &rcu_bh_ctrlblk); 363 __call_rcu(head, func, &rcu_bh_ctrlblk);
281} 364}
282EXPORT_SYMBOL_GPL(call_rcu_bh); 365EXPORT_SYMBOL_GPL(call_rcu_bh);
283
284void rcu_barrier_bh(void)
285{
286 struct rcu_synchronize rcu;
287
288 init_rcu_head_on_stack(&rcu.head);
289 init_completion(&rcu.completion);
290 /* Will wake me after RCU finished. */
291 call_rcu_bh(&rcu.head, wakeme_after_rcu);
292 /* Wait for it. */
293 wait_for_completion(&rcu.completion);
294 destroy_rcu_head_on_stack(&rcu.head);
295}
296EXPORT_SYMBOL_GPL(rcu_barrier_bh);
297
298void rcu_barrier_sched(void)
299{
300 struct rcu_synchronize rcu;
301
302 init_rcu_head_on_stack(&rcu.head);
303 init_completion(&rcu.completion);
304 /* Will wake me after RCU finished. */
305 call_rcu_sched(&rcu.head, wakeme_after_rcu);
306 /* Wait for it. */
307 wait_for_completion(&rcu.completion);
308 destroy_rcu_head_on_stack(&rcu.head);
309}
310EXPORT_SYMBOL_GPL(rcu_barrier_sched);
311
312/*
313 * Spawn the kthread that invokes RCU callbacks.
314 */
315static int __init rcu_spawn_kthreads(void)
316{
317 struct sched_param sp;
318
319 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
320 sp.sched_priority = RCU_BOOST_PRIO;
321 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
322 return 0;
323}
324early_initcall(rcu_spawn_kthreads);
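
The rcutiny.c rework above replaces the NO_HZ-only rcu_enter_nohz()/rcu_exit_nohz() pair with rcu_idle_enter()/rcu_idle_exit() plus rcu_irq_enter()/rcu_irq_exit(), all driven by one signed nesting counter: it starts at a large task-level bias (DYNTICK_TASK_NESTING), is forced to zero when the idle loop is entered, and only the value zero means the CPU is in an extended quiescent state. A small userspace model of that bookkeeping follows; it is illustrative only, and the bias value is a stand-in for the real constant.

#include <stdio.h>
#include <assert.h>

#define DYNTICK_TASK_NESTING (1LL << 40)        /* stand-in for the real bias */

static long long dynticks_nesting = DYNTICK_TASK_NESTING;

/* Mirrors rcu_is_cpu_idle(): idle only when the counter is exactly zero. */
static int cpu_is_idle_for_rcu(void)
{
        return dynticks_nesting == 0;
}

static void idle_enter(void) { dynticks_nesting = 0; }
static void idle_exit(void)  { dynticks_nesting = DYNTICK_TASK_NESTING; }
static void irq_enter_(void) { dynticks_nesting++; }
static void irq_exit_(void)  { dynticks_nesting--; assert(dynticks_nesting >= 0); }

int main(void)
{
        idle_enter();                                   /* CPU enters the idle loop */
        printf("idle:      %d\n", cpu_is_idle_for_rcu());       /* 1 */
        irq_enter_();                                   /* interrupt taken from idle */
        printf("in irq:    %d\n", cpu_is_idle_for_rcu());       /* 0: RCU readers legal */
        irq_exit_();                                    /* back to idle */
        printf("after irq: %d\n", cpu_is_idle_for_rcu());       /* 1 */
        idle_exit();                                    /* leaving the idle loop */
        printf("running:   %d\n", cpu_is_idle_for_rcu());       /* 0 */
        return 0;
}
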
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f259c676195f..9cb1ae4aabdd 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -23,32 +23,30 @@
23 */ 23 */
24 24
25#include <linux/kthread.h> 25#include <linux/kthread.h>
26#include <linux/module.h>
26#include <linux/debugfs.h> 27#include <linux/debugfs.h>
27#include <linux/seq_file.h> 28#include <linux/seq_file.h>
28 29
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
35/* Global control variables for rcupdate callback mechanism. */ 30/* Global control variables for rcupdate callback mechanism. */
36struct rcu_ctrlblk { 31struct rcu_ctrlblk {
37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ 32 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
39 struct rcu_head **curtail; /* ->next pointer of last CB. */ 34 struct rcu_head **curtail; /* ->next pointer of last CB. */
40 RCU_TRACE(long qlen); /* Number of pending CBs. */ 35 RCU_TRACE(long qlen); /* Number of pending CBs. */
36 RCU_TRACE(char *name); /* Name of RCU type. */
41}; 37};
42 38
43/* Definition for rcupdate control block. */ 39/* Definition for rcupdate control block. */
44static struct rcu_ctrlblk rcu_sched_ctrlblk = { 40static struct rcu_ctrlblk rcu_sched_ctrlblk = {
45 .donetail = &rcu_sched_ctrlblk.rcucblist, 41 .donetail = &rcu_sched_ctrlblk.rcucblist,
46 .curtail = &rcu_sched_ctrlblk.rcucblist, 42 .curtail = &rcu_sched_ctrlblk.rcucblist,
43 RCU_TRACE(.name = "rcu_sched")
47}; 44};
48 45
49static struct rcu_ctrlblk rcu_bh_ctrlblk = { 46static struct rcu_ctrlblk rcu_bh_ctrlblk = {
50 .donetail = &rcu_bh_ctrlblk.rcucblist, 47 .donetail = &rcu_bh_ctrlblk.rcucblist,
51 .curtail = &rcu_bh_ctrlblk.rcucblist, 48 .curtail = &rcu_bh_ctrlblk.rcucblist,
49 RCU_TRACE(.name = "rcu_bh")
52}; 50};
53 51
54#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -131,6 +129,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
131 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, 129 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
132 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, 130 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
133 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), 131 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
132 RCU_TRACE(.rcb.name = "rcu_preempt")
134}; 133};
135 134
136static int rcu_preempted_readers_exp(void); 135static int rcu_preempted_readers_exp(void);
@@ -247,6 +246,13 @@ static void show_tiny_preempt_stats(struct seq_file *m)
247 246
248#include "rtmutex_common.h" 247#include "rtmutex_common.h"
249 248
249#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
250
251/* Controls for rcu_kthread() kthread. */
252static struct task_struct *rcu_kthread_task;
253static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
254static unsigned long have_rcu_kthread_work;
255
250/* 256/*
251 * Carry out RCU priority boosting on the task indicated by ->boost_tasks, 257 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
252 * and advance ->boost_tasks to the next task in the ->blkd_tasks list. 258 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
@@ -306,8 +312,8 @@ static int rcu_boost(void)
306 rt_mutex_lock(&mtx); 312 rt_mutex_lock(&mtx);
307 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
308 314
309 return rcu_preempt_ctrlblk.boost_tasks != NULL || 315 return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
310 rcu_preempt_ctrlblk.exp_tasks != NULL; 316 ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
311} 317}
312 318
313/* 319/*
@@ -334,7 +340,7 @@ static int rcu_initiate_boost(void)
334 if (rcu_preempt_ctrlblk.exp_tasks == NULL) 340 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
335 rcu_preempt_ctrlblk.boost_tasks = 341 rcu_preempt_ctrlblk.boost_tasks =
336 rcu_preempt_ctrlblk.gp_tasks; 342 rcu_preempt_ctrlblk.gp_tasks;
337 invoke_rcu_kthread(); 343 invoke_rcu_callbacks();
338 } else 344 } else
339 RCU_TRACE(rcu_initiate_boost_trace()); 345 RCU_TRACE(rcu_initiate_boost_trace());
340 return 1; 346 return 1;
@@ -353,14 +359,6 @@ static void rcu_preempt_boost_start_gp(void)
353#else /* #ifdef CONFIG_RCU_BOOST */ 359#else /* #ifdef CONFIG_RCU_BOOST */
354 360
355/* 361/*
356 * If there is no RCU priority boosting, we don't boost.
357 */
358static int rcu_boost(void)
359{
360 return 0;
361}
362
363/*
364 * If there is no RCU priority boosting, we don't initiate boosting, 362 * If there is no RCU priority boosting, we don't initiate boosting,
365 * but we do indicate whether there are blocked readers blocking the 363 * but we do indicate whether there are blocked readers blocking the
366 * current grace period. 364 * current grace period.
@@ -427,7 +425,7 @@ static void rcu_preempt_cpu_qs(void)
427 425
428 /* If there are done callbacks, cause them to be invoked. */ 426 /* If there are done callbacks, cause them to be invoked. */
429 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) 427 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
430 invoke_rcu_kthread(); 428 invoke_rcu_callbacks();
431} 429}
432 430
433/* 431/*
@@ -648,7 +646,7 @@ static void rcu_preempt_check_callbacks(void)
648 rcu_preempt_cpu_qs(); 646 rcu_preempt_cpu_qs();
649 if (&rcu_preempt_ctrlblk.rcb.rcucblist != 647 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
650 rcu_preempt_ctrlblk.rcb.donetail) 648 rcu_preempt_ctrlblk.rcb.donetail)
651 invoke_rcu_kthread(); 649 invoke_rcu_callbacks();
652 if (rcu_preempt_gp_in_progress() && 650 if (rcu_preempt_gp_in_progress() &&
653 rcu_cpu_blocking_cur_gp() && 651 rcu_cpu_blocking_cur_gp() &&
654 rcu_preempt_running_reader()) 652 rcu_preempt_running_reader())
@@ -674,7 +672,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
674 */ 672 */
675static void rcu_preempt_process_callbacks(void) 673static void rcu_preempt_process_callbacks(void)
676{ 674{
677 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); 675 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
678} 676}
679 677
680/* 678/*
@@ -697,20 +695,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
697} 695}
698EXPORT_SYMBOL_GPL(call_rcu); 696EXPORT_SYMBOL_GPL(call_rcu);
699 697
700void rcu_barrier(void)
701{
702 struct rcu_synchronize rcu;
703
704 init_rcu_head_on_stack(&rcu.head);
705 init_completion(&rcu.completion);
706 /* Will wake me after RCU finished. */
707 call_rcu(&rcu.head, wakeme_after_rcu);
708 /* Wait for it. */
709 wait_for_completion(&rcu.completion);
710 destroy_rcu_head_on_stack(&rcu.head);
711}
712EXPORT_SYMBOL_GPL(rcu_barrier);
713
714/* 698/*
715 * synchronize_rcu - wait until a grace period has elapsed. 699 * synchronize_rcu - wait until a grace period has elapsed.
716 * 700 *
@@ -864,15 +848,6 @@ static void show_tiny_preempt_stats(struct seq_file *m)
864#endif /* #ifdef CONFIG_RCU_TRACE */ 848#endif /* #ifdef CONFIG_RCU_TRACE */
865 849
866/* 850/*
867 * Because preemptible RCU does not exist, it is never necessary to
868 * boost preempted RCU readers.
869 */
870static int rcu_boost(void)
871{
872 return 0;
873}
874
875/*
876 * Because preemptible RCU does not exist, it never has any callbacks 851 * Because preemptible RCU does not exist, it never has any callbacks
877 * to check. 852 * to check.
878 */ 853 */
@@ -898,6 +873,103 @@ static void rcu_preempt_process_callbacks(void)
898 873
899#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ 874#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
900 875
876#ifdef CONFIG_RCU_BOOST
877
878/*
879 * Wake up rcu_kthread() to process callbacks now eligible for invocation
880 * or to boost readers.
881 */
882static void invoke_rcu_callbacks(void)
883{
884 have_rcu_kthread_work = 1;
885 wake_up(&rcu_kthread_wq);
886}
887
888#ifdef CONFIG_RCU_TRACE
889
890/*
891 * Is the current CPU running the RCU-callbacks kthread?
892 * Caller must have preemption disabled.
893 */
894static bool rcu_is_callbacks_kthread(void)
895{
896 return rcu_kthread_task == current;
897}
898
899#endif /* #ifdef CONFIG_RCU_TRACE */
900
901/*
902 * This kthread invokes RCU callbacks whose grace periods have
903 * elapsed. It is awakened as needed, and takes the place of the
904 * RCU_SOFTIRQ that is used for this purpose when boosting is disabled.
905 * This is a kthread, but it is never stopped, at least not until
906 * the system goes down.
907 */
908static int rcu_kthread(void *arg)
909{
910 unsigned long work;
911 unsigned long morework;
912 unsigned long flags;
913
914 for (;;) {
915 wait_event_interruptible(rcu_kthread_wq,
916 have_rcu_kthread_work != 0);
917 morework = rcu_boost();
918 local_irq_save(flags);
919 work = have_rcu_kthread_work;
920 have_rcu_kthread_work = morework;
921 local_irq_restore(flags);
922 if (work)
923 rcu_process_callbacks(NULL);
924 schedule_timeout_interruptible(1); /* Leave CPU for others. */
925 }
926
927 return 0; /* Not reached, but needed to shut gcc up. */
928}
929
930/*
931 * Spawn the kthread that invokes RCU callbacks.
932 */
933static int __init rcu_spawn_kthreads(void)
934{
935 struct sched_param sp;
936
937 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
938 sp.sched_priority = RCU_BOOST_PRIO;
939 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
940 return 0;
941}
942early_initcall(rcu_spawn_kthreads);
943
944#else /* #ifdef CONFIG_RCU_BOOST */
945
946/*
947 * Start up softirq processing of callbacks.
948 */
949void invoke_rcu_callbacks(void)
950{
951 raise_softirq(RCU_SOFTIRQ);
952}
953
954#ifdef CONFIG_RCU_TRACE
955
956/*
957 * There is no callback kthread, so this thread is never it.
958 */
959static bool rcu_is_callbacks_kthread(void)
960{
961 return false;
962}
963
964#endif /* #ifdef CONFIG_RCU_TRACE */
965
966void rcu_init(void)
967{
968 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
969}
970
971#endif /* #else #ifdef CONFIG_RCU_BOOST */
972
901#ifdef CONFIG_DEBUG_LOCK_ALLOC 973#ifdef CONFIG_DEBUG_LOCK_ALLOC
902#include <linux/kernel_stat.h> 974#include <linux/kernel_stat.h>
903 975
@@ -913,12 +985,6 @@ void __init rcu_scheduler_starting(void)
913 985
914#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 986#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
915 987
916#ifdef CONFIG_RCU_BOOST
917#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
918#else /* #ifdef CONFIG_RCU_BOOST */
919#define RCU_BOOST_PRIO 1
920#endif /* #else #ifdef CONFIG_RCU_BOOST */
921
922#ifdef CONFIG_RCU_TRACE 988#ifdef CONFIG_RCU_TRACE
923 989
924#ifdef CONFIG_RCU_BOOST 990#ifdef CONFIG_RCU_BOOST
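
With the reorganisation above, invoke_rcu_callbacks() either raises RCU_SOFTIRQ (the !CONFIG_RCU_BOOST case) or sets a flag and wakes rcu_kthread() (the CONFIG_RCU_BOOST case). A minimal module sketch of that flag-and-wake kthread pattern follows; the names are hypothetical and the code is not part of the patch.

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>

static struct task_struct *example_task;
static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static unsigned long example_have_work;

static void example_kick(void)          /* analogue of invoke_rcu_callbacks() */
{
        example_have_work = 1;
        wake_up(&example_wq);
}

static int example_thread(void *unused)
{
        while (!kthread_should_stop()) {
                wait_event_interruptible(example_wq,
                                         example_have_work ||
                                         kthread_should_stop());
                example_have_work = 0;
                pr_info("example_thread: processing work\n");
        }
        return 0;
}

static int __init example_init(void)
{
        example_task = kthread_run(example_thread, NULL, "example_thread");
        if (IS_ERR(example_task))
                return PTR_ERR(example_task);
        example_kick();
        return 0;
}

static void __exit example_exit(void)
{
        kthread_stop(example_task);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
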
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 98f51b13bb7e..88f17b8a3b1d 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,9 +61,11 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
67static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 69static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
68static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 70static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
69static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 71static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
@@ -73,7 +75,7 @@ module_param(nreaders, int, 0444);
73MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 75MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
74module_param(nfakewriters, int, 0444); 76module_param(nfakewriters, int, 0444);
75MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); 77MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
76module_param(stat_interval, int, 0444); 78module_param(stat_interval, int, 0644);
77MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 79MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
78module_param(verbose, bool, 0444); 80module_param(verbose, bool, 0444);
79MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 81MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
@@ -91,6 +93,10 @@ module_param(fqs_holdoff, int, 0444);
91MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 93MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
92module_param(fqs_stutter, int, 0444); 94module_param(fqs_stutter, int, 0444);
93MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 95MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
96module_param(onoff_interval, int, 0444);
97MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
98module_param(shutdown_secs, int, 0444);
99MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
94module_param(test_boost, int, 0444); 100module_param(test_boost, int, 0444);
95MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 101MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
96module_param(test_boost_interval, int, 0444); 102module_param(test_boost_interval, int, 0444);
@@ -119,6 +125,10 @@ static struct task_struct *shuffler_task;
119static struct task_struct *stutter_task; 125static struct task_struct *stutter_task;
120static struct task_struct *fqs_task; 126static struct task_struct *fqs_task;
121static struct task_struct *boost_tasks[NR_CPUS]; 127static struct task_struct *boost_tasks[NR_CPUS];
128static struct task_struct *shutdown_task;
129#ifdef CONFIG_HOTPLUG_CPU
130static struct task_struct *onoff_task;
131#endif /* #ifdef CONFIG_HOTPLUG_CPU */
122 132
123#define RCU_TORTURE_PIPE_LEN 10 133#define RCU_TORTURE_PIPE_LEN 10
124 134
@@ -149,6 +159,10 @@ static long n_rcu_torture_boost_rterror;
149static long n_rcu_torture_boost_failure; 159static long n_rcu_torture_boost_failure;
150static long n_rcu_torture_boosts; 160static long n_rcu_torture_boosts;
151static long n_rcu_torture_timers; 161static long n_rcu_torture_timers;
162static long n_offline_attempts;
163static long n_offline_successes;
164static long n_online_attempts;
165static long n_online_successes;
152static struct list_head rcu_torture_removed; 166static struct list_head rcu_torture_removed;
153static cpumask_var_t shuffle_tmp_mask; 167static cpumask_var_t shuffle_tmp_mask;
154 168
@@ -160,6 +174,8 @@ static int stutter_pause_test;
160#define RCUTORTURE_RUNNABLE_INIT 0 174#define RCUTORTURE_RUNNABLE_INIT 0
161#endif 175#endif
162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 176int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
177module_param(rcutorture_runnable, int, 0444);
178MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
163 179
164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) 180#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
165#define rcu_can_boost() 1 181#define rcu_can_boost() 1
@@ -167,6 +183,7 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
167#define rcu_can_boost() 0 183#define rcu_can_boost() 0
168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 184#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169 185
186static unsigned long shutdown_time; /* jiffies to system shutdown. */
170static unsigned long boost_starttime; /* jiffies of next boost test start. */ 187static unsigned long boost_starttime; /* jiffies of next boost test start. */
171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 188DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
172 /* and boost task create/destroy. */ 189 /* and boost task create/destroy. */
@@ -182,6 +199,9 @@ static int fullstop = FULLSTOP_RMMOD;
182 */ 199 */
183static DEFINE_MUTEX(fullstop_mutex); 200static DEFINE_MUTEX(fullstop_mutex);
184 201
202/* Forward reference. */
203static void rcu_torture_cleanup(void);
204
185/* 205/*
186 * Detect and respond to a system shutdown. 206 * Detect and respond to a system shutdown.
187 */ 207 */
@@ -480,30 +500,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
480 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); 500 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
481} 501}
482 502
483struct rcu_bh_torture_synchronize {
484 struct rcu_head head;
485 struct completion completion;
486};
487
488static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head)
489{
490 struct rcu_bh_torture_synchronize *rcu;
491
492 rcu = container_of(head, struct rcu_bh_torture_synchronize, head);
493 complete(&rcu->completion);
494}
495
496static void rcu_bh_torture_synchronize(void)
497{
498 struct rcu_bh_torture_synchronize rcu;
499
500 init_rcu_head_on_stack(&rcu.head);
501 init_completion(&rcu.completion);
502 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
503 wait_for_completion(&rcu.completion);
504 destroy_rcu_head_on_stack(&rcu.head);
505}
506
507static struct rcu_torture_ops rcu_bh_ops = { 503static struct rcu_torture_ops rcu_bh_ops = {
508 .init = NULL, 504 .init = NULL,
509 .cleanup = NULL, 505 .cleanup = NULL,
@@ -512,7 +508,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
512 .readunlock = rcu_bh_torture_read_unlock, 508 .readunlock = rcu_bh_torture_read_unlock,
513 .completed = rcu_bh_torture_completed, 509 .completed = rcu_bh_torture_completed,
514 .deferred_free = rcu_bh_torture_deferred_free, 510 .deferred_free = rcu_bh_torture_deferred_free,
515 .sync = rcu_bh_torture_synchronize, 511 .sync = synchronize_rcu_bh,
516 .cb_barrier = rcu_barrier_bh, 512 .cb_barrier = rcu_barrier_bh,
517 .fqs = rcu_bh_force_quiescent_state, 513 .fqs = rcu_bh_force_quiescent_state,
518 .stats = NULL, 514 .stats = NULL,
@@ -528,7 +524,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
528 .readunlock = rcu_bh_torture_read_unlock, 524 .readunlock = rcu_bh_torture_read_unlock,
529 .completed = rcu_bh_torture_completed, 525 .completed = rcu_bh_torture_completed,
530 .deferred_free = rcu_sync_torture_deferred_free, 526 .deferred_free = rcu_sync_torture_deferred_free,
531 .sync = rcu_bh_torture_synchronize, 527 .sync = synchronize_rcu_bh,
532 .cb_barrier = NULL, 528 .cb_barrier = NULL,
533 .fqs = rcu_bh_force_quiescent_state, 529 .fqs = rcu_bh_force_quiescent_state,
534 .stats = NULL, 530 .stats = NULL,
@@ -536,6 +532,22 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
536 .name = "rcu_bh_sync" 532 .name = "rcu_bh_sync"
537}; 533};
538 534
535static struct rcu_torture_ops rcu_bh_expedited_ops = {
536 .init = rcu_sync_torture_init,
537 .cleanup = NULL,
538 .readlock = rcu_bh_torture_read_lock,
539 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
540 .readunlock = rcu_bh_torture_read_unlock,
541 .completed = rcu_bh_torture_completed,
542 .deferred_free = rcu_sync_torture_deferred_free,
543 .sync = synchronize_rcu_bh_expedited,
544 .cb_barrier = NULL,
545 .fqs = rcu_bh_force_quiescent_state,
546 .stats = NULL,
547 .irq_capable = 1,
548 .name = "rcu_bh_expedited"
549};
550
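The hunks above simplify the rcu_bh flavor's test table: the hand-rolled rcu_bh_torture_synchronize() wrapper (a callback plus a completion) is deleted, the .sync slots point straight at synchronize_rcu_bh(), and a new rcu_bh_expedited_ops table exercises synchronize_rcu_bh_expedited() through the same layout. A minimal userspace sketch of the ops-table idea, with purely illustrative names, looks like this:

#include <stdio.h>

struct torture_ops {
        void (*sync)(void);             /* wait for a grace period */
        const char *name;
};

static void demo_synchronize(void)
{
        printf("grace period elapsed\n");
}

static struct torture_ops demo_ops = {
        .sync = demo_synchronize,       /* plugged in directly, no wrapper needed */
        .name = "demo",
};

int main(void)
{
        printf("torture-testing flavor %s\n", demo_ops.name);
        demo_ops.sync();
        return 0;
}

Because every flavor is reached only through such a table, adding rcu_bh_expedited_ops to the torture_ops[] array in rcu_torture_init() is all that is needed to make it selectable via the torture_type module parameter.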
539/* 551/*
540 * Definitions for srcu torture testing. 552 * Definitions for srcu torture testing.
541 */ 553 */
@@ -620,6 +632,30 @@ static struct rcu_torture_ops srcu_ops = {
620 .name = "srcu" 632 .name = "srcu"
621}; 633};
622 634
635static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
636{
637 return srcu_read_lock_raw(&srcu_ctl);
638}
639
640static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
641{
642 srcu_read_unlock_raw(&srcu_ctl, idx);
643}
644
645static struct rcu_torture_ops srcu_raw_ops = {
646 .init = srcu_torture_init,
647 .cleanup = srcu_torture_cleanup,
648 .readlock = srcu_torture_read_lock_raw,
649 .read_delay = srcu_read_delay,
650 .readunlock = srcu_torture_read_unlock_raw,
651 .completed = srcu_torture_completed,
652 .deferred_free = rcu_sync_torture_deferred_free,
653 .sync = srcu_torture_synchronize,
654 .cb_barrier = NULL,
655 .stats = srcu_torture_stats,
656 .name = "srcu_raw"
657};
658
623static void srcu_torture_synchronize_expedited(void) 659static void srcu_torture_synchronize_expedited(void)
624{ 660{
625 synchronize_srcu_expedited(&srcu_ctl); 661 synchronize_srcu_expedited(&srcu_ctl);
@@ -659,11 +695,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
659 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 695 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
660} 696}
661 697
662static void sched_torture_synchronize(void)
663{
664 synchronize_sched();
665}
666
667static struct rcu_torture_ops sched_ops = { 698static struct rcu_torture_ops sched_ops = {
668 .init = rcu_sync_torture_init, 699 .init = rcu_sync_torture_init,
669 .cleanup = NULL, 700 .cleanup = NULL,
@@ -672,7 +703,7 @@ static struct rcu_torture_ops sched_ops = {
672 .readunlock = sched_torture_read_unlock, 703 .readunlock = sched_torture_read_unlock,
673 .completed = rcu_no_completed, 704 .completed = rcu_no_completed,
674 .deferred_free = rcu_sched_torture_deferred_free, 705 .deferred_free = rcu_sched_torture_deferred_free,
675 .sync = sched_torture_synchronize, 706 .sync = synchronize_sched,
676 .cb_barrier = rcu_barrier_sched, 707 .cb_barrier = rcu_barrier_sched,
677 .fqs = rcu_sched_force_quiescent_state, 708 .fqs = rcu_sched_force_quiescent_state,
678 .stats = NULL, 709 .stats = NULL,
@@ -688,7 +719,7 @@ static struct rcu_torture_ops sched_sync_ops = {
688 .readunlock = sched_torture_read_unlock, 719 .readunlock = sched_torture_read_unlock,
689 .completed = rcu_no_completed, 720 .completed = rcu_no_completed,
690 .deferred_free = rcu_sync_torture_deferred_free, 721 .deferred_free = rcu_sync_torture_deferred_free,
691 .sync = sched_torture_synchronize, 722 .sync = synchronize_sched,
692 .cb_barrier = NULL, 723 .cb_barrier = NULL,
693 .fqs = rcu_sched_force_quiescent_state, 724 .fqs = rcu_sched_force_quiescent_state,
694 .stats = NULL, 725 .stats = NULL,
@@ -754,7 +785,7 @@ static int rcu_torture_boost(void *arg)
754 do { 785 do {
755 /* Wait for the next test interval. */ 786 /* Wait for the next test interval. */
756 oldstarttime = boost_starttime; 787 oldstarttime = boost_starttime;
757 while (jiffies - oldstarttime > ULONG_MAX / 2) { 788 while (ULONG_CMP_LT(jiffies, oldstarttime)) {
758 schedule_timeout_uninterruptible(1); 789 schedule_timeout_uninterruptible(1);
759 rcu_stutter_wait("rcu_torture_boost"); 790 rcu_stutter_wait("rcu_torture_boost");
760 if (kthread_should_stop() || 791 if (kthread_should_stop() ||
@@ -765,7 +796,7 @@ static int rcu_torture_boost(void *arg)
765 /* Do one boost-test interval. */ 796 /* Do one boost-test interval. */
766 endtime = oldstarttime + test_boost_duration * HZ; 797 endtime = oldstarttime + test_boost_duration * HZ;
767 call_rcu_time = jiffies; 798 call_rcu_time = jiffies;
768 while (jiffies - endtime > ULONG_MAX / 2) { 799 while (ULONG_CMP_LT(jiffies, endtime)) {
769 /* If we don't have a callback in flight, post one. */ 800 /* If we don't have a callback in flight, post one. */
770 if (!rbi.inflight) { 801 if (!rbi.inflight) {
771 smp_mb(); /* RCU core before ->inflight = 1. */ 802 smp_mb(); /* RCU core before ->inflight = 1. */
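Both boost-test loops above replace the open-coded wraparound test (jiffies - t > ULONG_MAX / 2) with the ULONG_CMP_LT() helper, which expresses the same modular comparison by name, and the loops now also bail out when kthread_should_stop() is set so that module unload is not delayed by a full test interval. The point of either form is that a plain "jiffies < deadline" breaks once the jiffies counter wraps; a small standalone demonstration (the macro body below mirrors the kernel's definition and is reproduced only for illustration):

#include <stdio.h>
#include <limits.h>

/* Wrap-safe "a is before b", as used for jiffies deadlines. */
#define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

int main(void)
{
        unsigned long now = ULONG_MAX - 5;      /* counter just before wrapping */
        unsigned long deadline = now + 10;      /* deadline lands after the wrap */

        /* The naive comparison thinks the deadline has already passed... */
        printf("naive now < deadline: %d\n", now < deadline);          /* 0 */
        /* ...while the modular comparison still sees it in the future. */
        printf("ULONG_CMP_LT(now, deadline): %d\n",
               ULONG_CMP_LT(now, deadline));                           /* 1 */
        return 0;
}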
@@ -792,7 +823,8 @@ static int rcu_torture_boost(void *arg)
792 * interval. Besides, we are running at RT priority, 823 * interval. Besides, we are running at RT priority,
793 * so delays should be relatively rare. 824 * so delays should be relatively rare.
794 */ 825 */
795 while (oldstarttime == boost_starttime) { 826 while (oldstarttime == boost_starttime &&
827 !kthread_should_stop()) {
796 if (mutex_trylock(&boost_mutex)) { 828 if (mutex_trylock(&boost_mutex)) {
797 boost_starttime = jiffies + 829 boost_starttime = jiffies +
798 test_boost_interval * HZ; 830 test_boost_interval * HZ;
@@ -809,11 +841,11 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
809 841
810 /* Clean up and exit. */ 842 /* Clean up and exit. */
811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 843 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu);
813 rcutorture_shutdown_absorb("rcu_torture_boost"); 844 rcutorture_shutdown_absorb("rcu_torture_boost");
814 while (!kthread_should_stop() || rbi.inflight) 845 while (!kthread_should_stop() || rbi.inflight)
815 schedule_timeout_uninterruptible(1); 846 schedule_timeout_uninterruptible(1);
816 smp_mb(); /* order accesses to ->inflight before stack-frame death. */ 847 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
848 destroy_rcu_head_on_stack(&rbi.rcu);
817 return 0; 849 return 0;
818} 850}
819 851
@@ -831,11 +863,13 @@ rcu_torture_fqs(void *arg)
831 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); 863 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
832 do { 864 do {
833 fqs_resume_time = jiffies + fqs_stutter * HZ; 865 fqs_resume_time = jiffies + fqs_stutter * HZ;
834 while (jiffies - fqs_resume_time > LONG_MAX) { 866 while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
867 !kthread_should_stop()) {
835 schedule_timeout_interruptible(1); 868 schedule_timeout_interruptible(1);
836 } 869 }
837 fqs_burst_remaining = fqs_duration; 870 fqs_burst_remaining = fqs_duration;
838 while (fqs_burst_remaining > 0) { 871 while (fqs_burst_remaining > 0 &&
872 !kthread_should_stop()) {
839 cur_ops->fqs(); 873 cur_ops->fqs();
840 udelay(fqs_holdoff); 874 udelay(fqs_holdoff);
841 fqs_burst_remaining -= fqs_holdoff; 875 fqs_burst_remaining -= fqs_holdoff;
@@ -923,6 +957,18 @@ rcu_torture_fakewriter(void *arg)
923 return 0; 957 return 0;
924} 958}
925 959
960void rcutorture_trace_dump(void)
961{
962 static atomic_t beenhere = ATOMIC_INIT(0);
963
964 if (atomic_read(&beenhere))
965 return;
966 if (atomic_xchg(&beenhere, 1) != 0)
967 return;
968 do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
969 ftrace_dump(DUMP_ALL);
970}
971
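rcutorture_trace_dump() uses a classic one-shot guard: a cheap atomic_read() early exit plus an atomic_xchg() tie-breaker, so that when several readers detect a too-long pipeline at the same time only the first of them triggers the expensive ftrace_dump(). The same pattern in standalone C11 (illustrative only):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int beenhere;                     /* zero until the first dump */

static void dump_once(void)
{
        if (atomic_load(&beenhere))
                return;                         /* already dumped, cheap exit */
        if (atomic_exchange(&beenhere, 1) != 0)
                return;                         /* lost the race to another caller */
        printf("dumping trace buffer exactly once\n");
}

int main(void)
{
        dump_once();
        dump_once();                            /* silently does nothing */
        return 0;
}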
926/* 972/*
927 * RCU torture reader from timer handler. Dereferences rcu_torture_current, 973 * RCU torture reader from timer handler. Dereferences rcu_torture_current,
928 * incrementing the corresponding element of the pipeline array. The 974 * incrementing the corresponding element of the pipeline array. The
@@ -944,6 +990,7 @@ static void rcu_torture_timer(unsigned long unused)
944 rcu_read_lock_bh_held() || 990 rcu_read_lock_bh_held() ||
945 rcu_read_lock_sched_held() || 991 rcu_read_lock_sched_held() ||
946 srcu_read_lock_held(&srcu_ctl)); 992 srcu_read_lock_held(&srcu_ctl));
993 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
947 if (p == NULL) { 994 if (p == NULL) {
948 /* Leave because rcu_torture_writer is not yet underway */ 995 /* Leave because rcu_torture_writer is not yet underway */
949 cur_ops->readunlock(idx); 996 cur_ops->readunlock(idx);
@@ -961,6 +1008,8 @@ static void rcu_torture_timer(unsigned long unused)
961 /* Should not happen, but... */ 1008 /* Should not happen, but... */
962 pipe_count = RCU_TORTURE_PIPE_LEN; 1009 pipe_count = RCU_TORTURE_PIPE_LEN;
963 } 1010 }
1011 if (pipe_count > 1)
1012 rcutorture_trace_dump();
964 __this_cpu_inc(rcu_torture_count[pipe_count]); 1013 __this_cpu_inc(rcu_torture_count[pipe_count]);
965 completed = cur_ops->completed() - completed; 1014 completed = cur_ops->completed() - completed;
966 if (completed > RCU_TORTURE_PIPE_LEN) { 1015 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -1004,6 +1053,7 @@ rcu_torture_reader(void *arg)
1004 rcu_read_lock_bh_held() || 1053 rcu_read_lock_bh_held() ||
1005 rcu_read_lock_sched_held() || 1054 rcu_read_lock_sched_held() ||
1006 srcu_read_lock_held(&srcu_ctl)); 1055 srcu_read_lock_held(&srcu_ctl));
1056 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1007 if (p == NULL) { 1057 if (p == NULL) {
1008 /* Wait for rcu_torture_writer to get underway */ 1058 /* Wait for rcu_torture_writer to get underway */
1009 cur_ops->readunlock(idx); 1059 cur_ops->readunlock(idx);
@@ -1019,6 +1069,8 @@ rcu_torture_reader(void *arg)
1019 /* Should not happen, but... */ 1069 /* Should not happen, but... */
1020 pipe_count = RCU_TORTURE_PIPE_LEN; 1070 pipe_count = RCU_TORTURE_PIPE_LEN;
1021 } 1071 }
1072 if (pipe_count > 1)
1073 rcutorture_trace_dump();
1022 __this_cpu_inc(rcu_torture_count[pipe_count]); 1074 __this_cpu_inc(rcu_torture_count[pipe_count]);
1023 completed = cur_ops->completed() - completed; 1075 completed = cur_ops->completed() - completed;
1024 if (completed > RCU_TORTURE_PIPE_LEN) { 1076 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -1066,7 +1118,8 @@ rcu_torture_printk(char *page)
1066 cnt += sprintf(&page[cnt], 1118 cnt += sprintf(&page[cnt],
1067 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1119 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1068 "rtmbe: %d rtbke: %ld rtbre: %ld " 1120 "rtmbe: %d rtbke: %ld rtbre: %ld "
1069 "rtbf: %ld rtb: %ld nt: %ld", 1121 "rtbf: %ld rtb: %ld nt: %ld "
1122 "onoff: %ld/%ld:%ld/%ld",
1070 rcu_torture_current, 1123 rcu_torture_current,
1071 rcu_torture_current_version, 1124 rcu_torture_current_version,
1072 list_empty(&rcu_torture_freelist), 1125 list_empty(&rcu_torture_freelist),
@@ -1078,7 +1131,11 @@ rcu_torture_printk(char *page)
1078 n_rcu_torture_boost_rterror, 1131 n_rcu_torture_boost_rterror,
1079 n_rcu_torture_boost_failure, 1132 n_rcu_torture_boost_failure,
1080 n_rcu_torture_boosts, 1133 n_rcu_torture_boosts,
1081 n_rcu_torture_timers); 1134 n_rcu_torture_timers,
1135 n_online_successes,
1136 n_online_attempts,
1137 n_offline_successes,
1138 n_offline_attempts);
1082 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1139 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1083 n_rcu_torture_boost_ktrerror != 0 || 1140 n_rcu_torture_boost_ktrerror != 0 ||
1084 n_rcu_torture_boost_rterror != 0 || 1141 n_rcu_torture_boost_rterror != 0 ||
@@ -1242,12 +1299,14 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1242 "shuffle_interval=%d stutter=%d irqreader=%d " 1299 "shuffle_interval=%d stutter=%d irqreader=%d "
1243 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1300 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1244 "test_boost=%d/%d test_boost_interval=%d " 1301 "test_boost=%d/%d test_boost_interval=%d "
1245 "test_boost_duration=%d\n", 1302 "test_boost_duration=%d shutdown_secs=%d "
1303 "onoff_interval=%d\n",
1246 torture_type, tag, nrealreaders, nfakewriters, 1304 torture_type, tag, nrealreaders, nfakewriters,
1247 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1305 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1248 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1306 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1249 test_boost, cur_ops->can_boost, 1307 test_boost, cur_ops->can_boost,
1250 test_boost_interval, test_boost_duration); 1308 test_boost_interval, test_boost_duration, shutdown_secs,
1309 onoff_interval);
1251} 1310}
1252 1311
1253static struct notifier_block rcutorture_shutdown_nb = { 1312static struct notifier_block rcutorture_shutdown_nb = {
@@ -1280,8 +1339,9 @@ static int rcutorture_booster_init(int cpu)
1280 /* Don't allow time recalculation while creating a new task. */ 1339 /* Don't allow time recalculation while creating a new task. */
1281 mutex_lock(&boost_mutex); 1340 mutex_lock(&boost_mutex);
1282 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); 1341 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1283 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, 1342 boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
1284 "rcu_torture_boost"); 1343 cpu_to_node(cpu),
1344 "rcu_torture_boost");
1285 if (IS_ERR(boost_tasks[cpu])) { 1345 if (IS_ERR(boost_tasks[cpu])) {
1286 retval = PTR_ERR(boost_tasks[cpu]); 1346 retval = PTR_ERR(boost_tasks[cpu]);
1287 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); 1347 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
@@ -1296,6 +1356,131 @@ static int rcutorture_booster_init(int cpu)
1296 return 0; 1356 return 0;
1297} 1357}
1298 1358
1359/*
1360 * Cause the rcutorture test to shutdown the system after the test has
1361 * run for the time specified by the shutdown_secs module parameter.
1362 */
1363static int
1364rcu_torture_shutdown(void *arg)
1365{
1366 long delta;
1367 unsigned long jiffies_snap;
1368
1369 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
1370 jiffies_snap = ACCESS_ONCE(jiffies);
1371 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
1372 !kthread_should_stop()) {
1373 delta = shutdown_time - jiffies_snap;
1374 if (verbose)
1375 printk(KERN_ALERT "%s" TORTURE_FLAG
1376 "rcu_torture_shutdown task: %lu "
1377 "jiffies remaining\n",
1378 torture_type, delta);
1379 schedule_timeout_interruptible(delta);
1380 jiffies_snap = ACCESS_ONCE(jiffies);
1381 }
1382 if (kthread_should_stop()) {
1383 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
1384 return 0;
1385 }
1386
1387 /* OK, shut down the system. */
1388
1389 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
1390 shutdown_task = NULL; /* Avoid self-kill deadlock. */
1391 rcu_torture_cleanup(); /* Get the success/failure message. */
1392 kernel_power_off(); /* Shut down the system. */
1393 return 0;
1394}
1395
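The new rcu_torture_shutdown() kthread lets an automated run terminate itself: it sleeps in chunks until shutdown_time, re-reading jiffies on every pass so that kthread_stop() during module unload is honored promptly, then clears shutdown_task before calling rcu_torture_cleanup() (so cleanup does not try to stop the very thread that is running it) and finally powers the machine off. A trivial userspace sketch of the deadline-sleep loop only (names are illustrative):

#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        time_t deadline = time(NULL) + 5;       /* stands in for shutdown_time */

        while (time(NULL) < deadline) {
                printf("%ld seconds remaining\n", (long)(deadline - time(NULL)));
                sleep(1);                       /* schedule_timeout_interruptible() */
        }
        printf("deadline reached: a real test run would power off here\n");
        return 0;
}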
1396#ifdef CONFIG_HOTPLUG_CPU
1397
1398/*
1399 * Execute random CPU-hotplug operations at the interval specified
1400 * by the onoff_interval.
1401 */
1402static int
1403rcu_torture_onoff(void *arg)
1404{
1405 int cpu;
1406 int maxcpu = -1;
1407 DEFINE_RCU_RANDOM(rand);
1408
1409 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
1410 for_each_online_cpu(cpu)
1411 maxcpu = cpu;
1412 WARN_ON(maxcpu < 0);
1413 while (!kthread_should_stop()) {
1414 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1415 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
1416 if (verbose)
1417 printk(KERN_ALERT "%s" TORTURE_FLAG
1418 "rcu_torture_onoff task: offlining %d\n",
1419 torture_type, cpu);
1420 n_offline_attempts++;
1421 if (cpu_down(cpu) == 0) {
1422 if (verbose)
1423 printk(KERN_ALERT "%s" TORTURE_FLAG
1424 "rcu_torture_onoff task: "
1425 "offlined %d\n",
1426 torture_type, cpu);
1427 n_offline_successes++;
1428 }
1429 } else if (cpu_is_hotpluggable(cpu)) {
1430 if (verbose)
1431 printk(KERN_ALERT "%s" TORTURE_FLAG
1432 "rcu_torture_onoff task: onlining %d\n",
1433 torture_type, cpu);
1434 n_online_attempts++;
1435 if (cpu_up(cpu) == 0) {
1436 if (verbose)
1437 printk(KERN_ALERT "%s" TORTURE_FLAG
1438 "rcu_torture_onoff task: "
1439 "onlined %d\n",
1440 torture_type, cpu);
1441 n_online_successes++;
1442 }
1443 }
1444 schedule_timeout_interruptible(onoff_interval * HZ);
1445 }
1446 VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
1447 return 0;
1448}
1449
1450static int
1451rcu_torture_onoff_init(void)
1452{
1453 if (onoff_interval <= 0)
1454 return 0;
1455 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1456 if (IS_ERR(onoff_task)) {
1457 onoff_task = NULL;
1458 return PTR_ERR(onoff_task);
1459 }
1460 return 0;
1461}
1462
1463static void rcu_torture_onoff_cleanup(void)
1464{
1465 if (onoff_task == NULL)
1466 return;
1467 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1468 kthread_stop(onoff_task);
1469}
1470
1471#else /* #ifdef CONFIG_HOTPLUG_CPU */
1472
1473static void
1474rcu_torture_onoff_init(void)
1475{
1476}
1477
1478static void rcu_torture_onoff_cleanup(void)
1479{
1480}
1481
1482#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1483
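The CONFIG_HOTPLUG_CPU block above adds a second new kthread that, every onoff_interval seconds, picks a CPU at random and tries to offline it if it is online (or online it otherwise), bumping the attempt and success counters that the statistics hunk prints as "onoff: %ld/%ld:%ld/%ld". The same kind of hotplug stress can be approximated from user space through sysfs, which is handy for reproducing what the torture thread does (sketch only; needs root and a hotpluggable CPU, which CPU 0 frequently is not):

#include <stdio.h>

static int set_cpu_online(int cpu, int online)
{
        char path[64];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu%d/online", cpu);
        f = fopen(path, "w");
        if (!f)
                return -1;                      /* not hotpluggable or not root */
        fprintf(f, "%d\n", online);
        return fclose(f) ? -1 : 0;
}

int main(void)
{
        int cpu = 1;                            /* arbitrary victim CPU */

        if (set_cpu_online(cpu, 0) == 0)
                printf("offlined cpu%d\n", cpu);
        if (set_cpu_online(cpu, 1) == 0)
                printf("onlined cpu%d\n", cpu);
        return 0;
}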
1299static int rcutorture_cpu_notify(struct notifier_block *self, 1484static int rcutorture_cpu_notify(struct notifier_block *self,
1300 unsigned long action, void *hcpu) 1485 unsigned long action, void *hcpu)
1301{ 1486{
@@ -1400,6 +1585,11 @@ rcu_torture_cleanup(void)
1400 for_each_possible_cpu(i) 1585 for_each_possible_cpu(i)
1401 rcutorture_booster_cleanup(i); 1586 rcutorture_booster_cleanup(i);
1402 } 1587 }
1588 if (shutdown_task != NULL) {
1589 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1590 kthread_stop(shutdown_task);
1591 }
1592 rcu_torture_onoff_cleanup();
1403 1593
1404 /* Wait for all RCU callbacks to fire. */ 1594 /* Wait for all RCU callbacks to fire. */
1405 1595
@@ -1424,8 +1614,8 @@ rcu_torture_init(void)
1424 int firsterr = 0; 1614 int firsterr = 0;
1425 static struct rcu_torture_ops *torture_ops[] = 1615 static struct rcu_torture_ops *torture_ops[] =
1426 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1616 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1427 &rcu_bh_ops, &rcu_bh_sync_ops, 1617 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1428 &srcu_ops, &srcu_expedited_ops, 1618 &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops,
1429 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1619 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1430 1620
1431 mutex_lock(&fullstop_mutex); 1621 mutex_lock(&fullstop_mutex);
@@ -1616,6 +1806,18 @@ rcu_torture_init(void)
1616 } 1806 }
1617 } 1807 }
1618 } 1808 }
1809 if (shutdown_secs > 0) {
1810 shutdown_time = jiffies + shutdown_secs * HZ;
1811 shutdown_task = kthread_run(rcu_torture_shutdown, NULL,
1812 "rcu_torture_shutdown");
1813 if (IS_ERR(shutdown_task)) {
1814 firsterr = PTR_ERR(shutdown_task);
1815 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
1816 shutdown_task = NULL;
1817 goto unwind;
1818 }
1819 }
1820 rcu_torture_onoff_init();
1619 register_reboot_notifier(&rcutorture_shutdown_nb); 1821 register_reboot_notifier(&rcutorture_shutdown_nb);
1620 rcutorture_record_test_transition(); 1822 rcutorture_record_test_transition();
1621 mutex_unlock(&fullstop_mutex); 1823 mutex_unlock(&fullstop_mutex);
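Both new kthreads follow the usual create/teardown discipline visible in the init and cleanup hunks: kthread_run() reports failure as an error pointer, so the stored task pointer is nulled out before unwinding, and cleanup only ever calls kthread_stop() on a thread that was really created. The error-pointer convention itself is easy to demonstrate outside the kernel (the macro bodies below mirror the kernel's and are reproduced only for illustration):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO       4095
#define ERR_PTR(err)    ((void *)(long)(err))
#define PTR_ERR(ptr)    ((long)(ptr))
#define IS_ERR(ptr)     ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static int dummy_task;                          /* stands in for a task_struct */

static void *create_worker(int fail)
{
        return fail ? ERR_PTR(-ENOMEM) : (void *)&dummy_task;
}

int main(void)
{
        void *task = create_worker(1);

        if (IS_ERR(task)) {
                printf("creation failed: %ld\n", PTR_ERR(task));
                task = NULL;                    /* never kthread_stop() a bogus pointer */
        }
        return 0;
}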
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ba06207b1dd3..6c4a6722abfd 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -38,7 +38,7 @@
38#include <linux/nmi.h> 38#include <linux/nmi.h>
39#include <linux/atomic.h> 39#include <linux/atomic.h>
40#include <linux/bitops.h> 40#include <linux/bitops.h>
41#include <linux/module.h> 41#include <linux/export.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
43#include <linux/moduleparam.h> 43#include <linux/moduleparam.h>
44#include <linux/percpu.h> 44#include <linux/percpu.h>
@@ -52,13 +52,16 @@
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53 53
54#include "rcutree.h" 54#include "rcutree.h"
55#include <trace/events/rcu.h>
56
57#include "rcu.h"
55 58
56/* Data structures. */ 59/* Data structures. */
57 60
58static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 61static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
59 62
60#define RCU_STATE_INITIALIZER(structname) { \ 63#define RCU_STATE_INITIALIZER(structname) { \
61 .level = { &structname.node[0] }, \ 64 .level = { &structname##_state.node[0] }, \
62 .levelcnt = { \ 65 .levelcnt = { \
63 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 66 NUM_RCU_LVL_0, /* root of hierarchy. */ \
64 NUM_RCU_LVL_1, \ 67 NUM_RCU_LVL_1, \
@@ -66,20 +69,20 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
66 NUM_RCU_LVL_3, \ 69 NUM_RCU_LVL_3, \
67 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ 70 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
68 }, \ 71 }, \
69 .signaled = RCU_GP_IDLE, \ 72 .fqs_state = RCU_GP_IDLE, \
70 .gpnum = -300, \ 73 .gpnum = -300, \
71 .completed = -300, \ 74 .completed = -300, \
72 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ 75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ 76 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
74 .n_force_qs = 0, \ 77 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 78 .n_force_qs_ngp = 0, \
76 .name = #structname, \ 79 .name = #structname, \
77} 80}
78 81
79struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); 82struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched);
80DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81 84
82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 85struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh);
83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 86DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
84 87
85static struct rcu_state *rcu_state; 88static struct rcu_state *rcu_state;
@@ -128,8 +131,6 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
128static void invoke_rcu_core(void); 131static void invoke_rcu_core(void);
129static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 132static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
130 133
131#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
132
133/* 134/*
134 * Track the rcutorture test sequence number and the update version 135 * Track the rcutorture test sequence number and the update version
135 * number within a given test. The rcutorture_testseq is incremented 136 * number within a given test. The rcutorture_testseq is incremented
@@ -156,44 +157,50 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
156 * Note a quiescent state. Because we do not need to know 157 * Note a quiescent state. Because we do not need to know
157 * how many quiescent states passed, just if there was at least 158 * how many quiescent states passed, just if there was at least
158 * one since the start of the grace period, this just sets a flag. 159 * one since the start of the grace period, this just sets a flag.
160 * The caller must have disabled preemption.
159 */ 161 */
160void rcu_sched_qs(int cpu) 162void rcu_sched_qs(int cpu)
161{ 163{
162 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 164 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
163 165
164 rdp->passed_quiesc_completed = rdp->gpnum - 1; 166 rdp->passed_quiesce_gpnum = rdp->gpnum;
165 barrier(); 167 barrier();
166 rdp->passed_quiesc = 1; 168 if (rdp->passed_quiesce == 0)
169 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
170 rdp->passed_quiesce = 1;
167} 171}
168 172
169void rcu_bh_qs(int cpu) 173void rcu_bh_qs(int cpu)
170{ 174{
171 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 175 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
172 176
173 rdp->passed_quiesc_completed = rdp->gpnum - 1; 177 rdp->passed_quiesce_gpnum = rdp->gpnum;
174 barrier(); 178 barrier();
175 rdp->passed_quiesc = 1; 179 if (rdp->passed_quiesce == 0)
180 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
181 rdp->passed_quiesce = 1;
176} 182}
177 183
178/* 184/*
179 * Note a context switch. This is a quiescent state for RCU-sched, 185 * Note a context switch. This is a quiescent state for RCU-sched,
180 * and requires special handling for preemptible RCU. 186 * and requires special handling for preemptible RCU.
187 * The caller must have disabled preemption.
181 */ 188 */
182void rcu_note_context_switch(int cpu) 189void rcu_note_context_switch(int cpu)
183{ 190{
191 trace_rcu_utilization("Start context switch");
184 rcu_sched_qs(cpu); 192 rcu_sched_qs(cpu);
185 rcu_preempt_note_context_switch(cpu); 193 rcu_preempt_note_context_switch(cpu);
194 trace_rcu_utilization("End context switch");
186} 195}
187EXPORT_SYMBOL_GPL(rcu_note_context_switch); 196EXPORT_SYMBOL_GPL(rcu_note_context_switch);
188 197
189#ifdef CONFIG_NO_HZ
190DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 198DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
191 .dynticks_nesting = 1, 199 .dynticks_nesting = DYNTICK_TASK_NESTING,
192 .dynticks = ATOMIC_INIT(1), 200 .dynticks = ATOMIC_INIT(1),
193}; 201};
194#endif /* #ifdef CONFIG_NO_HZ */
195 202
196static int blimit = 10; /* Maximum callbacks per softirq. */ 203static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
197static int qhimark = 10000; /* If this many pending, ignore blimit. */ 204static int qhimark = 10000; /* If this many pending, ignore blimit. */
198static int qlowmark = 100; /* Once only this many pending, use blimit. */ 205static int qlowmark = 100; /* Once only this many pending, use blimit. */
199 206
@@ -314,15 +321,16 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
314 * trust its state not to change because interrupts are disabled. 321 * trust its state not to change because interrupts are disabled.
315 */ 322 */
316 if (cpu_is_offline(rdp->cpu)) { 323 if (cpu_is_offline(rdp->cpu)) {
324 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
317 rdp->offline_fqs++; 325 rdp->offline_fqs++;
318 return 1; 326 return 1;
319 } 327 }
320 328
321 /* If preemptible RCU, no point in sending reschedule IPI. */ 329 /*
322 if (rdp->preemptible) 330 * The CPU is online, so send it a reschedule IPI. This forces
323 return 0; 331 * it through the scheduler, and (inefficiently) also handles cases
324 332 * where idle loops fail to inform RCU about the CPU being idle.
325 /* The CPU is online, so send it a reschedule IPI. */ 333 */
326 if (rdp->cpu != smp_processor_id()) 334 if (rdp->cpu != smp_processor_id())
327 smp_send_reschedule(rdp->cpu); 335 smp_send_reschedule(rdp->cpu);
328 else 336 else
@@ -333,64 +341,181 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
333 341
334#endif /* #ifdef CONFIG_SMP */ 342#endif /* #ifdef CONFIG_SMP */
335 343
336#ifdef CONFIG_NO_HZ 344/*
345 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
346 *
347 * If the new value of the ->dynticks_nesting counter now is zero,
348 * we really have entered idle, and must do the appropriate accounting.
349 * The caller must have disabled interrupts.
350 */
351static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
352{
353 trace_rcu_dyntick("Start", oldval, 0);
354 if (!is_idle_task(current)) {
355 struct task_struct *idle = idle_task(smp_processor_id());
356
357 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
358 ftrace_dump(DUMP_ALL);
359 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
360 current->pid, current->comm,
361 idle->pid, idle->comm); /* must be idle task! */
362 }
363 rcu_prepare_for_idle(smp_processor_id());
364 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
365 smp_mb__before_atomic_inc(); /* See above. */
366 atomic_inc(&rdtp->dynticks);
367 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
368 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
369}
337 370
338/** 371/**
339 * rcu_enter_nohz - inform RCU that current CPU is entering nohz 372 * rcu_idle_enter - inform RCU that current CPU is entering idle
340 * 373 *
341 * Enter nohz mode, in other words, -leave- the mode in which RCU 374 * Enter idle mode, in other words, -leave- the mode in which RCU
342 * read-side critical sections can occur. (Though RCU read-side 375 * read-side critical sections can occur. (Though RCU read-side
343 * critical sections can occur in irq handlers in nohz mode, a possibility 376 * critical sections can occur in irq handlers in idle, a possibility
344 * handled by rcu_irq_enter() and rcu_irq_exit()). 377 * handled by irq_enter() and irq_exit().)
378 *
379 * We crowbar the ->dynticks_nesting field to zero to allow for
380 * the possibility of usermode upcalls having messed up our count
381 * of interrupt nesting level during the prior busy period.
345 */ 382 */
346void rcu_enter_nohz(void) 383void rcu_idle_enter(void)
347{ 384{
348 unsigned long flags; 385 unsigned long flags;
386 long long oldval;
349 struct rcu_dynticks *rdtp; 387 struct rcu_dynticks *rdtp;
350 388
351 local_irq_save(flags); 389 local_irq_save(flags);
352 rdtp = &__get_cpu_var(rcu_dynticks); 390 rdtp = &__get_cpu_var(rcu_dynticks);
353 if (--rdtp->dynticks_nesting) { 391 oldval = rdtp->dynticks_nesting;
354 local_irq_restore(flags); 392 rdtp->dynticks_nesting = 0;
355 return; 393 rcu_idle_enter_common(rdtp, oldval);
356 }
357 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
358 smp_mb__before_atomic_inc(); /* See above. */
359 atomic_inc(&rdtp->dynticks);
360 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
361 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
362 local_irq_restore(flags); 394 local_irq_restore(flags);
363
364 /* If the interrupt queued a callback, get out of dyntick mode. */
365 if (in_irq() &&
366 (__get_cpu_var(rcu_sched_data).nxtlist ||
367 __get_cpu_var(rcu_bh_data).nxtlist ||
368 rcu_preempt_needs_cpu(smp_processor_id())))
369 set_need_resched();
370} 395}
371 396
372/* 397/**
373 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz 398 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
399 *
400 * Exit from an interrupt handler, which might possibly result in entering
401 * idle mode, in other words, leaving the mode in which read-side critical
402 * sections can occur.
403 *
404 * This code assumes that the idle loop never does anything that might
405 * result in unbalanced calls to irq_enter() and irq_exit(). If your
406 * architecture violates this assumption, RCU will give you what you
407 * deserve, good and hard. But very infrequently and irreproducibly.
374 * 408 *
375 * Exit nohz mode, in other words, -enter- the mode in which RCU 409 * Use things like work queues to work around this limitation.
376 * read-side critical sections normally occur. 410 *
411 * You have been warned.
377 */ 412 */
378void rcu_exit_nohz(void) 413void rcu_irq_exit(void)
379{ 414{
380 unsigned long flags; 415 unsigned long flags;
416 long long oldval;
381 struct rcu_dynticks *rdtp; 417 struct rcu_dynticks *rdtp;
382 418
383 local_irq_save(flags); 419 local_irq_save(flags);
384 rdtp = &__get_cpu_var(rcu_dynticks); 420 rdtp = &__get_cpu_var(rcu_dynticks);
385 if (rdtp->dynticks_nesting++) { 421 oldval = rdtp->dynticks_nesting;
386 local_irq_restore(flags); 422 rdtp->dynticks_nesting--;
387 return; 423 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
388 } 424 if (rdtp->dynticks_nesting)
425 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
426 else
427 rcu_idle_enter_common(rdtp, oldval);
428 local_irq_restore(flags);
429}
430
431/*
432 * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle
433 *
434 * If the new value of the ->dynticks_nesting counter was previously zero,
435 * we really have exited idle, and must do the appropriate accounting.
436 * The caller must have disabled interrupts.
437 */
438static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
439{
389 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ 440 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
390 atomic_inc(&rdtp->dynticks); 441 atomic_inc(&rdtp->dynticks);
391 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 442 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
392 smp_mb__after_atomic_inc(); /* See above. */ 443 smp_mb__after_atomic_inc(); /* See above. */
393 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 444 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
445 rcu_cleanup_after_idle(smp_processor_id());
446 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
447 if (!is_idle_task(current)) {
448 struct task_struct *idle = idle_task(smp_processor_id());
449
450 trace_rcu_dyntick("Error on exit: not idle task",
451 oldval, rdtp->dynticks_nesting);
452 ftrace_dump(DUMP_ALL);
453 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
454 current->pid, current->comm,
455 idle->pid, idle->comm); /* must be idle task! */
456 }
457}
458
459/**
460 * rcu_idle_exit - inform RCU that current CPU is leaving idle
461 *
462 * Exit idle mode, in other words, -enter- the mode in which RCU
463 * read-side critical sections can occur.
464 *
465 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to
466 * allow for the possibility of usermode upcalls messing up our count
467 * of interrupt nesting level during the busy period that is just
468 * now starting.
469 */
470void rcu_idle_exit(void)
471{
472 unsigned long flags;
473 struct rcu_dynticks *rdtp;
474 long long oldval;
475
476 local_irq_save(flags);
477 rdtp = &__get_cpu_var(rcu_dynticks);
478 oldval = rdtp->dynticks_nesting;
479 WARN_ON_ONCE(oldval != 0);
480 rdtp->dynticks_nesting = DYNTICK_TASK_NESTING;
481 rcu_idle_exit_common(rdtp, oldval);
482 local_irq_restore(flags);
483}
484
485/**
486 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
487 *
488 * Enter an interrupt handler, which might possibly result in exiting
489 * idle mode, in other words, entering the mode in which read-side critical
490 * sections can occur.
491 *
492 * Note that the Linux kernel is fully capable of entering an interrupt
493 * handler that it never exits, for example when doing upcalls to
494 * user mode! This code assumes that the idle loop never does upcalls to
495 * user mode. If your architecture does do upcalls from the idle loop (or
496 * does anything else that results in unbalanced calls to the irq_enter()
497 * and irq_exit() functions), RCU will give you what you deserve, good
498 * and hard. But very infrequently and irreproducibly.
499 *
500 * Use things like work queues to work around this limitation.
501 *
502 * You have been warned.
503 */
504void rcu_irq_enter(void)
505{
506 unsigned long flags;
507 struct rcu_dynticks *rdtp;
508 long long oldval;
509
510 local_irq_save(flags);
511 rdtp = &__get_cpu_var(rcu_dynticks);
512 oldval = rdtp->dynticks_nesting;
513 rdtp->dynticks_nesting++;
514 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
515 if (oldval)
516 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
517 else
518 rcu_idle_exit_common(rdtp, oldval);
394 local_irq_restore(flags); 519 local_irq_restore(flags);
395} 520}
396 521
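The rewritten entry/exit paths above replace the old rcu_enter_nohz()/rcu_exit_nohz() pair with explicit rcu_idle_enter()/rcu_idle_exit() and rcu_irq_enter()/rcu_irq_exit() handlers driven by one ->dynticks_nesting count: idle entry crowbars it to zero (defending against unbalanced usermode upcalls), interrupts increment and decrement it, and the heavyweight accounting in rcu_idle_enter_common()/rcu_idle_exit_common() runs only on transitions to or from zero. Stripped of the atomics, warnings, and tracing, the control flow amounts to the sketch below (DYNTICK_TASK_NESTING's value here is a placeholder, not the kernel's):

#include <stdio.h>

#define DYNTICK_TASK_NESTING    1       /* placeholder value for illustration */

static long long nesting = DYNTICK_TASK_NESTING;

static void enter_idle_accounting(void) { printf("  RCU stops watching this CPU\n"); }
static void exit_idle_accounting(void)  { printf("  RCU resumes watching this CPU\n"); }

static void idle_enter(void)            /* rcu_idle_enter() */
{
        nesting = 0;                    /* crowbar, regardless of prior nesting */
        enter_idle_accounting();
}

static void idle_exit(void)             /* rcu_idle_exit() */
{
        nesting = DYNTICK_TASK_NESTING;
        exit_idle_accounting();
}

static void irq_enter_(void)            /* rcu_irq_enter() */
{
        if (nesting++ == 0)             /* first irq taken from idle */
                exit_idle_accounting();
}

static void irq_exit_(void)             /* rcu_irq_exit() */
{
        if (--nesting == 0)             /* last irq returning to idle */
                enter_idle_accounting();
}

int main(void)
{
        printf("idle loop entered:\n");   idle_enter();
        printf("interrupt from idle:\n"); irq_enter_();
        printf("interrupt returns:\n");   irq_exit_();
        printf("task scheduled:\n");      idle_exit();
        return 0;
}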
@@ -437,27 +562,37 @@ void rcu_nmi_exit(void)
437 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 562 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
438} 563}
439 564
565#ifdef CONFIG_PROVE_RCU
566
440/** 567/**
441 * rcu_irq_enter - inform RCU of entry to hard irq context 568 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
442 * 569 *
443 * If the CPU was idle with dynamic ticks active, this updates the 570 * If the current CPU is in its idle loop and is neither in an interrupt
444 * rdtp->dynticks to let the RCU handling know that the CPU is active. 571 * or NMI handler, return true.
445 */ 572 */
446void rcu_irq_enter(void) 573int rcu_is_cpu_idle(void)
447{ 574{
448 rcu_exit_nohz(); 575 int ret;
576
577 preempt_disable();
578 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
579 preempt_enable();
580 return ret;
449} 581}
582EXPORT_SYMBOL(rcu_is_cpu_idle);
583
584#endif /* #ifdef CONFIG_PROVE_RCU */
450 585
451/** 586/**
452 * rcu_irq_exit - inform RCU of exit from hard irq context 587 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
453 * 588 *
454 * If the CPU was idle with dynamic ticks active, update the rdp->dynticks 589 * If the current CPU is idle or running at a first-level (not nested)
455 * to put let the RCU handling be aware that the CPU is going back to idle 590 * interrupt from idle, return true. The caller must have at least
456 * with no ticks. 591 * disabled preemption.
457 */ 592 */
458void rcu_irq_exit(void) 593int rcu_is_cpu_rrupt_from_idle(void)
459{ 594{
460 rcu_enter_nohz(); 595 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
461} 596}
462 597
463#ifdef CONFIG_SMP 598#ifdef CONFIG_SMP
@@ -470,7 +605,7 @@ void rcu_irq_exit(void)
470static int dyntick_save_progress_counter(struct rcu_data *rdp) 605static int dyntick_save_progress_counter(struct rcu_data *rdp)
471{ 606{
472 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 607 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
473 return 0; 608 return (rdp->dynticks_snap & 0x1) == 0;
474} 609}
475 610
476/* 611/*
@@ -481,11 +616,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
481 */ 616 */
482static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 617static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
483{ 618{
484 unsigned long curr; 619 unsigned int curr;
485 unsigned long snap; 620 unsigned int snap;
486 621
487 curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); 622 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
488 snap = (unsigned long)rdp->dynticks_snap; 623 snap = (unsigned int)rdp->dynticks_snap;
489 624
490 /* 625 /*
491 * If the CPU passed through or entered a dynticks idle phase with 626 * If the CPU passed through or entered a dynticks idle phase with
@@ -495,7 +630,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
495 * read-side critical section that started before the beginning 630 * read-side critical section that started before the beginning
496 * of the current RCU grace period. 631 * of the current RCU grace period.
497 */ 632 */
498 if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { 633 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
634 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
499 rdp->dynticks_fqs++; 635 rdp->dynticks_fqs++;
500 return 1; 636 return 1;
501 } 637 }
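dyntick_save_progress_counter() now reports whether the snapshot itself was even (CPU already idle), and rcu_implicit_dynticks_qs() compares unsigned int snapshots: the per-CPU ->dynticks counter is bumped on every idle transition, so an even value means "idle now" and an advance of at least two means "was idle at some point since the snapshot", either of which is a quiescent state that force_quiescent_state() can report on the CPU's behalf. In miniature (illustrative names; a single thread, so no memory-ordering concerns):

#include <stdio.h>
#include <stdbool.h>

static unsigned int dynticks = 1;               /* odd: CPU currently busy */

static void idle_enter(void) { dynticks++; }    /* becomes even */
static void idle_exit(void)  { dynticks++; }    /* becomes odd again */

/* Did the CPU pass through (or stay in) idle since the snapshot? */
static bool quiescent_since(unsigned int snap, unsigned int curr)
{
        return (curr & 0x1) == 0 || curr - snap >= 2;
}

int main(void)
{
        unsigned int snap = dynticks;           /* dyntick_save_progress_counter() */

        idle_enter();
        idle_exit();                            /* a brief idle sojourn */

        printf("quiescent state observed: %d\n",
               quiescent_since(snap, dynticks));        /* prints 1 */
        return 0;
}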
@@ -506,26 +642,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
506 642
507#endif /* #ifdef CONFIG_SMP */ 643#endif /* #ifdef CONFIG_SMP */
508 644
509#else /* #ifdef CONFIG_NO_HZ */
510
511#ifdef CONFIG_SMP
512
513static int dyntick_save_progress_counter(struct rcu_data *rdp)
514{
515 return 0;
516}
517
518static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
519{
520 return rcu_implicit_offline_qs(rdp);
521}
522
523#endif /* #ifdef CONFIG_SMP */
524
525#endif /* #else #ifdef CONFIG_NO_HZ */
526
527int rcu_cpu_stall_suppress __read_mostly;
528
529static void record_gp_stall_check_time(struct rcu_state *rsp) 645static void record_gp_stall_check_time(struct rcu_state *rsp)
530{ 646{
531 rsp->gp_start = jiffies; 647 rsp->gp_start = jiffies;
@@ -537,6 +653,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
537 int cpu; 653 int cpu;
538 long delta; 654 long delta;
539 unsigned long flags; 655 unsigned long flags;
656 int ndetected;
540 struct rcu_node *rnp = rcu_get_root(rsp); 657 struct rcu_node *rnp = rcu_get_root(rsp);
541 658
542 /* Only let one CPU complain about others per time interval. */ 659 /* Only let one CPU complain about others per time interval. */
@@ -553,7 +670,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
553 * Now rat on any tasks that got kicked up to the root rcu_node 670 * Now rat on any tasks that got kicked up to the root rcu_node
554 * due to CPU offlining. 671 * due to CPU offlining.
555 */ 672 */
556 rcu_print_task_stall(rnp); 673 ndetected = rcu_print_task_stall(rnp);
557 raw_spin_unlock_irqrestore(&rnp->lock, flags); 674 raw_spin_unlock_irqrestore(&rnp->lock, flags);
558 675
559 /* 676 /*
@@ -565,17 +682,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
565 rsp->name); 682 rsp->name);
566 rcu_for_each_leaf_node(rsp, rnp) { 683 rcu_for_each_leaf_node(rsp, rnp) {
567 raw_spin_lock_irqsave(&rnp->lock, flags); 684 raw_spin_lock_irqsave(&rnp->lock, flags);
568 rcu_print_task_stall(rnp); 685 ndetected += rcu_print_task_stall(rnp);
569 raw_spin_unlock_irqrestore(&rnp->lock, flags); 686 raw_spin_unlock_irqrestore(&rnp->lock, flags);
570 if (rnp->qsmask == 0) 687 if (rnp->qsmask == 0)
571 continue; 688 continue;
572 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 689 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
573 if (rnp->qsmask & (1UL << cpu)) 690 if (rnp->qsmask & (1UL << cpu)) {
574 printk(" %d", rnp->grplo + cpu); 691 printk(" %d", rnp->grplo + cpu);
692 ndetected++;
693 }
575 } 694 }
576 printk("} (detected by %d, t=%ld jiffies)\n", 695 printk("} (detected by %d, t=%ld jiffies)\n",
577 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 696 smp_processor_id(), (long)(jiffies - rsp->gp_start));
578 trigger_all_cpu_backtrace(); 697 if (ndetected == 0)
698 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
699 else if (!trigger_all_cpu_backtrace())
700 dump_stack();
579 701
580 /* If so configured, complain about tasks blocking the grace period. */ 702 /* If so configured, complain about tasks blocking the grace period. */
581 703
@@ -596,7 +718,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
596 */ 718 */
597 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 719 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
598 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 720 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
599 trigger_all_cpu_backtrace(); 721 if (!trigger_all_cpu_backtrace())
722 dump_stack();
600 723
601 raw_spin_lock_irqsave(&rnp->lock, flags); 724 raw_spin_lock_irqsave(&rnp->lock, flags);
602 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 725 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
@@ -678,9 +801,10 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
678 * go looking for one. 801 * go looking for one.
679 */ 802 */
680 rdp->gpnum = rnp->gpnum; 803 rdp->gpnum = rnp->gpnum;
804 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
681 if (rnp->qsmask & rdp->grpmask) { 805 if (rnp->qsmask & rdp->grpmask) {
682 rdp->qs_pending = 1; 806 rdp->qs_pending = 1;
683 rdp->passed_quiesc = 0; 807 rdp->passed_quiesce = 0;
684 } else 808 } else
685 rdp->qs_pending = 0; 809 rdp->qs_pending = 0;
686 } 810 }
@@ -741,6 +865,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
741 865
742 /* Remember that we saw this grace-period completion. */ 866 /* Remember that we saw this grace-period completion. */
743 rdp->completed = rnp->completed; 867 rdp->completed = rnp->completed;
868 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
744 869
745 /* 870 /*
746 * If we were in an extended quiescent state, we may have 871 * If we were in an extended quiescent state, we may have
@@ -826,33 +951,33 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
826 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 951 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
827 struct rcu_node *rnp = rcu_get_root(rsp); 952 struct rcu_node *rnp = rcu_get_root(rsp);
828 953
829 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { 954 if (!rcu_scheduler_fully_active ||
830 if (cpu_needs_another_gp(rsp, rdp)) 955 !cpu_needs_another_gp(rsp, rdp)) {
831 rsp->fqs_need_gp = 1; 956 /*
832 if (rnp->completed == rsp->completed) { 957 * Either the scheduler hasn't yet spawned the first
833 raw_spin_unlock_irqrestore(&rnp->lock, flags); 958 * non-idle task or this CPU does not need another
834 return; 959 * grace period. Either way, don't start a new grace
835 } 960 * period.
836 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 961 */
962 raw_spin_unlock_irqrestore(&rnp->lock, flags);
963 return;
964 }
837 965
966 if (rsp->fqs_active) {
838 /* 967 /*
839 * Propagate new ->completed value to rcu_node structures 968 * This CPU needs a grace period, but force_quiescent_state()
840 * so that other CPUs don't have to wait until the start 969 * is running. Tell it to start one on this CPU's behalf.
841 * of the next grace period to process their callbacks.
842 */ 970 */
843 rcu_for_each_node_breadth_first(rsp, rnp) { 971 rsp->fqs_need_gp = 1;
844 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 972 raw_spin_unlock_irqrestore(&rnp->lock, flags);
845 rnp->completed = rsp->completed;
846 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
847 }
848 local_irq_restore(flags);
849 return; 973 return;
850 } 974 }
851 975
852 /* Advance to a new grace period and initialize state. */ 976 /* Advance to a new grace period and initialize state. */
853 rsp->gpnum++; 977 rsp->gpnum++;
854 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); 978 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
855 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 979 WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
980 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
856 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 981 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
857 record_gp_stall_check_time(rsp); 982 record_gp_stall_check_time(rsp);
858 983
@@ -862,9 +987,12 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
862 rnp->qsmask = rnp->qsmaskinit; 987 rnp->qsmask = rnp->qsmaskinit;
863 rnp->gpnum = rsp->gpnum; 988 rnp->gpnum = rsp->gpnum;
864 rnp->completed = rsp->completed; 989 rnp->completed = rsp->completed;
865 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 990 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
866 rcu_start_gp_per_cpu(rsp, rnp, rdp); 991 rcu_start_gp_per_cpu(rsp, rnp, rdp);
867 rcu_preempt_boost_start_gp(rnp); 992 rcu_preempt_boost_start_gp(rnp);
993 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
994 rnp->level, rnp->grplo,
995 rnp->grphi, rnp->qsmask);
868 raw_spin_unlock_irqrestore(&rnp->lock, flags); 996 raw_spin_unlock_irqrestore(&rnp->lock, flags);
869 return; 997 return;
870 } 998 }
@@ -901,12 +1029,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
901 if (rnp == rdp->mynode) 1029 if (rnp == rdp->mynode)
902 rcu_start_gp_per_cpu(rsp, rnp, rdp); 1030 rcu_start_gp_per_cpu(rsp, rnp, rdp);
903 rcu_preempt_boost_start_gp(rnp); 1031 rcu_preempt_boost_start_gp(rnp);
1032 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
1033 rnp->level, rnp->grplo,
1034 rnp->grphi, rnp->qsmask);
904 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1035 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
905 } 1036 }
906 1037
907 rnp = rcu_get_root(rsp); 1038 rnp = rcu_get_root(rsp);
908 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1039 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
909 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 1040 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
910 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1041 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
911 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1042 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
912} 1043}
@@ -922,6 +1053,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
922 __releases(rcu_get_root(rsp)->lock) 1053 __releases(rcu_get_root(rsp)->lock)
923{ 1054{
924 unsigned long gp_duration; 1055 unsigned long gp_duration;
1056 struct rcu_node *rnp = rcu_get_root(rsp);
1057 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
925 1058
926 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 1059 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
927 1060
@@ -933,8 +1066,42 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
933 gp_duration = jiffies - rsp->gp_start; 1066 gp_duration = jiffies - rsp->gp_start;
934 if (gp_duration > rsp->gp_max) 1067 if (gp_duration > rsp->gp_max)
935 rsp->gp_max = gp_duration; 1068 rsp->gp_max = gp_duration;
936 rsp->completed = rsp->gpnum; 1069
937 rsp->signaled = RCU_GP_IDLE; 1070 /*
1071 * We know the grace period is complete, but to everyone else
1072 * it appears to still be ongoing. But it is also the case
1073 * that to everyone else it looks like there is nothing that
1074 * they can do to advance the grace period. It is therefore
1075 * safe for us to drop the lock in order to mark the grace
1076 * period as completed in all of the rcu_node structures.
1077 *
1078 * But if this CPU needs another grace period, it will take
1079 * care of this while initializing the next grace period.
1080 * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
1081 * because the callbacks have not yet been advanced: Those
1082 * callbacks are waiting on the grace period that just now
1083 * completed.
1084 */
1085 if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
1086 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1087
1088 /*
1089 * Propagate new ->completed value to rcu_node structures
1090 * so that other CPUs don't have to wait until the start
1091 * of the next grace period to process their callbacks.
1092 */
1093 rcu_for_each_node_breadth_first(rsp, rnp) {
1094 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1095 rnp->completed = rsp->gpnum;
1096 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1097 }
1098 rnp = rcu_get_root(rsp);
1099 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1100 }
1101
1102 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
1103 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
1104 rsp->fqs_state = RCU_GP_IDLE;
938 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 1105 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
939} 1106}
940 1107
@@ -962,6 +1129,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
962 return; 1129 return;
963 } 1130 }
964 rnp->qsmask &= ~mask; 1131 rnp->qsmask &= ~mask;
1132 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
1133 mask, rnp->qsmask, rnp->level,
1134 rnp->grplo, rnp->grphi,
1135 !!rnp->gp_tasks);
965 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 1136 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
966 1137
967 /* Other bits still set at this level, so done. */ 1138 /* Other bits still set at this level, so done. */
@@ -1000,7 +1171,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1000 * based on quiescent states detected in an earlier grace period! 1171 * based on quiescent states detected in an earlier grace period!
1001 */ 1172 */
1002static void 1173static void
1003rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 1174rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp)
1004{ 1175{
1005 unsigned long flags; 1176 unsigned long flags;
1006 unsigned long mask; 1177 unsigned long mask;
@@ -1008,17 +1179,15 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
1008 1179
1009 rnp = rdp->mynode; 1180 rnp = rdp->mynode;
1010 raw_spin_lock_irqsave(&rnp->lock, flags); 1181 raw_spin_lock_irqsave(&rnp->lock, flags);
1011 if (lastcomp != rnp->completed) { 1182 if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) {
1012 1183
1013 /* 1184 /*
1014 * Someone beat us to it for this grace period, so leave. 1185 * The grace period in which this quiescent state was
1015 * The race with GP start is resolved by the fact that we 1186 * recorded has ended, so don't report it upwards.
1016 * hold the leaf rcu_node lock, so that the per-CPU bits 1187 * We will instead need a new quiescent state that lies
1017 * cannot yet be initialized -- so we would simply find our 1188 * within the current grace period.
1018 * CPU's bit already cleared in rcu_report_qs_rnp() if this
1019 * race occurred.
1020 */ 1189 */
1021 rdp->passed_quiesc = 0; /* try again later! */ 1190 rdp->passed_quiesce = 0; /* need qs for new gp. */
1022 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1191 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1023 return; 1192 return;
1024 } 1193 }
@@ -1062,14 +1231,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1062 * Was there a quiescent state since the beginning of the grace 1231 * Was there a quiescent state since the beginning of the grace
1063 * period? If no, then exit and wait for the next call. 1232 * period? If no, then exit and wait for the next call.
1064 */ 1233 */
1065 if (!rdp->passed_quiesc) 1234 if (!rdp->passed_quiesce)
1066 return; 1235 return;
1067 1236
1068 /* 1237 /*
1069 * Tell RCU we are done (but rcu_report_qs_rdp() will be the 1238 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
1070 * judge of that). 1239 * judge of that).
1071 */ 1240 */
1072 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); 1241 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum);
1073} 1242}
1074 1243
1075#ifdef CONFIG_HOTPLUG_CPU 1244#ifdef CONFIG_HOTPLUG_CPU
@@ -1130,11 +1299,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1130 if (rnp->qsmaskinit != 0) { 1299 if (rnp->qsmaskinit != 0) {
1131 if (rnp != rdp->mynode) 1300 if (rnp != rdp->mynode)
1132 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1301 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1302 else
1303 trace_rcu_grace_period(rsp->name,
1304 rnp->gpnum + 1 -
1305 !!(rnp->qsmask & mask),
1306 "cpuofl");
1133 break; 1307 break;
1134 } 1308 }
1135 if (rnp == rdp->mynode) 1309 if (rnp == rdp->mynode) {
1310 trace_rcu_grace_period(rsp->name,
1311 rnp->gpnum + 1 -
1312 !!(rnp->qsmask & mask),
1313 "cpuofl");
1136 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 1314 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
1137 else 1315 } else
1138 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1316 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1139 mask = rnp->grpmask; 1317 mask = rnp->grpmask;
1140 rnp = rnp->parent; 1318 rnp = rnp->parent;
@@ -1153,7 +1331,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1153 else 1331 else
1154 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1332 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1155 if (need_report & RCU_OFL_TASKS_EXP_GP) 1333 if (need_report & RCU_OFL_TASKS_EXP_GP)
1156 rcu_report_exp_rnp(rsp, rnp); 1334 rcu_report_exp_rnp(rsp, rnp, true);
1157 rcu_node_kthread_setaffinity(rnp, -1); 1335 rcu_node_kthread_setaffinity(rnp, -1);
1158} 1336}
1159 1337
@@ -1190,17 +1368,24 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1190{ 1368{
1191 unsigned long flags; 1369 unsigned long flags;
1192 struct rcu_head *next, *list, **tail; 1370 struct rcu_head *next, *list, **tail;
1193 int count; 1371 int bl, count;
1194 1372
1195 /* If no callbacks are ready, just return.*/ 1373 /* If no callbacks are ready, just return.*/
1196 if (!cpu_has_callbacks_ready_to_invoke(rdp)) 1374 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1375 trace_rcu_batch_start(rsp->name, 0, 0);
1376 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
1377 need_resched(), is_idle_task(current),
1378 rcu_is_callbacks_kthread());
1197 return; 1379 return;
1380 }
1198 1381
1199 /* 1382 /*
1200 * Extract the list of ready callbacks, disabling to prevent 1383 * Extract the list of ready callbacks, disabling to prevent
1201 * races with call_rcu() from interrupt handlers. 1384 * races with call_rcu() from interrupt handlers.
1202 */ 1385 */
1203 local_irq_save(flags); 1386 local_irq_save(flags);
1387 bl = rdp->blimit;
1388 trace_rcu_batch_start(rsp->name, rdp->qlen, bl);
1204 list = rdp->nxtlist; 1389 list = rdp->nxtlist;
1205 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1390 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1206 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1391 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
@@ -1216,13 +1401,19 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1216 next = list->next; 1401 next = list->next;
1217 prefetch(next); 1402 prefetch(next);
1218 debug_rcu_head_unqueue(list); 1403 debug_rcu_head_unqueue(list);
1219 __rcu_reclaim(list); 1404 __rcu_reclaim(rsp->name, list);
1220 list = next; 1405 list = next;
1221 if (++count >= rdp->blimit) 1406 /* Stop only if limit reached and CPU has something to do. */
1407 if (++count >= bl &&
1408 (need_resched() ||
1409 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
1222 break; 1410 break;
1223 } 1411 }
1224 1412
1225 local_irq_save(flags); 1413 local_irq_save(flags);
1414 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
1415 is_idle_task(current),
1416 rcu_is_callbacks_kthread());
1226 1417
1227 /* Update count, and requeue any remaining callbacks. */ 1418 /* Update count, and requeue any remaining callbacks. */
1228 rdp->qlen -= count; 1419 rdp->qlen -= count;
@@ -1250,7 +1441,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1250 1441
1251 local_irq_restore(flags); 1442 local_irq_restore(flags);
1252 1443
1253 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1444 /* Re-invoke RCU core processing if there are callbacks remaining. */
1254 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1445 if (cpu_has_callbacks_ready_to_invoke(rdp))
1255 invoke_rcu_core(); 1446 invoke_rcu_core();
1256} 1447}
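
Editorial note: the reworked loop in rcu_do_batch() above only honors the batch limit when the CPU actually has competing work (need_resched(), or it is neither idle nor the callbacks kthread). A compressed userspace sketch of that batching policy follows; the list type, reclaim(), and cpu_has_other_work() are invented stand-ins, not kernel APIs.

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdbool.h>

	struct cb {                        /* stand-in for struct rcu_head */
		struct cb *next;
	};

	static void reclaim(struct cb *c)  /* stand-in for __rcu_reclaim() */
	{
		free(c);
	}

	static bool cpu_has_other_work(void)   /* models need_resched() etc. */
	{
		return false;                  /* pretend the CPU is otherwise idle */
	}

	/* Invoke ready callbacks; honor the batch limit only under contention. */
	static int do_batch(struct cb **list, int bl)
	{
		int count = 0;
		struct cb *c = *list;

		while (c != NULL) {
			struct cb *next = c->next;

			reclaim(c);
			c = next;
			/* Stop only if limit reached and CPU has something to do. */
			if (++count >= bl && cpu_has_other_work())
				break;
		}
		*list = c;                     /* requeue whatever remains */
		return count;
	}

	int main(void)
	{
		struct cb *list = NULL;
		int i;

		for (i = 0; i < 25; i++) {
			struct cb *c = malloc(sizeof(*c));

			c->next = list;
			list = c;
		}
		printf("invoked %d callbacks\n", do_batch(&list, 10));
		return 0;
	}

With cpu_has_other_work() returning false, all 25 callbacks are invoked in one pass even though the limit is 10, which is the behavior the hunk above introduces for idle CPUs.
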
@@ -1258,17 +1449,16 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1258/* 1449/*
1259 * Check to see if this CPU is in a non-context-switch quiescent state 1450 * Check to see if this CPU is in a non-context-switch quiescent state
1260 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 1451 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
1261 * Also schedule the RCU softirq handler. 1452 * Also schedule RCU core processing.
1262 * 1453 *
1263 * This function must be called with hardirqs disabled. It is normally 1454 * This function must be called from hardirq context. It is normally
1264 * invoked from the scheduling-clock interrupt. If rcu_pending returns 1455 * invoked from the scheduling-clock interrupt. If rcu_pending returns
1265 * false, there is no point in invoking rcu_check_callbacks(). 1456 * false, there is no point in invoking rcu_check_callbacks().
1266 */ 1457 */
1267void rcu_check_callbacks(int cpu, int user) 1458void rcu_check_callbacks(int cpu, int user)
1268{ 1459{
1269 if (user || 1460 trace_rcu_utilization("Start scheduler-tick");
1270 (idle_cpu(cpu) && rcu_scheduler_active && 1461 if (user || rcu_is_cpu_rrupt_from_idle()) {
1271 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
1272 1462
1273 /* 1463 /*
1274 * Get here if this CPU took its interrupt from user 1464 * Get here if this CPU took its interrupt from user
@@ -1299,6 +1489,7 @@ void rcu_check_callbacks(int cpu, int user)
1299 rcu_preempt_check_callbacks(cpu); 1489 rcu_preempt_check_callbacks(cpu);
1300 if (rcu_pending(cpu)) 1490 if (rcu_pending(cpu))
1301 invoke_rcu_core(); 1491 invoke_rcu_core();
1492 trace_rcu_utilization("End scheduler-tick");
1302} 1493}
1303 1494
1304#ifdef CONFIG_SMP 1495#ifdef CONFIG_SMP
@@ -1360,10 +1551,14 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1360 unsigned long flags; 1551 unsigned long flags;
1361 struct rcu_node *rnp = rcu_get_root(rsp); 1552 struct rcu_node *rnp = rcu_get_root(rsp);
1362 1553
1363 if (!rcu_gp_in_progress(rsp)) 1554 trace_rcu_utilization("Start fqs");
1555 if (!rcu_gp_in_progress(rsp)) {
1556 trace_rcu_utilization("End fqs");
1364 return; /* No grace period in progress, nothing to force. */ 1557 return; /* No grace period in progress, nothing to force. */
1558 }
1365 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { 1559 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1366 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1560 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1561 trace_rcu_utilization("End fqs");
1367 return; /* Someone else is already on the job. */ 1562 return; /* Someone else is already on the job. */
1368 } 1563 }
1369 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) 1564 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
@@ -1377,7 +1572,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1377 goto unlock_fqs_ret; /* no GP in progress, time updated. */ 1572 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1378 } 1573 }
1379 rsp->fqs_active = 1; 1574 rsp->fqs_active = 1;
1380 switch (rsp->signaled) { 1575 switch (rsp->fqs_state) {
1381 case RCU_GP_IDLE: 1576 case RCU_GP_IDLE:
1382 case RCU_GP_INIT: 1577 case RCU_GP_INIT:
1383 1578
@@ -1393,7 +1588,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1393 force_qs_rnp(rsp, dyntick_save_progress_counter); 1588 force_qs_rnp(rsp, dyntick_save_progress_counter);
1394 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1589 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1395 if (rcu_gp_in_progress(rsp)) 1590 if (rcu_gp_in_progress(rsp))
1396 rsp->signaled = RCU_FORCE_QS; 1591 rsp->fqs_state = RCU_FORCE_QS;
1397 break; 1592 break;
1398 1593
1399 case RCU_FORCE_QS: 1594 case RCU_FORCE_QS:
@@ -1412,11 +1607,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1412 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ 1607 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1413 rsp->fqs_need_gp = 0; 1608 rsp->fqs_need_gp = 0;
1414 rcu_start_gp(rsp, flags); /* releases rnp->lock */ 1609 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1610 trace_rcu_utilization("End fqs");
1415 return; 1611 return;
1416 } 1612 }
1417 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 1613 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1418unlock_fqs_ret: 1614unlock_fqs_ret:
1419 raw_spin_unlock_irqrestore(&rsp->fqslock, flags); 1615 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1616 trace_rcu_utilization("End fqs");
1420} 1617}
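
Editorial note: the renamed ->fqs_state field still steps through the same two-phase protocol: RCU_SAVE_DYNTICK snapshots each CPU's dyntick counter, and the following RCU_FORCE_QS pass rechecks those snapshots to credit quiescent states to CPUs that are, or have since been, idle. A rough single-threaded model of that snapshot-then-recheck idea (array size and counter values are invented, and all locking is omitted):

	#include <stdio.h>

	enum fqs_state { GP_IDLE, SAVE_DYNTICK, FORCE_QS };

	#define NCPUS 4

	static int dynticks[NCPUS];   /* even = idle, odd = non-idle (invented values) */
	static int snap[NCPUS];

	static enum fqs_state fqs_step(enum fqs_state state)
	{
		int i;

		switch (state) {
		case SAVE_DYNTICK:
			for (i = 0; i < NCPUS; i++)
				snap[i] = dynticks[i];     /* phase 1: record counters */
			return FORCE_QS;
		case FORCE_QS:
			for (i = 0; i < NCPUS; i++)
				/* Idle now, or has transitioned since the snapshot? */
				if ((dynticks[i] & 1) == 0 || dynticks[i] != snap[i])
					printf("cpu %d: quiescent state observed\n", i);
			return FORCE_QS;
		default:
			return state;
		}
	}

	int main(void)
	{
		enum fqs_state state = SAVE_DYNTICK;

		dynticks[0] = 2;   /* idle */
		dynticks[1] = 3;   /* busy */
		dynticks[2] = 5;   /* busy */
		dynticks[3] = 8;   /* idle */

		state = fqs_step(state);   /* snapshot pass */
		dynticks[1] = 4;           /* CPU 1 enters idle afterwards */
		fqs_step(state);           /* recheck pass */
		return 0;
	}
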
1421 1618
1422#else /* #ifdef CONFIG_SMP */ 1619#else /* #ifdef CONFIG_SMP */
@@ -1429,9 +1626,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1429#endif /* #else #ifdef CONFIG_SMP */ 1626#endif /* #else #ifdef CONFIG_SMP */
1430 1627
1431/* 1628/*
1432 * This does the RCU processing work from softirq context for the 1629 * This does the RCU core processing work for the specified rcu_state
1433 * specified rcu_state and rcu_data structures. This may be called 1630 * and rcu_data structures. This may be called only from the CPU to
1434 * only from the CPU to whom the rdp belongs. 1631 * whom the rdp belongs.
1435 */ 1632 */
1436static void 1633static void
1437__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1634__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -1468,24 +1665,24 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1468} 1665}
1469 1666
1470/* 1667/*
1471 * Do softirq processing for the current CPU. 1668 * Do RCU core processing for the current CPU.
1472 */ 1669 */
1473static void rcu_process_callbacks(struct softirq_action *unused) 1670static void rcu_process_callbacks(struct softirq_action *unused)
1474{ 1671{
1672 trace_rcu_utilization("Start RCU core");
1475 __rcu_process_callbacks(&rcu_sched_state, 1673 __rcu_process_callbacks(&rcu_sched_state,
1476 &__get_cpu_var(rcu_sched_data)); 1674 &__get_cpu_var(rcu_sched_data));
1477 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1675 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1478 rcu_preempt_process_callbacks(); 1676 rcu_preempt_process_callbacks();
1479 1677 trace_rcu_utilization("End RCU core");
1480 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1481 rcu_needs_cpu_flush();
1482} 1678}
1483 1679
1484/* 1680/*
1485 * Wake up the current CPU's kthread. This replaces raise_softirq() 1681 * Schedule RCU callback invocation. If the specified type of RCU
1486 * in earlier versions of RCU. Note that because we are running on 1682 * does not support RCU priority boosting, just do a direct call,
1487 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task 1683 * otherwise wake up the per-CPU kernel kthread. Note that because we
1488 * cannot disappear out from under us. 1684 * are running on the current CPU with interrupts disabled, the
1685 * rcu_cpu_kthread_task cannot disappear out from under us.
1489 */ 1686 */
1490static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1687static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1491{ 1688{
@@ -1530,6 +1727,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1530 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1727 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1531 rdp->qlen++; 1728 rdp->qlen++;
1532 1729
1730 if (__is_kfree_rcu_offset((unsigned long)func))
1731 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
1732 rdp->qlen);
1733 else
1734 trace_rcu_callback(rsp->name, head, rdp->qlen);
1735
1533 /* If interrupts were disabled, don't dive into RCU core. */ 1736 /* If interrupts were disabled, don't dive into RCU core. */
1534 if (irqs_disabled_flags(flags)) { 1737 if (irqs_disabled_flags(flags)) {
1535 local_irq_restore(flags); 1738 local_irq_restore(flags);
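
Editorial note: the new trace call above distinguishes kfree_rcu() requests by the existing encoding trick: kfree_rcu() passes the offset of the rcu_head within its enclosing structure where a callback pointer would normally go, and __is_kfree_rcu_offset() simply checks that the "pointer" is a small integer (roughly, smaller than a page), since no real code address is that low. A standalone illustration of that encoding on a typical flat-address platform, with invented type names:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stddef.h>

	struct head {                       /* stand-in for struct rcu_head */
		struct head *next;
	};

	struct widget {                     /* invented example structure */
		int payload[8];
		struct head rh;
	};

	typedef void (*cb_t)(struct head *);

	/* Real callback addresses are never this small, so low values mean "offset". */
	static int is_kfree_offset(unsigned long v)
	{
		return v < 4096;
	}

	static void run_callback(struct head *rh, cb_t func)
	{
		unsigned long v = (unsigned long)func;

		if (is_kfree_offset(v))
			free((char *)rh - v);   /* back up to the start of the object */
		else
			func(rh);               /* ordinary callback */
	}

	int main(void)
	{
		struct widget *w = malloc(sizeof(*w));

		/* What kfree_rcu() effectively queues: the head plus an encoded offset. */
		run_callback(&w->rh, (cb_t)(unsigned long)offsetof(struct widget, rh));
		return 0;
	}
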
@@ -1613,18 +1816,9 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1613 */ 1816 */
1614void synchronize_sched(void) 1817void synchronize_sched(void)
1615{ 1818{
1616 struct rcu_synchronize rcu;
1617
1618 if (rcu_blocking_is_gp()) 1819 if (rcu_blocking_is_gp())
1619 return; 1820 return;
1620 1821 wait_rcu_gp(call_rcu_sched);
1621 init_rcu_head_on_stack(&rcu.head);
1622 init_completion(&rcu.completion);
1623 /* Will wake me after RCU finished. */
1624 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1625 /* Wait for it. */
1626 wait_for_completion(&rcu.completion);
1627 destroy_rcu_head_on_stack(&rcu.head);
1628} 1822}
1629EXPORT_SYMBOL_GPL(synchronize_sched); 1823EXPORT_SYMBOL_GPL(synchronize_sched);
1630 1824
@@ -1639,18 +1833,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
1639 */ 1833 */
1640void synchronize_rcu_bh(void) 1834void synchronize_rcu_bh(void)
1641{ 1835{
1642 struct rcu_synchronize rcu;
1643
1644 if (rcu_blocking_is_gp()) 1836 if (rcu_blocking_is_gp())
1645 return; 1837 return;
1646 1838 wait_rcu_gp(call_rcu_bh);
1647 init_rcu_head_on_stack(&rcu.head);
1648 init_completion(&rcu.completion);
1649 /* Will wake me after RCU finished. */
1650 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1651 /* Wait for it. */
1652 wait_for_completion(&rcu.completion);
1653 destroy_rcu_head_on_stack(&rcu.head);
1654} 1839}
1655EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1840EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
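
Editorial note: both wrappers above collapse the same init_completion()/call_rcu()/wait_for_completion() sequence into a single wait_rcu_gp(crf) call that takes the flavor's call_rcu function as a parameter. A userspace analog of that factoring, using pthreads and invented names in place of the kernel's completion API, might look like this:

	#include <pthread.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Rough analog of struct rcu_synchronize: a "head" plus a completion. */
	struct waiter {
		pthread_mutex_t lock;
		pthread_cond_t cond;
		int done;
	};

	typedef void (*cb_t)(struct waiter *);
	typedef void (*queue_fn)(cb_t, struct waiter *);  /* analog of call_rcu_*() */

	static void wakeme(struct waiter *w)              /* analog of wakeme_after_rcu() */
	{
		pthread_mutex_lock(&w->lock);
		w->done = 1;
		pthread_cond_signal(&w->cond);
		pthread_mutex_unlock(&w->lock);
	}

	/* One shared helper replaces the per-flavor queue-then-wait boilerplate. */
	static void wait_gp(queue_fn crf)
	{
		struct waiter w = {
			PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
		};

		crf(wakeme, &w);                          /* queue the wakeup callback */
		pthread_mutex_lock(&w.lock);
		while (!w.done)
			pthread_cond_wait(&w.cond, &w.lock);
		pthread_mutex_unlock(&w.lock);
	}

	/* Invented stand-in for one flavor's call_rcu(): run the callback "later". */
	struct job {
		cb_t cb;
		struct waiter *w;
	};
	static struct job the_job;

	static void *deferred(void *arg)
	{
		struct job *j = arg;

		usleep(1000);                             /* pretend a grace period elapsed */
		j->cb(j->w);
		return NULL;
	}

	static void fake_call_rcu(cb_t cb, struct waiter *w)
	{
		pthread_t t;

		the_job.cb = cb;
		the_job.w = w;
		pthread_create(&t, NULL, deferred, &the_job);
		pthread_detach(t);
	}

	int main(void)
	{
		wait_gp(fake_call_rcu);
		printf("grace period complete\n");
		return 0;
	}
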
1656 1841
@@ -1671,7 +1856,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1671 check_cpu_stall(rsp, rdp); 1856 check_cpu_stall(rsp, rdp);
1672 1857
1673 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1858 /* Is the RCU core waiting for a quiescent state from this CPU? */
1674 if (rdp->qs_pending && !rdp->passed_quiesc) { 1859 if (rcu_scheduler_fully_active &&
1860 rdp->qs_pending && !rdp->passed_quiesce) {
1675 1861
1676 /* 1862 /*
1677 * If force_quiescent_state() coming soon and this CPU 1863 * If force_quiescent_state() coming soon and this CPU
@@ -1683,7 +1869,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1683 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, 1869 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1684 jiffies)) 1870 jiffies))
1685 set_need_resched(); 1871 set_need_resched();
1686 } else if (rdp->qs_pending && rdp->passed_quiesc) { 1872 } else if (rdp->qs_pending && rdp->passed_quiesce) {
1687 rdp->n_rp_report_qs++; 1873 rdp->n_rp_report_qs++;
1688 return 1; 1874 return 1;
1689 } 1875 }
@@ -1741,7 +1927,7 @@ static int rcu_pending(int cpu)
1741 * by the current CPU, even if none need be done immediately, returning 1927 * by the current CPU, even if none need be done immediately, returning
1742 * 1 if so. 1928 * 1 if so.
1743 */ 1929 */
1744static int rcu_needs_cpu_quick_check(int cpu) 1930static int rcu_cpu_has_callbacks(int cpu)
1745{ 1931{
1746 /* RCU callbacks either ready or pending? */ 1932 /* RCU callbacks either ready or pending? */
1747 return per_cpu(rcu_sched_data, cpu).nxtlist || 1933 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1842,10 +2028,11 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1842 for (i = 0; i < RCU_NEXT_SIZE; i++) 2028 for (i = 0; i < RCU_NEXT_SIZE; i++)
1843 rdp->nxttail[i] = &rdp->nxtlist; 2029 rdp->nxttail[i] = &rdp->nxtlist;
1844 rdp->qlen = 0; 2030 rdp->qlen = 0;
1845#ifdef CONFIG_NO_HZ
1846 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2031 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1847#endif /* #ifdef CONFIG_NO_HZ */ 2032 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING);
2033 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
1848 rdp->cpu = cpu; 2034 rdp->cpu = cpu;
2035 rdp->rsp = rsp;
1849 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2036 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1850} 2037}
1851 2038
@@ -1865,13 +2052,15 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1865 2052
1866 /* Set up local state, ensuring consistent view of global state. */ 2053 /* Set up local state, ensuring consistent view of global state. */
1867 raw_spin_lock_irqsave(&rnp->lock, flags); 2054 raw_spin_lock_irqsave(&rnp->lock, flags);
1868 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1869 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1870 rdp->beenonline = 1; /* We have now been online. */ 2055 rdp->beenonline = 1; /* We have now been online. */
1871 rdp->preemptible = preemptible; 2056 rdp->preemptible = preemptible;
1872 rdp->qlen_last_fqs_check = 0; 2057 rdp->qlen_last_fqs_check = 0;
1873 rdp->n_force_qs_snap = rsp->n_force_qs; 2058 rdp->n_force_qs_snap = rsp->n_force_qs;
1874 rdp->blimit = blimit; 2059 rdp->blimit = blimit;
2060 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING;
2061 atomic_set(&rdp->dynticks->dynticks,
2062 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2063 rcu_prepare_for_idle_init(cpu);
1875 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2064 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1876 2065
1877 /* 2066 /*
@@ -1891,9 +2080,17 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1891 rnp->qsmaskinit |= mask; 2080 rnp->qsmaskinit |= mask;
1892 mask = rnp->grpmask; 2081 mask = rnp->grpmask;
1893 if (rnp == rdp->mynode) { 2082 if (rnp == rdp->mynode) {
1894 rdp->gpnum = rnp->completed; /* if GP in progress... */ 2083 /*
2084 * If there is a grace period in progress, we will
2085 * set up to wait for it next time we run the
2086 * RCU core code.
2087 */
2088 rdp->gpnum = rnp->completed;
1895 rdp->completed = rnp->completed; 2089 rdp->completed = rnp->completed;
1896 rdp->passed_quiesc_completed = rnp->completed - 1; 2090 rdp->passed_quiesce = 0;
2091 rdp->qs_pending = 0;
2092 rdp->passed_quiesce_gpnum = rnp->gpnum - 1;
2093 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
1897 } 2094 }
1898 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 2095 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
1899 rnp = rnp->parent; 2096 rnp = rnp->parent;
@@ -1919,6 +2116,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1919 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 2116 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1920 struct rcu_node *rnp = rdp->mynode; 2117 struct rcu_node *rnp = rdp->mynode;
1921 2118
2119 trace_rcu_utilization("Start CPU hotplug");
1922 switch (action) { 2120 switch (action) {
1923 case CPU_UP_PREPARE: 2121 case CPU_UP_PREPARE:
1924 case CPU_UP_PREPARE_FROZEN: 2122 case CPU_UP_PREPARE_FROZEN:
@@ -1944,6 +2142,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1944 rcu_send_cbs_to_online(&rcu_bh_state); 2142 rcu_send_cbs_to_online(&rcu_bh_state);
1945 rcu_send_cbs_to_online(&rcu_sched_state); 2143 rcu_send_cbs_to_online(&rcu_sched_state);
1946 rcu_preempt_send_cbs_to_online(); 2144 rcu_preempt_send_cbs_to_online();
2145 rcu_cleanup_after_idle(cpu);
1947 break; 2146 break;
1948 case CPU_DEAD: 2147 case CPU_DEAD:
1949 case CPU_DEAD_FROZEN: 2148 case CPU_DEAD_FROZEN:
@@ -1954,6 +2153,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1954 default: 2153 default:
1955 break; 2154 break;
1956 } 2155 }
2156 trace_rcu_utilization("End CPU hotplug");
1957 return NOTIFY_OK; 2157 return NOTIFY_OK;
1958} 2158}
1959 2159
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 01b2ccda26fb..fddff92d6676 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,9 +84,10 @@
84 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
85 */ 85 */
86struct rcu_dynticks { 86struct rcu_dynticks {
87 int dynticks_nesting; /* Track irq/process nesting level. */ 87 long long dynticks_nesting; /* Track irq/process nesting level. */
88 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 88 /* Process level is worth LLONG_MAX/2. */
89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ 89 int dynticks_nmi_nesting; /* Track NMI nesting level. */
90 atomic_t dynticks; /* Even value for idle, else odd. */
90}; 91};
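
Editorial note: the widened dynticks_nesting field sits alongside the existing parity convention of the atomic dynticks counter, which is incremented on every idle transition: an even value means the CPU is idle, and any change in value means it has passed through idle at least once. A tiny C11 sketch of that parity protocol, ignoring the nesting counters and the memory ordering the real code depends on (helper names invented):

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int dynticks = 1;          /* odd: this "CPU" starts out non-idle */

	static void idle_enter(void)
	{
		atomic_fetch_add(&dynticks, 1);  /* counter becomes even: idle */
	}

	static void idle_exit(void)
	{
		atomic_fetch_add(&dynticks, 1);  /* counter becomes odd: non-idle */
	}

	int main(void)
	{
		int snap = atomic_load(&dynticks);

		idle_enter();
		idle_exit();
		printf("idle at snapshot time: %d\n", (snap & 1) == 0);
		printf("passed through idle since snapshot: %d\n",
		       atomic_load(&dynticks) != snap);
		return 0;
	}
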
91 92
92/* RCU's kthread states for tracing. */ 93/* RCU's kthread states for tracing. */
@@ -230,9 +231,9 @@ struct rcu_data {
230 /* in order to detect GP end. */ 231 /* in order to detect GP end. */
231 unsigned long gpnum; /* Highest gp number that this CPU */ 232 unsigned long gpnum; /* Highest gp number that this CPU */
232 /* is aware of having started. */ 233 /* is aware of having started. */
233 unsigned long passed_quiesc_completed; 234 unsigned long passed_quiesce_gpnum;
234 /* Value of completed at time of qs. */ 235 /* gpnum at time of quiescent state. */
235 bool passed_quiesc; /* User-mode/idle loop etc. */ 236 bool passed_quiesce; /* User-mode/idle loop etc. */
236 bool qs_pending; /* Core waits for quiesc state. */ 237 bool qs_pending; /* Core waits for quiesc state. */
237 bool beenonline; /* CPU online at least once. */ 238 bool beenonline; /* CPU online at least once. */
238 bool preemptible; /* Preemptible RCU? */ 239 bool preemptible; /* Preemptible RCU? */
@@ -274,16 +275,12 @@ struct rcu_data {
274 /* did other CPU force QS recently? */ 275 /* did other CPU force QS recently? */
275 long blimit; /* Upper limit on a processed batch */ 276 long blimit; /* Upper limit on a processed batch */
276 277
277#ifdef CONFIG_NO_HZ
278 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
280 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
281#endif /* #ifdef CONFIG_NO_HZ */
282 281
283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 282 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
284#ifdef CONFIG_NO_HZ
285 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 283 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
286#endif /* #ifdef CONFIG_NO_HZ */
287 unsigned long offline_fqs; /* Kicked due to being offline. */ 284 unsigned long offline_fqs; /* Kicked due to being offline. */
288 unsigned long resched_ipi; /* Sent a resched IPI. */ 285 unsigned long resched_ipi; /* Sent a resched IPI. */
289 286
@@ -299,18 +296,15 @@ struct rcu_data {
299 unsigned long n_rp_need_nothing; 296 unsigned long n_rp_need_nothing;
300 297
301 int cpu; 298 int cpu;
299 struct rcu_state *rsp;
302}; 300};
303 301
304/* Values for signaled field in struct rcu_state. */ 302/* Values for fqs_state field in struct rcu_state. */
305#define RCU_GP_IDLE 0 /* No grace period in progress. */ 303#define RCU_GP_IDLE 0 /* No grace period in progress. */
306#define RCU_GP_INIT 1 /* Grace period being initialized. */ 304#define RCU_GP_INIT 1 /* Grace period being initialized. */
307#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 305#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
308#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 306#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
309#ifdef CONFIG_NO_HZ
310#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 307#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
311#else /* #ifdef CONFIG_NO_HZ */
312#define RCU_SIGNAL_INIT RCU_FORCE_QS
313#endif /* #else #ifdef CONFIG_NO_HZ */
314 308
315#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 309#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
316 310
@@ -360,7 +354,7 @@ struct rcu_state {
360 354
361 /* The following fields are guarded by the root rcu_node's lock. */ 355 /* The following fields are guarded by the root rcu_node's lock. */
362 356
363 u8 signaled ____cacheline_internodealigned_in_smp; 357 u8 fqs_state ____cacheline_internodealigned_in_smp;
364 /* Force QS state. */ 358 /* Force QS state. */
365 u8 fqs_active; /* force_quiescent_state() */ 359 u8 fqs_active; /* force_quiescent_state() */
366 /* is running. */ 360 /* is running. */
@@ -417,6 +411,13 @@ extern struct rcu_state rcu_preempt_state;
417DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 411DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
418#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 412#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
419 413
414#ifdef CONFIG_RCU_BOOST
415DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
416DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
417DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
418DECLARE_PER_CPU(char, rcu_cpu_has_work);
419#endif /* #ifdef CONFIG_RCU_BOOST */
420
420#ifndef RCU_TREE_NONCORE 421#ifndef RCU_TREE_NONCORE
421 422
422/* Forward declarations for rcutree_plugin.h */ 423/* Forward declarations for rcutree_plugin.h */
@@ -430,7 +431,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
430static void rcu_stop_cpu_kthread(int cpu); 431static void rcu_stop_cpu_kthread(int cpu);
431#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 432#endif /* #ifdef CONFIG_HOTPLUG_CPU */
432static void rcu_print_detail_task_stall(struct rcu_state *rsp); 433static void rcu_print_detail_task_stall(struct rcu_state *rsp);
433static void rcu_print_task_stall(struct rcu_node *rnp); 434static int rcu_print_task_stall(struct rcu_node *rnp);
434static void rcu_preempt_stall_reset(void); 435static void rcu_preempt_stall_reset(void);
435static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 436static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
436#ifdef CONFIG_HOTPLUG_CPU 437#ifdef CONFIG_HOTPLUG_CPU
@@ -443,17 +444,18 @@ static void rcu_preempt_check_callbacks(int cpu);
443static void rcu_preempt_process_callbacks(void); 444static void rcu_preempt_process_callbacks(void);
444void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 445void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
445#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 446#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
446static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); 447static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
448 bool wake);
447#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 449#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
448static int rcu_preempt_pending(int cpu); 450static int rcu_preempt_pending(int cpu);
449static int rcu_preempt_needs_cpu(int cpu); 451static int rcu_preempt_needs_cpu(int cpu);
450static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 452static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
451static void rcu_preempt_send_cbs_to_online(void); 453static void rcu_preempt_send_cbs_to_online(void);
452static void __init __rcu_init_preempt(void); 454static void __init __rcu_init_preempt(void);
453static void rcu_needs_cpu_flush(void);
454static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 455static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
455static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 456static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
456static void invoke_rcu_callbacks_kthread(void); 457static void invoke_rcu_callbacks_kthread(void);
458static bool rcu_is_callbacks_kthread(void);
457#ifdef CONFIG_RCU_BOOST 459#ifdef CONFIG_RCU_BOOST
458static void rcu_preempt_do_callbacks(void); 460static void rcu_preempt_do_callbacks(void);
459static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, 461static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
@@ -466,5 +468,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
466#endif /* #ifdef CONFIG_RCU_BOOST */ 468#endif /* #ifdef CONFIG_RCU_BOOST */
467static void rcu_cpu_kthread_setrt(int cpu, int to_rt); 469static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
468static void __cpuinit rcu_prepare_kthreads(int cpu); 470static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu);
469 474
470#endif /* #ifndef RCU_TREE_NONCORE */ 475#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 8aafbb80b8b0..8bb35d73e1f9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -27,6 +27,14 @@
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h> 28#include <linux/stop_machine.h>
29 29
30#define RCU_KTHREAD_PRIO 1
31
32#ifdef CONFIG_RCU_BOOST
33#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
34#else
35#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
36#endif
37
30/* 38/*
31 * Check the RCU kernel configuration parameters and print informative 39 * Check the RCU kernel configuration parameters and print informative
32 * messages about anything out of the ordinary. If you like #ifdef, you 40 * messages about anything out of the ordinary. If you like #ifdef, you
@@ -64,7 +72,7 @@ static void __init rcu_bootup_announce_oddness(void)
64 72
65#ifdef CONFIG_TREE_PREEMPT_RCU 73#ifdef CONFIG_TREE_PREEMPT_RCU
66 74
67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 75struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt);
68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 76DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
69static struct rcu_state *rcu_state = &rcu_preempt_state; 77static struct rcu_state *rcu_state = &rcu_preempt_state;
70 78
@@ -122,9 +130,11 @@ static void rcu_preempt_qs(int cpu)
122{ 130{
123 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 131 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
124 132
125 rdp->passed_quiesc_completed = rdp->gpnum - 1; 133 rdp->passed_quiesce_gpnum = rdp->gpnum;
126 barrier(); 134 barrier();
127 rdp->passed_quiesc = 1; 135 if (rdp->passed_quiesce == 0)
136 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
137 rdp->passed_quiesce = 1;
128 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 138 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
129} 139}
130 140
@@ -190,6 +200,11 @@ static void rcu_preempt_note_context_switch(int cpu)
190 if (rnp->qsmask & rdp->grpmask) 200 if (rnp->qsmask & rdp->grpmask)
191 rnp->gp_tasks = &t->rcu_node_entry; 201 rnp->gp_tasks = &t->rcu_node_entry;
192 } 202 }
203 trace_rcu_preempt_task(rdp->rsp->name,
204 t->pid,
205 (rnp->qsmask & rdp->grpmask)
206 ? rnp->gpnum
207 : rnp->gpnum + 1);
193 raw_spin_unlock_irqrestore(&rnp->lock, flags); 208 raw_spin_unlock_irqrestore(&rnp->lock, flags);
194 } else if (t->rcu_read_lock_nesting < 0 && 209 } else if (t->rcu_read_lock_nesting < 0 &&
195 t->rcu_read_unlock_special) { 210 t->rcu_read_unlock_special) {
@@ -297,8 +312,12 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
297{ 312{
298 int empty; 313 int empty;
299 int empty_exp; 314 int empty_exp;
315 int empty_exp_now;
300 unsigned long flags; 316 unsigned long flags;
301 struct list_head *np; 317 struct list_head *np;
318#ifdef CONFIG_RCU_BOOST
319 struct rt_mutex *rbmp = NULL;
320#endif /* #ifdef CONFIG_RCU_BOOST */
302 struct rcu_node *rnp; 321 struct rcu_node *rnp;
303 int special; 322 int special;
304 323
@@ -344,6 +363,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
344 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 363 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
345 np = rcu_next_node_entry(t, rnp); 364 np = rcu_next_node_entry(t, rnp);
346 list_del_init(&t->rcu_node_entry); 365 list_del_init(&t->rcu_node_entry);
366 t->rcu_blocked_node = NULL;
367 trace_rcu_unlock_preempted_task("rcu_preempt",
368 rnp->gpnum, t->pid);
347 if (&t->rcu_node_entry == rnp->gp_tasks) 369 if (&t->rcu_node_entry == rnp->gp_tasks)
348 rnp->gp_tasks = np; 370 rnp->gp_tasks = np;
349 if (&t->rcu_node_entry == rnp->exp_tasks) 371 if (&t->rcu_node_entry == rnp->exp_tasks)
@@ -351,38 +373,44 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
351#ifdef CONFIG_RCU_BOOST 373#ifdef CONFIG_RCU_BOOST
352 if (&t->rcu_node_entry == rnp->boost_tasks) 374 if (&t->rcu_node_entry == rnp->boost_tasks)
353 rnp->boost_tasks = np; 375 rnp->boost_tasks = np;
354 /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */ 376 /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
355 if (t->rcu_boosted) { 377 if (t->rcu_boost_mutex) {
356 special |= RCU_READ_UNLOCK_BOOSTED; 378 rbmp = t->rcu_boost_mutex;
357 t->rcu_boosted = 0; 379 t->rcu_boost_mutex = NULL;
358 } 380 }
359#endif /* #ifdef CONFIG_RCU_BOOST */ 381#endif /* #ifdef CONFIG_RCU_BOOST */
360 t->rcu_blocked_node = NULL;
361 382
362 /* 383 /*
363 * If this was the last task on the current list, and if 384 * If this was the last task on the current list, and if
364 * we aren't waiting on any CPUs, report the quiescent state. 385 * we aren't waiting on any CPUs, report the quiescent state.
365 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 386 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
387 * so we must take a snapshot of the expedited state.
366 */ 388 */
367 if (empty) 389 empty_exp_now = !rcu_preempted_readers_exp(rnp);
368 raw_spin_unlock_irqrestore(&rnp->lock, flags); 390 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
369 else 391 trace_rcu_quiescent_state_report("preempt_rcu",
392 rnp->gpnum,
393 0, rnp->qsmask,
394 rnp->level,
395 rnp->grplo,
396 rnp->grphi,
397 !!rnp->gp_tasks);
370 rcu_report_unblock_qs_rnp(rnp, flags); 398 rcu_report_unblock_qs_rnp(rnp, flags);
399 } else
400 raw_spin_unlock_irqrestore(&rnp->lock, flags);
371 401
372#ifdef CONFIG_RCU_BOOST 402#ifdef CONFIG_RCU_BOOST
373 /* Unboost if we were boosted. */ 403 /* Unboost if we were boosted. */
374 if (special & RCU_READ_UNLOCK_BOOSTED) { 404 if (rbmp)
375 rt_mutex_unlock(t->rcu_boost_mutex); 405 rt_mutex_unlock(rbmp);
376 t->rcu_boost_mutex = NULL;
377 }
378#endif /* #ifdef CONFIG_RCU_BOOST */ 406#endif /* #ifdef CONFIG_RCU_BOOST */
379 407
380 /* 408 /*
381 * If this was the last task on the expedited lists, 409 * If this was the last task on the expedited lists,
382 * then we need to report up the rcu_node hierarchy. 410 * then we need to report up the rcu_node hierarchy.
383 */ 411 */
384 if (!empty_exp && !rcu_preempted_readers_exp(rnp)) 412 if (!empty_exp && empty_exp_now)
385 rcu_report_exp_rnp(&rcu_preempt_state, rnp); 413 rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
386 } else { 414 } else {
387 local_irq_restore(flags); 415 local_irq_restore(flags);
388 } 416 }
@@ -399,10 +427,10 @@ void __rcu_read_unlock(void)
399{ 427{
400 struct task_struct *t = current; 428 struct task_struct *t = current;
401 429
402 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
403 if (t->rcu_read_lock_nesting != 1) 430 if (t->rcu_read_lock_nesting != 1)
404 --t->rcu_read_lock_nesting; 431 --t->rcu_read_lock_nesting;
405 else { 432 else {
433 barrier(); /* critical section before exit code. */
406 t->rcu_read_lock_nesting = INT_MIN; 434 t->rcu_read_lock_nesting = INT_MIN;
407 barrier(); /* assign before ->rcu_read_unlock_special load */ 435 barrier(); /* assign before ->rcu_read_unlock_special load */
408 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 436 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
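
Editorial note: the hunk above only moves a barrier() into the outermost-unlock path, but the surrounding nesting protocol is easy to lose in the diff. Stripped of the barriers and per-task storage, it amounts to the following sketch (plain globals stand in for task_struct fields):

	#include <limits.h>
	#include <stdio.h>

	static int nesting;          /* models t->rcu_read_lock_nesting for one task */
	static int unlock_special;   /* models t->rcu_read_unlock_special */

	static void read_lock(void)
	{
		nesting++;
	}

	static void read_unlock(void)
	{
		if (nesting != 1) {
			--nesting;           /* inner critical section: just count down */
		} else {
			nesting = INT_MIN;   /* flag "outermost unlock in progress" */
			if (unlock_special)
				printf("running special cleanup\n");
			nesting = 0;         /* now fully outside the critical section */
		}
	}

	int main(void)
	{
		read_lock();
		read_lock();
		unlock_special = 1;
		read_unlock();   /* inner unlock: no cleanup */
		read_unlock();   /* outermost unlock: cleanup runs once */
		return 0;
	}
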
@@ -466,16 +494,20 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
466 * Scan the current list of tasks blocked within RCU read-side critical 494 * Scan the current list of tasks blocked within RCU read-side critical
467 * sections, printing out the tid of each. 495 * sections, printing out the tid of each.
468 */ 496 */
469static void rcu_print_task_stall(struct rcu_node *rnp) 497static int rcu_print_task_stall(struct rcu_node *rnp)
470{ 498{
471 struct task_struct *t; 499 struct task_struct *t;
500 int ndetected = 0;
472 501
473 if (!rcu_preempt_blocked_readers_cgp(rnp)) 502 if (!rcu_preempt_blocked_readers_cgp(rnp))
474 return; 503 return 0;
475 t = list_entry(rnp->gp_tasks, 504 t = list_entry(rnp->gp_tasks,
476 struct task_struct, rcu_node_entry); 505 struct task_struct, rcu_node_entry);
477 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) 506 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
478 printk(" P%d", t->pid); 507 printk(" P%d", t->pid);
508 ndetected++;
509 }
510 return ndetected;
479} 511}
480 512
481/* 513/*
@@ -656,18 +688,9 @@ EXPORT_SYMBOL_GPL(call_rcu);
656 */ 688 */
657void synchronize_rcu(void) 689void synchronize_rcu(void)
658{ 690{
659 struct rcu_synchronize rcu;
660
661 if (!rcu_scheduler_active) 691 if (!rcu_scheduler_active)
662 return; 692 return;
663 693 wait_rcu_gp(call_rcu);
664 init_rcu_head_on_stack(&rcu.head);
665 init_completion(&rcu.completion);
666 /* Will wake me after RCU finished. */
667 call_rcu(&rcu.head, wakeme_after_rcu);
668 /* Wait for it. */
669 wait_for_completion(&rcu.completion);
670 destroy_rcu_head_on_stack(&rcu.head);
671} 694}
672EXPORT_SYMBOL_GPL(synchronize_rcu); 695EXPORT_SYMBOL_GPL(synchronize_rcu);
673 696
@@ -709,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
709 * recursively up the tree. (Calm down, calm down, we do the recursion 732 * recursively up the tree. (Calm down, calm down, we do the recursion
710 * iteratively!) 733 * iteratively!)
711 * 734 *
735 * Most callers will set the "wake" flag, but the task initiating the
736 * expedited grace period need not wake itself.
737 *
712 * Caller must hold sync_rcu_preempt_exp_mutex. 738 * Caller must hold sync_rcu_preempt_exp_mutex.
713 */ 739 */
714static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) 740static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
741 bool wake)
715{ 742{
716 unsigned long flags; 743 unsigned long flags;
717 unsigned long mask; 744 unsigned long mask;
@@ -724,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
724 } 751 }
725 if (rnp->parent == NULL) { 752 if (rnp->parent == NULL) {
726 raw_spin_unlock_irqrestore(&rnp->lock, flags); 753 raw_spin_unlock_irqrestore(&rnp->lock, flags);
727 wake_up(&sync_rcu_preempt_exp_wq); 754 if (wake)
755 wake_up(&sync_rcu_preempt_exp_wq);
728 break; 756 break;
729 } 757 }
730 mask = rnp->grpmask; 758 mask = rnp->grpmask;
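
Editorial note: the new "wake" argument lets the task that initiated the expedited grace period report completion without waking itself. The report itself walks up the rcu_node tree, clearing this node's bit at each level, and only the root decides whether to wake the waiter. A miniature model of that upward propagation (the node layout and masks are invented, and all locking is omitted):

	#include <stdio.h>
	#include <stdbool.h>

	struct node {                      /* invented miniature of rcu_node */
		struct node *parent;
		unsigned int expmask;      /* groups still blocking the expedited GP */
		unsigned int grpmask;      /* this node's bit in its parent's expmask */
	};

	static void wake_waiter(void)
	{
		printf("waking expedited waiter\n");
	}

	/* Clear our bit at each level; only the root decides whether to wake. */
	static void report_exp(struct node *np, bool wake)
	{
		for (;;) {
			if (np->expmask != 0)          /* still waiting on siblings */
				return;
			if (np->parent == NULL) {
				if (wake)              /* initiating task need not wake itself */
					wake_waiter();
				return;
			}
			np->parent->expmask &= ~np->grpmask;
			np = np->parent;
		}
	}

	int main(void)
	{
		struct node root = { NULL, 0x1, 0 };
		struct node leaf = { &root, 0x0, 0x1 };

		report_exp(&leaf, true);       /* last blocker gone: wakes at the root */
		return 0;
	}
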
@@ -757,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
757 must_wait = 1; 785 must_wait = 1;
758 } 786 }
759 if (!must_wait) 787 if (!must_wait)
760 rcu_report_exp_rnp(rsp, rnp); 788 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
761} 789}
762 790
763/* 791/*
@@ -968,8 +996,9 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
968 * Because preemptible RCU does not exist, we never have to check for 996 * Because preemptible RCU does not exist, we never have to check for
969 * tasks blocked within RCU read-side critical sections. 997 * tasks blocked within RCU read-side critical sections.
970 */ 998 */
971static void rcu_print_task_stall(struct rcu_node *rnp) 999static int rcu_print_task_stall(struct rcu_node *rnp)
972{ 1000{
1001 return 0;
973} 1002}
974 1003
975/* 1004/*
@@ -1048,9 +1077,9 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1048 * report on tasks preempted in RCU read-side critical sections during 1077 * report on tasks preempted in RCU read-side critical sections during
1049 * expedited RCU grace periods. 1078 * expedited RCU grace periods.
1050 */ 1079 */
1051static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) 1080static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1081 bool wake)
1052{ 1082{
1053 return;
1054} 1083}
1055 1084
1056#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1085#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1199,12 +1228,12 @@ static int rcu_boost(struct rcu_node *rnp)
1199 t = container_of(tb, struct task_struct, rcu_node_entry); 1228 t = container_of(tb, struct task_struct, rcu_node_entry);
1200 rt_mutex_init_proxy_locked(&mtx, t); 1229 rt_mutex_init_proxy_locked(&mtx, t);
1201 t->rcu_boost_mutex = &mtx; 1230 t->rcu_boost_mutex = &mtx;
1202 t->rcu_boosted = 1;
1203 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1231 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1204 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1232 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1205 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1233 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1206 1234
1207 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; 1235 return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1236 ACCESS_ONCE(rnp->boost_tasks) != NULL;
1208} 1237}
1209 1238
1210/* 1239/*
@@ -1228,9 +1257,12 @@ static int rcu_boost_kthread(void *arg)
1228 int spincnt = 0; 1257 int spincnt = 0;
1229 int more2boost; 1258 int more2boost;
1230 1259
1260 trace_rcu_utilization("Start boost kthread@init");
1231 for (;;) { 1261 for (;;) {
1232 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1262 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1263 trace_rcu_utilization("End boost kthread@rcu_wait");
1233 rcu_wait(rnp->boost_tasks || rnp->exp_tasks); 1264 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1265 trace_rcu_utilization("Start boost kthread@rcu_wait");
1234 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1266 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1235 more2boost = rcu_boost(rnp); 1267 more2boost = rcu_boost(rnp);
1236 if (more2boost) 1268 if (more2boost)
@@ -1238,11 +1270,14 @@ static int rcu_boost_kthread(void *arg)
1238 else 1270 else
1239 spincnt = 0; 1271 spincnt = 0;
1240 if (spincnt > 10) { 1272 if (spincnt > 10) {
1273 trace_rcu_utilization("End boost kthread@rcu_yield");
1241 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); 1274 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1275 trace_rcu_utilization("Start boost kthread@rcu_yield");
1242 spincnt = 0; 1276 spincnt = 0;
1243 } 1277 }
1244 } 1278 }
1245 /* NOTREACHED */ 1279 /* NOTREACHED */
1280 trace_rcu_utilization("End boost kthread@notreached");
1246 return 0; 1281 return 0;
1247} 1282}
1248 1283
@@ -1291,15 +1326,22 @@ static void invoke_rcu_callbacks_kthread(void)
1291 1326
1292 local_irq_save(flags); 1327 local_irq_save(flags);
1293 __this_cpu_write(rcu_cpu_has_work, 1); 1328 __this_cpu_write(rcu_cpu_has_work, 1);
1294 if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { 1329 if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
1295 local_irq_restore(flags); 1330 current != __this_cpu_read(rcu_cpu_kthread_task))
1296 return; 1331 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1297 }
1298 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1299 local_irq_restore(flags); 1332 local_irq_restore(flags);
1300} 1333}
1301 1334
1302/* 1335/*
1336 * Is the current CPU running the RCU-callbacks kthread?
1337 * Caller must have preemption disabled.
1338 */
1339static bool rcu_is_callbacks_kthread(void)
1340{
1341 return __get_cpu_var(rcu_cpu_kthread_task) == current;
1342}
1343
1344/*
1303 * Set the affinity of the boost kthread. The CPU-hotplug locks are 1345 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1304 * held, so no one should be messing with the existence of the boost 1346 * held, so no one should be messing with the existence of the boost
1305 * kthread. 1347 * kthread.
@@ -1343,13 +1385,13 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1343 if (rnp->boost_kthread_task != NULL) 1385 if (rnp->boost_kthread_task != NULL)
1344 return 0; 1386 return 0;
1345 t = kthread_create(rcu_boost_kthread, (void *)rnp, 1387 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1346 "rcub%d", rnp_index); 1388 "rcub/%d", rnp_index);
1347 if (IS_ERR(t)) 1389 if (IS_ERR(t))
1348 return PTR_ERR(t); 1390 return PTR_ERR(t);
1349 raw_spin_lock_irqsave(&rnp->lock, flags); 1391 raw_spin_lock_irqsave(&rnp->lock, flags);
1350 rnp->boost_kthread_task = t; 1392 rnp->boost_kthread_task = t;
1351 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1393 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1352 sp.sched_priority = RCU_KTHREAD_PRIO; 1394 sp.sched_priority = RCU_BOOST_PRIO;
1353 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1395 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1354 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ 1396 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1355 return 0; 1397 return 0;
@@ -1444,6 +1486,7 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1444{ 1486{
1445 struct sched_param sp; 1487 struct sched_param sp;
1446 struct timer_list yield_timer; 1488 struct timer_list yield_timer;
1489 int prio = current->rt_priority;
1447 1490
1448 setup_timer_on_stack(&yield_timer, f, arg); 1491 setup_timer_on_stack(&yield_timer, f, arg);
1449 mod_timer(&yield_timer, jiffies + 2); 1492 mod_timer(&yield_timer, jiffies + 2);
@@ -1451,7 +1494,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1451 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); 1494 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1452 set_user_nice(current, 19); 1495 set_user_nice(current, 19);
1453 schedule(); 1496 schedule();
1454 sp.sched_priority = RCU_KTHREAD_PRIO; 1497 set_user_nice(current, 0);
1498 sp.sched_priority = prio;
1455 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); 1499 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1456 del_timer(&yield_timer); 1500 del_timer(&yield_timer);
1457} 1501}
@@ -1489,7 +1533,8 @@ static int rcu_cpu_kthread_should_stop(int cpu)
1489 1533
1490/* 1534/*
1491 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the 1535 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1492 * earlier RCU softirq. 1536 * RCU softirq used in flavors and configurations of RCU that do not
1537 * support RCU priority boosting.
1493 */ 1538 */
1494static int rcu_cpu_kthread(void *arg) 1539static int rcu_cpu_kthread(void *arg)
1495{ 1540{
@@ -1500,9 +1545,12 @@ static int rcu_cpu_kthread(void *arg)
1500 char work; 1545 char work;
1501 char *workp = &per_cpu(rcu_cpu_has_work, cpu); 1546 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1502 1547
1548 trace_rcu_utilization("Start CPU kthread@init");
1503 for (;;) { 1549 for (;;) {
1504 *statusp = RCU_KTHREAD_WAITING; 1550 *statusp = RCU_KTHREAD_WAITING;
1551 trace_rcu_utilization("End CPU kthread@rcu_wait");
1505 rcu_wait(*workp != 0 || kthread_should_stop()); 1552 rcu_wait(*workp != 0 || kthread_should_stop());
1553 trace_rcu_utilization("Start CPU kthread@rcu_wait");
1506 local_bh_disable(); 1554 local_bh_disable();
1507 if (rcu_cpu_kthread_should_stop(cpu)) { 1555 if (rcu_cpu_kthread_should_stop(cpu)) {
1508 local_bh_enable(); 1556 local_bh_enable();
@@ -1523,11 +1571,14 @@ static int rcu_cpu_kthread(void *arg)
1523 spincnt = 0; 1571 spincnt = 0;
1524 if (spincnt > 10) { 1572 if (spincnt > 10) {
1525 *statusp = RCU_KTHREAD_YIELDING; 1573 *statusp = RCU_KTHREAD_YIELDING;
1574 trace_rcu_utilization("End CPU kthread@rcu_yield");
1526 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); 1575 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1576 trace_rcu_utilization("Start CPU kthread@rcu_yield");
1527 spincnt = 0; 1577 spincnt = 0;
1528 } 1578 }
1529 } 1579 }
1530 *statusp = RCU_KTHREAD_STOPPED; 1580 *statusp = RCU_KTHREAD_STOPPED;
1581 trace_rcu_utilization("End CPU kthread@term");
1531 return 0; 1582 return 0;
1532} 1583}
1533 1584
@@ -1560,7 +1611,10 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1560 if (!rcu_scheduler_fully_active || 1611 if (!rcu_scheduler_fully_active ||
1561 per_cpu(rcu_cpu_kthread_task, cpu) != NULL) 1612 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1562 return 0; 1613 return 0;
1563 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); 1614 t = kthread_create_on_node(rcu_cpu_kthread,
1615 (void *)(long)cpu,
1616 cpu_to_node(cpu),
1617 "rcuc/%d", cpu);
1564 if (IS_ERR(t)) 1618 if (IS_ERR(t))
1565 return PTR_ERR(t); 1619 return PTR_ERR(t);
1566 if (cpu_online(cpu)) 1620 if (cpu_online(cpu))
@@ -1669,7 +1723,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1669 return 0; 1723 return 0;
1670 if (rnp->node_kthread_task == NULL) { 1724 if (rnp->node_kthread_task == NULL) {
1671 t = kthread_create(rcu_node_kthread, (void *)rnp, 1725 t = kthread_create(rcu_node_kthread, (void *)rnp,
1672 "rcun%d", rnp_index); 1726 "rcun/%d", rnp_index);
1673 if (IS_ERR(t)) 1727 if (IS_ERR(t))
1674 return PTR_ERR(t); 1728 return PTR_ERR(t);
1675 raw_spin_lock_irqsave(&rnp->lock, flags); 1729 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1731,6 +1785,11 @@ static void invoke_rcu_callbacks_kthread(void)
1731 WARN_ON_ONCE(1); 1785 WARN_ON_ONCE(1);
1732} 1786}
1733 1787
1788static bool rcu_is_callbacks_kthread(void)
1789{
1790 return false;
1791}
1792
1734static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1793static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1735{ 1794{
1736} 1795}
@@ -1866,7 +1925,7 @@ void synchronize_sched_expedited(void)
1866 * grace period works for us. 1925 * grace period works for us.
1867 */ 1926 */
1868 get_online_cpus(); 1927 get_online_cpus();
1869 snap = atomic_read(&sync_sched_expedited_started) - 1; 1928 snap = atomic_read(&sync_sched_expedited_started);
1870 smp_mb(); /* ensure read is before try_stop_cpus(). */ 1929 smp_mb(); /* ensure read is before try_stop_cpus(). */
1871 } 1930 }
1872 1931
@@ -1898,113 +1957,243 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1898 * 1 if so. This function is part of the RCU implementation; it is -not- 1957 * 1 if so. This function is part of the RCU implementation; it is -not-
1899 * an exported member of the RCU API. 1958 * an exported member of the RCU API.
1900 * 1959 *
1901 * Because we have preemptible RCU, just check whether this CPU needs 1960 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
1902 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption 1961 * any flavor of RCU.
1903 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
1904 */ 1962 */
1905int rcu_needs_cpu(int cpu) 1963int rcu_needs_cpu(int cpu)
1906{ 1964{
1907 return rcu_needs_cpu_quick_check(cpu); 1965 return rcu_cpu_has_callbacks(cpu);
1966}
1967
1968/*
1969 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1970 */
1971static void rcu_prepare_for_idle_init(int cpu)
1972{
1908} 1973}
1909 1974
1910/* 1975/*
1911 * Check to see if we need to continue a callback-flush operations to 1976 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1912 * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle 1977 * after it.
1913 * entry is not configured, so we never do need to.
1914 */ 1978 */
1915static void rcu_needs_cpu_flush(void) 1979static void rcu_cleanup_after_idle(int cpu)
1980{
1981}
1982
1983/*
1984 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y,
1985 * is nothing.
1986 */
1987static void rcu_prepare_for_idle(int cpu)
1916{ 1988{
1917} 1989}
1918 1990
1919#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1991#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1920 1992
1921#define RCU_NEEDS_CPU_FLUSHES 5 1993/*
1994 * This code is invoked when a CPU goes idle, at which point we want
1995 * to have the CPU do everything required for RCU so that it can enter
1996 * the energy-efficient dyntick-idle mode. This is handled by a
1997 * state machine implemented by rcu_prepare_for_idle() below.
1998 *
1999 * The following three preprocessor symbols control this state machine:
2000 *
2001 * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
2002 * to satisfy RCU. Beyond this point, it is better to incur a periodic
2003 * scheduling-clock interrupt than to loop through the state machine
2004 * at full power.
2005 * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
2006 * optional if RCU does not need anything immediately from this
2007 * CPU, even if this CPU still has RCU callbacks queued. The first
2008 * times through the state machine are mandatory: we need to give
2009 * the state machine a chance to communicate a quiescent state
2010 * to the RCU core.
2011 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
2012 * to sleep in dyntick-idle mode with RCU callbacks pending. This
2013 * is sized to be roughly one RCU grace period. Those energy-efficiency
2014 * benchmarkers who might otherwise be tempted to set this to a large
2015 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
2016 * system. And if you are -that- concerned about energy efficiency,
2017 * just power the system down and be done with it!
2018 *
2019 * The values below work well in practice. If future workloads require
2020 * adjustment, they can be converted into kernel config parameters, though
2021 * making the state machine smarter might be a better option.
2022 */
2023#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
2024#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
2025#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
2026
1922static DEFINE_PER_CPU(int, rcu_dyntick_drain); 2027static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1923static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 2028static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
2029static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
2030static ktime_t rcu_idle_gp_wait;
1924 2031
1925/* 2032/*
1926 * Check to see if any future RCU-related work will need to be done 2033 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1927 * by the current CPU, even if none need be done immediately, returning 2034 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1928 * 1 if so. This function is part of the RCU implementation; it is -not- 2035 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1929 * an exported member of the RCU API. 2036 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
2037 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
2038 * it is better to incur scheduling-clock interrupts than to spin
2039 * continuously for the same time duration!
2040 */
2041int rcu_needs_cpu(int cpu)
2042{
2043 /* If no callbacks, RCU doesn't need the CPU. */
2044 if (!rcu_cpu_has_callbacks(cpu))
2045 return 0;
2046 /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
2047 return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
2048}
2049
2050/*
2051 * Timer handler used to force CPU to start pushing its remaining RCU
2052 * callbacks in the case where it entered dyntick-idle mode with callbacks
2053 * pending. The handler doesn't really need to do anything because the
2054 * real work is done upon re-entry to idle, or by the next scheduling-clock
2055 * interrupt should idle not be re-entered.
2056 */
2057static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
2058{
2059 trace_rcu_prep_idle("Timer");
2060 return HRTIMER_NORESTART;
2061}
2062
2063/*
2064 * Initialize the timer used to pull CPUs out of dyntick-idle mode.
2065 */
2066static void rcu_prepare_for_idle_init(int cpu)
2067{
2068 static int firsttime = 1;
2069 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
2070
2071 hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2072 hrtp->function = rcu_idle_gp_timer_func;
2073 if (firsttime) {
2074 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2075
2076 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2077 firsttime = 0;
2078 }
2079}
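
Editorial note: rcu_prepare_for_idle_init() above arms nothing itself; it only prepares a one-shot hrtimer, sized in jiffies and converted to nanoseconds, that later nudges a CPU that went dyntick-idle with callbacks still queued. A loose userspace analog of that pattern using a POSIX one-shot timer (tick length and delay are invented, and timer_create() here is the POSIX API, not the kernel's hrtimer interface):

	#include <signal.h>
	#include <stdio.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define GP_DELAY_TICKS 6              /* "roughly one grace period" */
	#define TICK_USEC 10000               /* invented 100HZ tick length */

	static void timer_fired(union sigval sv)
	{
		(void)sv;
		printf("idle GP timer fired: push remaining callbacks\n");
	}

	int main(void)
	{
		struct sigevent sev;
		struct itimerspec its;
		timer_t tid;

		memset(&sev, 0, sizeof(sev));
		memset(&its, 0, sizeof(its));
		sev.sigev_notify = SIGEV_THREAD;
		sev.sigev_notify_function = timer_fired;
		timer_create(CLOCK_MONOTONIC, &sev, &tid);

		/* Convert the tick count to nanoseconds, as the init code above does. */
		its.it_value.tv_nsec = (long)GP_DELAY_TICKS * TICK_USEC * 1000;
		timer_settime(tid, 0, &its, NULL);

		sleep(1);                     /* give the one-shot timer time to expire */
		timer_delete(tid);
		return 0;
	}
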
2080
2081/*
2082 * Clean up for exit from idle. Because we are exiting from idle, there
2083 * is no longer any point to rcu_idle_gp_timer, so cancel it. This will
2084 * do nothing if this timer is not active, so just cancel it unconditionally.
2085 */
2086static void rcu_cleanup_after_idle(int cpu)
2087{
2088 hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu));
2089}
2090
2091/*
2092 * Check to see if any RCU-related work can be done by the current CPU,
2093 * and if so, schedule a softirq to get it done. This function is part
2094 * of the RCU implementation; it is -not- an exported member of the RCU API.
1930 * 2095 *
1931 * Because we are not supporting preemptible RCU, attempt to accelerate 2096 * The idea is for the current CPU to clear out all work required by the
1932 * any current grace periods so that RCU no longer needs this CPU, but 2097 * RCU core for the current grace period, so that this CPU can be permitted
1933 * only if all other CPUs are already in dynticks-idle mode. This will 2098 * to enter dyntick-idle mode. In some cases, it will need to be awakened
1934 * allow the CPU cores to be powered down immediately, as opposed to after 2099 * at the end of the grace period by whatever CPU ends the grace period.
1935 * waiting many milliseconds for grace periods to elapse. 2100 * This allows CPUs to go dyntick-idle more quickly, and to reduce the
2101 * number of wakeups by a modest integer factor.
1936 * 2102 *
1937 * Because it is not legal to invoke rcu_process_callbacks() with irqs 2103 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1938 * disabled, we do one pass of force_quiescent_state(), then do a 2104 * disabled, we do one pass of force_quiescent_state(), then do a
1939 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked 2105 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1940 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. 2106 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
2107 *
2108 * The caller must have disabled interrupts.
1941 */ 2109 */
1942int rcu_needs_cpu(int cpu) 2110static void rcu_prepare_for_idle(int cpu)
1943{ 2111{
1944 int c = 0; 2112 unsigned long flags;
1945 int snap; 2113
1946 int thatcpu; 2114 local_irq_save(flags);
1947 2115
1948 /* Check for being in the holdoff period. */ 2116 /*
1949 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) 2117 * If there are no callbacks on this CPU, enter dyntick-idle mode.
1950 return rcu_needs_cpu_quick_check(cpu); 2118 * Also reset state to avoid prejudicing later attempts.
1951 2119 */
1952 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 2120 if (!rcu_cpu_has_callbacks(cpu)) {
1953 for_each_online_cpu(thatcpu) { 2121 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1954 if (thatcpu == cpu) 2122 per_cpu(rcu_dyntick_drain, cpu) = 0;
1955 continue; 2123 local_irq_restore(flags);
1956 snap = atomic_add_return(0, &per_cpu(rcu_dynticks, 2124 trace_rcu_prep_idle("No callbacks");
1957 thatcpu).dynticks); 2125 return;
1958 smp_mb(); /* Order sampling of snap with end of grace period. */ 2126 }
1959 if ((snap & 0x1) != 0) { 2127
1960 per_cpu(rcu_dyntick_drain, cpu) = 0; 2128 /*
1961 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2129 * If in holdoff mode, just return. We will presumably have
1962 return rcu_needs_cpu_quick_check(cpu); 2130 * refrained from disabling the scheduling-clock tick.
1963 } 2131 */
2132 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
2133 local_irq_restore(flags);
2134 trace_rcu_prep_idle("In holdoff");
2135 return;
1964 } 2136 }
1965 2137
1966 /* Check and update the rcu_dyntick_drain sequencing. */ 2138 /* Check and update the rcu_dyntick_drain sequencing. */
1967 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2139 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1968 /* First time through, initialize the counter. */ 2140 /* First time through, initialize the counter. */
1969 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; 2141 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;
2142 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES &&
2143 !rcu_pending(cpu)) {
2144 /* Can we go dyntick-idle despite still having callbacks? */
2145 trace_rcu_prep_idle("Dyntick with callbacks");
2146 per_cpu(rcu_dyntick_drain, cpu) = 0;
2147 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2148 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2149 rcu_idle_gp_wait, HRTIMER_MODE_REL);
2150 return; /* Nothing more to do immediately. */
1970 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2151 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1971 /* We have hit the limit, so time to give up. */ 2152 /* We have hit the limit, so time to give up. */
1972 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2153 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1973 return rcu_needs_cpu_quick_check(cpu); 2154 local_irq_restore(flags);
2155 trace_rcu_prep_idle("Begin holdoff");
2156 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2157 return;
1974 } 2158 }
1975 2159
1976 /* Do one step pushing remaining RCU callbacks through. */ 2160 /*
2161 * Do one step of pushing the remaining RCU callbacks through
2162 * the RCU core state machine.
2163 */
2164#ifdef CONFIG_TREE_PREEMPT_RCU
2165 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
2166 local_irq_restore(flags);
2167 rcu_preempt_qs(cpu);
2168 force_quiescent_state(&rcu_preempt_state, 0);
2169 local_irq_save(flags);
2170 }
2171#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1977 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 2172 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
2173 local_irq_restore(flags);
1978 rcu_sched_qs(cpu); 2174 rcu_sched_qs(cpu);
1979 force_quiescent_state(&rcu_sched_state, 0); 2175 force_quiescent_state(&rcu_sched_state, 0);
1980 c = c || per_cpu(rcu_sched_data, cpu).nxtlist; 2176 local_irq_save(flags);
1981 } 2177 }
1982 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 2178 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
2179 local_irq_restore(flags);
1983 rcu_bh_qs(cpu); 2180 rcu_bh_qs(cpu);
1984 force_quiescent_state(&rcu_bh_state, 0); 2181 force_quiescent_state(&rcu_bh_state, 0);
1985 c = c || per_cpu(rcu_bh_data, cpu).nxtlist; 2182 local_irq_save(flags);
1986 } 2183 }
1987 2184
1988 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 2185 /*
1989 if (c) 2186 * If RCU callbacks are still pending, RCU still needs this CPU.
2187 * So try forcing the callbacks through the grace period.
2188 */
2189 if (rcu_cpu_has_callbacks(cpu)) {
2190 local_irq_restore(flags);
2191 trace_rcu_prep_idle("More callbacks");
1990 invoke_rcu_core(); 2192 invoke_rcu_core();
1991 return c; 2193 } else {
1992} 2194 local_irq_restore(flags);
1993 2195 trace_rcu_prep_idle("Callbacks drained");
1994/* 2196 }
1995 * Check to see if we need to continue a callback-flush operations to
1996 * allow the last CPU to enter dyntick-idle mode.
1997 */
1998static void rcu_needs_cpu_flush(void)
1999{
2000 int cpu = smp_processor_id();
2001 unsigned long flags;
2002
2003 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
2004 return;
2005 local_irq_save(flags);
2006 (void)rcu_needs_cpu(cpu);
2007 local_irq_restore(flags);
2008} 2197}
2009 2198
2010#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2199#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
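
Note on the rcutree_plugin.h hunk above: it replaces the old rcu_needs_cpu()/rcu_needs_cpu_flush() pair with rcu_prepare_for_idle(), which runs with interrupts disabled, emits a trace_rcu_prep_idle() event for every decision, and can now let a CPU stop its tick even with callbacks still queued by arming rcu_idle_gp_timer. The sequencing still rests on two per-CPU variables: a drain counter bounding how many flush attempts are made, and a jiffies-stamped holdoff that suppresses further attempts for the rest of the tick. The following self-contained userspace sketch shows only that counter/holdoff logic; the names, the flush limit and the simulated jiffies counter are illustrative stand-ins, and the "dyntick with callbacks" hrtimer branch is omitted.

/*
 * Illustrative userspace sketch of the drain/holdoff sequencing used by
 * rcu_prepare_for_idle() above.  Not kernel code: "jiffies" is simulated
 * and RCU_IDLE_FLUSHES is an assumed value.
 */
#include <stdbool.h>
#include <stdio.h>

#define RCU_IDLE_FLUSHES        5       /* assumed limit, for illustration */

struct cpu_state {
        int drain;              /* plays the role of rcu_dyntick_drain   */
        unsigned long holdoff;  /* plays the role of rcu_dyntick_holdoff */
};

static unsigned long jiffies;   /* simulated tick counter */

/* Decide what to do on idle entry; true means the CPU may stop its tick. */
static bool prepare_for_idle(struct cpu_state *cs, bool have_callbacks)
{
        if (!have_callbacks) {
                cs->holdoff = jiffies - 1;      /* reset state ("No callbacks") */
                cs->drain = 0;
                return true;
        }
        if (cs->holdoff == jiffies)
                return false;                   /* "In holdoff" */
        if (cs->drain <= 0) {
                cs->drain = RCU_IDLE_FLUSHES;   /* first pass: arm the counter */
        } else if (--cs->drain <= 0) {
                cs->holdoff = jiffies;          /* "Begin holdoff" until next jiffy */
                return false;
        }
        /* The kernel now pushes callbacks through and re-checks ("More callbacks"). */
        return false;
}

int main(void)
{
        struct cpu_state cs = { 0, 0 };

        for (jiffies = 1; jiffies <= 2; jiffies++)
                for (int pass = 0; pass < 8; pass++)
                        printf("j=%lu pass=%d -> %s\n", jiffies, pass,
                               prepare_for_idle(&cs, pass < 6) ? "stop tick" : "keep tick");
        return 0;
}
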
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 3b0c0986afc0..654cfe67f0d1 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -48,11 +48,6 @@
48 48
49#ifdef CONFIG_RCU_BOOST 49#ifdef CONFIG_RCU_BOOST
50 50
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
52DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
53DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
54DECLARE_PER_CPU(char, rcu_cpu_has_work);
55
56static char convert_kthread_status(unsigned int kthread_status) 51static char convert_kthread_status(unsigned int kthread_status)
57{ 52{
58 if (kthread_status > RCU_KTHREAD_MAX) 53 if (kthread_status > RCU_KTHREAD_MAX)
@@ -66,19 +61,17 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
66{ 61{
67 if (!rdp->beenonline) 62 if (!rdp->beenonline)
68 return; 63 return;
69 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d", 64 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d",
70 rdp->cpu, 65 rdp->cpu,
71 cpu_is_offline(rdp->cpu) ? '!' : ' ', 66 cpu_is_offline(rdp->cpu) ? '!' : ' ',
72 rdp->completed, rdp->gpnum, 67 rdp->completed, rdp->gpnum,
73 rdp->passed_quiesc, rdp->passed_quiesc_completed, 68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
74 rdp->qs_pending); 69 rdp->qs_pending);
75#ifdef CONFIG_NO_HZ 70 seq_printf(m, " dt=%d/%llx/%d df=%lu",
76 seq_printf(m, " dt=%d/%d/%d df=%lu",
77 atomic_read(&rdp->dynticks->dynticks), 71 atomic_read(&rdp->dynticks->dynticks),
78 rdp->dynticks->dynticks_nesting, 72 rdp->dynticks->dynticks_nesting,
79 rdp->dynticks->dynticks_nmi_nesting, 73 rdp->dynticks->dynticks_nmi_nesting,
80 rdp->dynticks_fqs); 74 rdp->dynticks_fqs);
81#endif /* #ifdef CONFIG_NO_HZ */
82 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 75 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
83 seq_printf(m, " ql=%ld qs=%c%c%c%c", 76 seq_printf(m, " ql=%ld qs=%c%c%c%c",
84 rdp->qlen, 77 rdp->qlen,
@@ -144,15 +137,13 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
144 rdp->cpu, 137 rdp->cpu,
145 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 138 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
146 rdp->completed, rdp->gpnum, 139 rdp->completed, rdp->gpnum,
147 rdp->passed_quiesc, rdp->passed_quiesc_completed, 140 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
148 rdp->qs_pending); 141 rdp->qs_pending);
149#ifdef CONFIG_NO_HZ 142 seq_printf(m, ",%d,%llx,%d,%lu",
150 seq_printf(m, ",%d,%d,%d,%lu",
151 atomic_read(&rdp->dynticks->dynticks), 143 atomic_read(&rdp->dynticks->dynticks),
152 rdp->dynticks->dynticks_nesting, 144 rdp->dynticks->dynticks_nesting,
153 rdp->dynticks->dynticks_nmi_nesting, 145 rdp->dynticks->dynticks_nmi_nesting,
154 rdp->dynticks_fqs); 146 rdp->dynticks_fqs);
155#endif /* #ifdef CONFIG_NO_HZ */
156 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 147 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
157 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, 148 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
158 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 149 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
@@ -175,10 +166,8 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
175 166
176static int show_rcudata_csv(struct seq_file *m, void *unused) 167static int show_rcudata_csv(struct seq_file *m, void *unused)
177{ 168{
178 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); 169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
179#ifdef CONFIG_NO_HZ
180 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
181#endif /* #ifdef CONFIG_NO_HZ */
182 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); 171 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
183#ifdef CONFIG_RCU_BOOST 172#ifdef CONFIG_RCU_BOOST
184 seq_puts(m, "\"kt\",\"ktl\""); 173 seq_puts(m, "\"kt\",\"ktl\"");
@@ -283,7 +272,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
283 gpnum = rsp->gpnum; 272 gpnum = rsp->gpnum;
284 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
285 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
286 rsp->completed, gpnum, rsp->signaled, 275 rsp->completed, gpnum, rsp->fqs_state,
287 (long)(rsp->jiffies_force_qs - jiffies), 276 (long)(rsp->jiffies_force_qs - jiffies),
288 (int)(jiffies & 0xffff), 277 (int)(jiffies & 0xffff),
289 rsp->n_force_qs, rsp->n_force_qs_ngp, 278 rsp->n_force_qs, rsp->n_force_qs_ngp,
diff --git a/kernel/relay.c b/kernel/relay.c
index 859ea5a9605f..4335e1d7ee2d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -15,7 +15,7 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/stddef.h> 16#include <linux/stddef.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/module.h> 18#include <linux/export.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/relay.h> 20#include <linux/relay.h>
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
@@ -302,7 +302,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,
302 */ 302 */
303static struct dentry *create_buf_file_default_callback(const char *filename, 303static struct dentry *create_buf_file_default_callback(const char *filename,
304 struct dentry *parent, 304 struct dentry *parent,
305 int mode, 305 umode_t mode,
306 struct rchan_buf *buf, 306 struct rchan_buf *buf,
307 int *is_global) 307 int *is_global)
308{ 308{
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 34683efa2cce..6d269cce7aa1 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -159,8 +159,7 @@ int res_counter_memparse_write_strategy(const char *buf,
159 return 0; 159 return 0;
160 } 160 }
161 161
162 /* FIXME - make memparse() take const char* args */ 162 *res = memparse(buf, &end);
163 *res = memparse((char *)buf, &end);
164 if (*end != '\0') 163 if (*end != '\0')
165 return -EINVAL; 164 return -EINVAL;
166 165
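
The res_counter.c hunk above is possible because memparse() now takes a const char * argument, so both the FIXME and the cast can go. For reference, memparse() (lib/cmdline.c) reads a number with an optional size suffix and reports where parsing stopped; below is a simplified userspace analogue, without the kernel's larger suffix set.

/*
 * Simplified userspace analogue of the kernel's memparse() helper used in
 * the hunk above.  The real version also handles T/P/E suffixes.
 */
#include <stdio.h>
#include <stdlib.h>

static unsigned long long memparse_demo(const char *ptr, char **retptr)
{
        char *end;
        unsigned long long ret = strtoull(ptr, &end, 0);

        switch (*end) {
        case 'G': case 'g':
                ret <<= 10;     /* fall through */
        case 'M': case 'm':
                ret <<= 10;     /* fall through */
        case 'K': case 'k':
                ret <<= 10;
                end++;
        }
        if (retptr)
                *retptr = end;
        return ret;
}

int main(void)
{
        char *end;
        unsigned long long val = memparse_demo("512K", &end);

        printf("%llu, rest='%s'\n", val, end);  /* 524288, rest='' */
        return 0;
}
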
diff --git a/kernel/resource.c b/kernel/resource.c
index c8dc249da5ce..7640b3a947d0 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,7 +7,7 @@
7 * Arbitrary resource management. 7 * Arbitrary resource management.
8 */ 8 */
9 9
10#include <linux/module.h> 10#include <linux/export.h>
11#include <linux/errno.h> 11#include <linux/errno.h>
12#include <linux/ioport.h> 12#include <linux/ioport.h>
13#include <linux/init.h> 13#include <linux/init.h>
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 3c7cbc2c33be..16502d3a71c8 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -18,7 +18,7 @@
18 */ 18 */
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/delay.h> 20#include <linux/delay.h>
21#include <linux/module.h> 21#include <linux/export.h>
22#include <linux/spinlock.h> 22#include <linux/spinlock.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/syscalls.h> 24#include <linux/syscalls.h>
@@ -29,61 +29,6 @@
29 29
30#include "rtmutex_common.h" 30#include "rtmutex_common.h"
31 31
32# define TRACE_WARN_ON(x) WARN_ON(x)
33# define TRACE_BUG_ON(x) BUG_ON(x)
34
35# define TRACE_OFF() \
36do { \
37 if (rt_trace_on) { \
38 rt_trace_on = 0; \
39 console_verbose(); \
40 if (raw_spin_is_locked(&current->pi_lock)) \
41 raw_spin_unlock(&current->pi_lock); \
42 } \
43} while (0)
44
45# define TRACE_OFF_NOLOCK() \
46do { \
47 if (rt_trace_on) { \
48 rt_trace_on = 0; \
49 console_verbose(); \
50 } \
51} while (0)
52
53# define TRACE_BUG_LOCKED() \
54do { \
55 TRACE_OFF(); \
56 BUG(); \
57} while (0)
58
59# define TRACE_WARN_ON_LOCKED(c) \
60do { \
61 if (unlikely(c)) { \
62 TRACE_OFF(); \
63 WARN_ON(1); \
64 } \
65} while (0)
66
67# define TRACE_BUG_ON_LOCKED(c) \
68do { \
69 if (unlikely(c)) \
70 TRACE_BUG_LOCKED(); \
71} while (0)
72
73#ifdef CONFIG_SMP
74# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
75#else
76# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
77#endif
78
79/*
80 * deadlock detection flag. We turn it off when we detect
81 * the first problem because we dont want to recurse back
82 * into the tracing code when doing error printk or
83 * executing a BUG():
84 */
85static int rt_trace_on = 1;
86
87static void printk_task(struct task_struct *p) 32static void printk_task(struct task_struct *p)
88{ 33{
89 if (p) 34 if (p)
@@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
111 56
112void rt_mutex_debug_task_free(struct task_struct *task) 57void rt_mutex_debug_task_free(struct task_struct *task)
113{ 58{
114 WARN_ON(!plist_head_empty(&task->pi_waiters)); 59 DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters));
115 WARN_ON(task->pi_blocked_on); 60 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
116} 61}
117 62
118/* 63/*
@@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
125{ 70{
126 struct task_struct *task; 71 struct task_struct *task;
127 72
128 if (!rt_trace_on || detect || !act_waiter) 73 if (!debug_locks || detect || !act_waiter)
129 return; 74 return;
130 75
131 task = rt_mutex_owner(act_waiter->lock); 76 task = rt_mutex_owner(act_waiter->lock);
@@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
139{ 84{
140 struct task_struct *task; 85 struct task_struct *task;
141 86
142 if (!waiter->deadlock_lock || !rt_trace_on) 87 if (!waiter->deadlock_lock || !debug_locks)
143 return; 88 return;
144 89
145 rcu_read_lock(); 90 rcu_read_lock();
@@ -149,10 +94,14 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
149 return; 94 return;
150 } 95 }
151 96
152 TRACE_OFF_NOLOCK(); 97 if (!debug_locks_off()) {
98 rcu_read_unlock();
99 return;
100 }
153 101
154 printk("\n============================================\n"); 102 printk("\n============================================\n");
155 printk( "[ BUG: circular locking deadlock detected! ]\n"); 103 printk( "[ BUG: circular locking deadlock detected! ]\n");
104 printk("%s\n", print_tainted());
156 printk( "--------------------------------------------\n"); 105 printk( "--------------------------------------------\n");
157 printk("%s/%d is deadlocking current task %s/%d\n\n", 106 printk("%s/%d is deadlocking current task %s/%d\n\n",
158 task->comm, task_pid_nr(task), 107 task->comm, task_pid_nr(task),
@@ -180,7 +129,6 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
180 129
181 printk("[ turning off deadlock detection." 130 printk("[ turning off deadlock detection."
182 "Please report this trace. ]\n\n"); 131 "Please report this trace. ]\n\n");
183 local_irq_disable();
184} 132}
185 133
186void debug_rt_mutex_lock(struct rt_mutex *lock) 134void debug_rt_mutex_lock(struct rt_mutex *lock)
@@ -189,7 +137,7 @@ void debug_rt_mutex_lock(struct rt_mutex *lock)
189 137
190void debug_rt_mutex_unlock(struct rt_mutex *lock) 138void debug_rt_mutex_unlock(struct rt_mutex *lock)
191{ 139{
192 TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); 140 DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
193} 141}
194 142
195void 143void
@@ -199,7 +147,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
199 147
200void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) 148void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
201{ 149{
202 TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); 150 DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
203} 151}
204 152
205void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) 153void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
@@ -213,8 +161,8 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
213void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) 161void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
214{ 162{
215 put_pid(waiter->deadlock_task_pid); 163 put_pid(waiter->deadlock_task_pid);
216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); 164 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 165 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
218 memset(waiter, 0x22, sizeof(*waiter)); 166 memset(waiter, 0x22, sizeof(*waiter));
219} 167}
220 168
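
The rtmutex-debug.c diff above drops the file-local rt_trace_on flag and the TRACE_* macro family in favour of the kernel's shared lock-debugging infrastructure (debug_locks, debug_locks_off(), DEBUG_LOCKS_WARN_ON()), and adds print_tainted() to the report header. The practical effect is that the first deadlock report atomically disables all further lock debugging, the same one-shot behaviour lockdep uses, instead of maintaining a private flag. A minimal userspace analogue of that one-shot pattern, with simplified stand-ins for the kernel helpers:

/*
 * Userspace sketch of the one-shot reporting pattern the hunk above
 * converts to.  debug_locks and debug_locks_off() are simplified stand-ins
 * for the kernel versions in lib/debug_locks.c.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int debug_locks = 1;

/* Returns nonzero only for the caller that actually turned debugging off. */
static int debug_locks_off(void)
{
        return atomic_exchange(&debug_locks, 0);
}

/* Stand-in for DEBUG_LOCKS_WARN_ON(): warn once, then stay silent. */
static int debug_locks_warn_on(int cond, const char *what)
{
        if (cond && debug_locks_off())
                fprintf(stderr, "WARN: %s\n", what);
        return cond;
}

static void report_deadlock(void)
{
        if (!debug_locks_off())
                return;         /* some report already ran; stay quiet */
        fprintf(stderr, "[ BUG: circular locking deadlock detected! ]\n");
}

int main(void)
{
        report_deadlock();                              /* prints the banner */
        report_deadlock();                              /* silent: one-shot  */
        debug_locks_warn_on(1, "pi_blocked_on set");    /* silent as well    */
        return 0;
}
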
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 5c9ccd380966..98ec49475460 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -6,11 +6,11 @@
6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> 6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
7 * 7 *
8 */ 8 */
9#include <linux/device.h>
9#include <linux/kthread.h> 10#include <linux/kthread.h>
10#include <linux/module.h> 11#include <linux/export.h>
11#include <linux/sched.h> 12#include <linux/sched.h>
12#include <linux/spinlock.h> 13#include <linux/spinlock.h>
13#include <linux/sysdev.h>
14#include <linux/timer.h> 14#include <linux/timer.h>
15#include <linux/freezer.h> 15#include <linux/freezer.h>
16 16
@@ -27,7 +27,7 @@ struct test_thread_data {
27 int opdata; 27 int opdata;
28 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
29 int event; 29 int event;
30 struct sys_device sysdev; 30 struct device dev;
31}; 31};
32 32
33static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; 33static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
@@ -271,7 +271,7 @@ static int test_func(void *data)
271 * 271 *
272 * opcode:data 272 * opcode:data
273 */ 273 */
274static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, 274static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
275 const char *buf, size_t count) 275 const char *buf, size_t count)
276{ 276{
277 struct sched_param schedpar; 277 struct sched_param schedpar;
@@ -279,8 +279,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut
279 char cmdbuf[32]; 279 char cmdbuf[32];
280 int op, dat, tid, ret; 280 int op, dat, tid, ret;
281 281
282 td = container_of(dev, struct test_thread_data, sysdev); 282 td = container_of(dev, struct test_thread_data, dev);
283 tid = td->sysdev.id; 283 tid = td->dev.id;
284 284
285 /* strings from sysfs write are not 0 terminated! */ 285 /* strings from sysfs write are not 0 terminated! */
286 if (count >= sizeof(cmdbuf)) 286 if (count >= sizeof(cmdbuf))
@@ -334,7 +334,7 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut
334 * @dev: thread to query 334 * @dev: thread to query
335 * @buf: char buffer to be filled with thread status info 335 * @buf: char buffer to be filled with thread status info
336 */ 336 */
337static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, 337static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
338 char *buf) 338 char *buf)
339{ 339{
340 struct test_thread_data *td; 340 struct test_thread_data *td;
@@ -342,8 +342,8 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
342 char *curr = buf; 342 char *curr = buf;
343 int i; 343 int i;
344 344
345 td = container_of(dev, struct test_thread_data, sysdev); 345 td = container_of(dev, struct test_thread_data, dev);
346 tsk = threads[td->sysdev.id]; 346 tsk = threads[td->dev.id];
347 347
348 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
349 349
@@ -360,28 +360,29 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
360 spin_unlock(&rttest_lock); 360 spin_unlock(&rttest_lock);
361 361
362 curr += sprintf(curr, ", T: %p, R: %p\n", tsk, 362 curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
363 mutexes[td->sysdev.id].owner); 363 mutexes[td->dev.id].owner);
364 364
365 return curr - buf; 365 return curr - buf;
366} 366}
367 367
368static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); 368static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL);
369static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); 369static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command);
370 370
371static struct sysdev_class rttest_sysclass = { 371static struct bus_type rttest_subsys = {
372 .name = "rttest", 372 .name = "rttest",
373 .dev_name = "rttest",
373}; 374};
374 375
375static int init_test_thread(int id) 376static int init_test_thread(int id)
376{ 377{
377 thread_data[id].sysdev.cls = &rttest_sysclass; 378 thread_data[id].dev.bus = &rttest_subsys;
378 thread_data[id].sysdev.id = id; 379 thread_data[id].dev.id = id;
379 380
380 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); 381 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
381 if (IS_ERR(threads[id])) 382 if (IS_ERR(threads[id]))
382 return PTR_ERR(threads[id]); 383 return PTR_ERR(threads[id]);
383 384
384 return sysdev_register(&thread_data[id].sysdev); 385 return device_register(&thread_data[id].dev);
385} 386}
386 387
387static int init_rttest(void) 388static int init_rttest(void)
@@ -393,7 +394,7 @@ static int init_rttest(void)
393 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) 394 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
394 rt_mutex_init(&mutexes[i]); 395 rt_mutex_init(&mutexes[i]);
395 396
396 ret = sysdev_class_register(&rttest_sysclass); 397 ret = subsys_system_register(&rttest_subsys, NULL);
397 if (ret) 398 if (ret)
398 return ret; 399 return ret;
399 400
@@ -401,10 +402,10 @@ static int init_rttest(void)
401 ret = init_test_thread(i); 402 ret = init_test_thread(i);
402 if (ret) 403 if (ret)
403 break; 404 break;
404 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); 405 ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
405 if (ret) 406 if (ret)
406 break; 407 break;
407 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); 408 ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
408 if (ret) 409 if (ret)
409 break; 410 break;
410 } 411 }
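
The rtmutex-tester.c conversion above replaces the removed sysdev interface (struct sys_device, sysdev_class, SYSDEV_ATTR) with regular driver-core objects: a struct bus_type registered through subsys_system_register(), DEVICE_ATTR() attributes, and device_register()/device_create_file() for each test thread. A condensed sketch of that registration pattern follows; it mirrors the 3.x driver-core calls used in the hunk, with error unwinding and module teardown elided, and it only builds inside a kernel tree.

/*
 * Minimal sketch of the driver-core pattern the hunk above converts to:
 * one device on a "system" subsystem with a sysfs attribute.  Names are
 * illustrative; cleanup and most error handling are elided.
 */
#include <linux/device.h>
#include <linux/module.h>

static struct bus_type demo_subsys = {
        .name           = "rttest_demo",
        .dev_name       = "rttest_demo",
};

static struct device demo_dev;

static ssize_t status_show(struct device *dev, struct device_attribute *attr,
                           char *buf)
{
        return sprintf(buf, "id=%d\n", dev->id);
}
static DEVICE_ATTR(status, 0400, status_show, NULL);

static int __init demo_init(void)
{
        int ret;

        ret = subsys_system_register(&demo_subsys, NULL);
        if (ret)
                return ret;

        demo_dev.bus = &demo_subsys;
        demo_dev.id = 0;
        ret = device_register(&demo_dev);       /* appears as "rttest_demo0" */
        if (ret)
                return ret;

        return device_create_file(&demo_dev, &dev_attr_status);
}
module_init(demo_init);
MODULE_LICENSE("GPL");
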
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 255e1662acdb..a242e691c993 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -11,7 +11,7 @@
11 * See Documentation/rt-mutex-design.txt for details. 11 * See Documentation/rt-mutex-design.txt for details.
12 */ 12 */
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/timer.h> 16#include <linux/timer.h>
17 17
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 9f48f3d82e9b..b152f74f02de 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -7,7 +7,7 @@
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/module.h> 10#include <linux/export.h>
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12 12
13#include <asm/system.h> 13#include <asm/system.h>
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
new file mode 100644
index 000000000000..9a7dd35102a3
--- /dev/null
+++ b/kernel/sched/Makefile
@@ -0,0 +1,20 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_clock.o = -pg
3endif
4
5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
6# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
7# needed for x86 only. Why this used to be enabled for all architectures is beyond
8# me. I suspect most platforms don't need this, but until we know that for sure
9# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
10# to get a correct value for the wait-channel (WCHAN in ps). --davidm
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif
13
14obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
20
diff --git a/kernel/sched_autogroup.c b/kernel/sched/auto_group.c
index 429242f3c484..e8a1f83ee0e7 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched/auto_group.c
@@ -1,15 +1,19 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include "sched.h"
4
3#include <linux/proc_fs.h> 5#include <linux/proc_fs.h>
4#include <linux/seq_file.h> 6#include <linux/seq_file.h>
5#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
6#include <linux/utsname.h> 8#include <linux/utsname.h>
9#include <linux/security.h>
10#include <linux/export.h>
7 11
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 12unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default; 13static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr; 14static atomic_t autogroup_seq_nr;
11 15
12static void __init autogroup_init(struct task_struct *init_task) 16void __init autogroup_init(struct task_struct *init_task)
13{ 17{
14 autogroup_default.tg = &root_task_group; 18 autogroup_default.tg = &root_task_group;
15 kref_init(&autogroup_default.kref); 19 kref_init(&autogroup_default.kref);
@@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task)
17 init_task->signal->autogroup = &autogroup_default; 21 init_task->signal->autogroup = &autogroup_default;
18} 22}
19 23
20static inline void autogroup_free(struct task_group *tg) 24void autogroup_free(struct task_group *tg)
21{ 25{
22 kfree(tg->autogroup); 26 kfree(tg->autogroup);
23} 27}
@@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
59 return ag; 63 return ag;
60} 64}
61 65
62#ifdef CONFIG_RT_GROUP_SCHED
63static void free_rt_sched_group(struct task_group *tg);
64#endif
65
66static inline struct autogroup *autogroup_create(void) 66static inline struct autogroup *autogroup_create(void)
67{ 67{
68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); 68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -108,8 +108,7 @@ out_fail:
108 return autogroup_kref_get(&autogroup_default); 108 return autogroup_kref_get(&autogroup_default);
109} 109}
110 110
111static inline bool 111bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
112task_wants_autogroup(struct task_struct *p, struct task_group *tg)
113{ 112{
114 if (tg != &root_task_group) 113 if (tg != &root_task_group)
115 return false; 114 return false;
@@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
127 return true; 126 return true;
128} 127}
129 128
130static inline bool task_group_is_autogroup(struct task_group *tg)
131{
132 return !!tg->autogroup;
133}
134
135static inline struct task_group *
136autogroup_task_group(struct task_struct *p, struct task_group *tg)
137{
138 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
139
140 if (enabled && task_wants_autogroup(p, tg))
141 return p->signal->autogroup->tg;
142
143 return tg;
144}
145
146static void 129static void
147autogroup_move_group(struct task_struct *p, struct autogroup *ag) 130autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148{ 131{
@@ -263,7 +246,7 @@ out:
263#endif /* CONFIG_PROC_FS */ 246#endif /* CONFIG_PROC_FS */
264 247
265#ifdef CONFIG_SCHED_DEBUG 248#ifdef CONFIG_SCHED_DEBUG
266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 249int autogroup_path(struct task_group *tg, char *buf, int buflen)
267{ 250{
268 if (!task_group_is_autogroup(tg)) 251 if (!task_group_is_autogroup(tg))
269 return 0; 252 return 0;
diff --git a/kernel/sched_autogroup.h b/kernel/sched/auto_group.h
index c2f0e7248dca..8bd047142816 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched/auto_group.h
@@ -1,5 +1,8 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include <linux/kref.h>
4#include <linux/rwsem.h>
5
3struct autogroup { 6struct autogroup {
4 /* 7 /*
5 * reference doesn't mean how many thread attach to this 8 * reference doesn't mean how many thread attach to this
@@ -13,9 +16,28 @@ struct autogroup {
13 int nice; 16 int nice;
14}; 17};
15 18
16static inline bool task_group_is_autogroup(struct task_group *tg); 19extern void autogroup_init(struct task_struct *init_task);
20extern void autogroup_free(struct task_group *tg);
21
22static inline bool task_group_is_autogroup(struct task_group *tg)
23{
24 return !!tg->autogroup;
25}
26
27extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
28
17static inline struct task_group * 29static inline struct task_group *
18autogroup_task_group(struct task_struct *p, struct task_group *tg); 30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
33
34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg;
36
37 return tg;
38}
39
40extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
19 41
20#else /* !CONFIG_SCHED_AUTOGROUP */ 42#else /* !CONFIG_SCHED_AUTOGROUP */
21 43
diff --git a/kernel/sched_clock.c b/kernel/sched/clock.c
index 9d8af0b3fb64..c685e31492df 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched/clock.c
@@ -62,7 +62,7 @@
62 */ 62 */
63#include <linux/spinlock.h> 63#include <linux/spinlock.h>
64#include <linux/hardirq.h> 64#include <linux/hardirq.h>
65#include <linux/module.h> 65#include <linux/export.h>
66#include <linux/percpu.h> 66#include <linux/percpu.h>
67#include <linux/ktime.h> 67#include <linux/ktime.h>
68#include <linux/sched.h> 68#include <linux/sched.h>
diff --git a/kernel/sched.c b/kernel/sched/core.c
index 5670028a9c16..df00cb09263e 100644
--- a/kernel/sched.c
+++ b/kernel/sched/core.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched.c 2 * kernel/sched/core.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
@@ -56,7 +56,6 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/proc_fs.h> 57#include <linux/proc_fs.h>
58#include <linux/seq_file.h> 58#include <linux/seq_file.h>
59#include <linux/stop_machine.h>
60#include <linux/sysctl.h> 59#include <linux/sysctl.h>
61#include <linux/syscalls.h> 60#include <linux/syscalls.h>
62#include <linux/times.h> 61#include <linux/times.h>
@@ -71,593 +70,46 @@
71#include <linux/ctype.h> 70#include <linux/ctype.h>
72#include <linux/ftrace.h> 71#include <linux/ftrace.h>
73#include <linux/slab.h> 72#include <linux/slab.h>
73#include <linux/init_task.h>
74 74
75#include <asm/tlb.h> 75#include <asm/tlb.h>
76#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
77#include <asm/mutex.h>
78#ifdef CONFIG_PARAVIRT 77#ifdef CONFIG_PARAVIRT
79#include <asm/paravirt.h> 78#include <asm/paravirt.h>
80#endif 79#endif
81 80
82#include "sched_cpupri.h" 81#include "sched.h"
83#include "workqueue_sched.h" 82#include "../workqueue_sched.h"
84#include "sched_autogroup.h"
85 83
86#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
87#include <trace/events/sched.h> 85#include <trace/events/sched.h>
88 86
89/* 87void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
90 * Convert user-nice values [ -20 ... 0 ... 19 ]
91 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
92 * and back.
93 */
94#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
95#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
96#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
97
98/*
99 * 'User priority' is the nice value converted to something we
100 * can work with better when scaling various scheduler parameters,
101 * it's a [ 0 ... 39 ] range.
102 */
103#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
104#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
105#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
106
107/*
108 * Helpers for converting nanosecond timing to jiffy resolution
109 */
110#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
111
112#define NICE_0_LOAD SCHED_LOAD_SCALE
113#define NICE_0_SHIFT SCHED_LOAD_SHIFT
114
115/*
116 * These are the 'tuning knobs' of the scheduler:
117 *
118 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
119 * Timeslices get refilled after they expire.
120 */
121#define DEF_TIMESLICE (100 * HZ / 1000)
122
123/*
124 * single value that denotes runtime == period, ie unlimited time.
125 */
126#define RUNTIME_INF ((u64)~0ULL)
127
128static inline int rt_policy(int policy)
129{
130 if (policy == SCHED_FIFO || policy == SCHED_RR)
131 return 1;
132 return 0;
133}
134
135static inline int task_has_rt_policy(struct task_struct *p)
136{
137 return rt_policy(p->policy);
138}
139
140/*
141 * This is the priority-queue data structure of the RT scheduling class:
142 */
143struct rt_prio_array {
144 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
145 struct list_head queue[MAX_RT_PRIO];
146};
147
148struct rt_bandwidth {
149 /* nests inside the rq lock: */
150 raw_spinlock_t rt_runtime_lock;
151 ktime_t rt_period;
152 u64 rt_runtime;
153 struct hrtimer rt_period_timer;
154};
155
156static struct rt_bandwidth def_rt_bandwidth;
157
158static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
159
160static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
161{
162 struct rt_bandwidth *rt_b =
163 container_of(timer, struct rt_bandwidth, rt_period_timer);
164 ktime_t now;
165 int overrun;
166 int idle = 0;
167
168 for (;;) {
169 now = hrtimer_cb_get_time(timer);
170 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
171
172 if (!overrun)
173 break;
174
175 idle = do_sched_rt_period_timer(rt_b, overrun);
176 }
177
178 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
179}
180
181static
182void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
183{
184 rt_b->rt_period = ns_to_ktime(period);
185 rt_b->rt_runtime = runtime;
186
187 raw_spin_lock_init(&rt_b->rt_runtime_lock);
188
189 hrtimer_init(&rt_b->rt_period_timer,
190 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
191 rt_b->rt_period_timer.function = sched_rt_period_timer;
192}
193
194static inline int rt_bandwidth_enabled(void)
195{ 88{
196 return sysctl_sched_rt_runtime >= 0; 89 unsigned long delta;
197} 90 ktime_t soft, hard, now;
198
199static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
200{
201 ktime_t now;
202
203 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
204 return;
205 91
206 if (hrtimer_active(&rt_b->rt_period_timer))
207 return;
208
209 raw_spin_lock(&rt_b->rt_runtime_lock);
210 for (;;) { 92 for (;;) {
211 unsigned long delta; 93 if (hrtimer_active(period_timer))
212 ktime_t soft, hard;
213
214 if (hrtimer_active(&rt_b->rt_period_timer))
215 break; 94 break;
216 95
217 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 96 now = hrtimer_cb_get_time(period_timer);
218 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 97 hrtimer_forward(period_timer, now, period);
219 98
220 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); 99 soft = hrtimer_get_softexpires(period_timer);
221 hard = hrtimer_get_expires(&rt_b->rt_period_timer); 100 hard = hrtimer_get_expires(period_timer);
222 delta = ktime_to_ns(ktime_sub(hard, soft)); 101 delta = ktime_to_ns(ktime_sub(hard, soft));
223 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 102 __hrtimer_start_range_ns(period_timer, soft, delta,
224 HRTIMER_MODE_ABS_PINNED, 0); 103 HRTIMER_MODE_ABS_PINNED, 0);
225 } 104 }
226 raw_spin_unlock(&rt_b->rt_runtime_lock);
227} 105}
228 106
229#ifdef CONFIG_RT_GROUP_SCHED 107DEFINE_MUTEX(sched_domains_mutex);
230static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) 108DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
231{
232 hrtimer_cancel(&rt_b->rt_period_timer);
233}
234#endif
235
236/*
237 * sched_domains_mutex serializes calls to init_sched_domains,
238 * detach_destroy_domains and partition_sched_domains.
239 */
240static DEFINE_MUTEX(sched_domains_mutex);
241
242#ifdef CONFIG_CGROUP_SCHED
243
244#include <linux/cgroup.h>
245
246struct cfs_rq;
247
248static LIST_HEAD(task_groups);
249
250/* task group related information */
251struct task_group {
252 struct cgroup_subsys_state css;
253
254#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */
256 struct sched_entity **se;
257 /* runqueue "owned" by this group on each cpu */
258 struct cfs_rq **cfs_rq;
259 unsigned long shares;
260
261 atomic_t load_weight;
262#endif
263
264#ifdef CONFIG_RT_GROUP_SCHED
265 struct sched_rt_entity **rt_se;
266 struct rt_rq **rt_rq;
267
268 struct rt_bandwidth rt_bandwidth;
269#endif
270
271 struct rcu_head rcu;
272 struct list_head list;
273
274 struct task_group *parent;
275 struct list_head siblings;
276 struct list_head children;
277
278#ifdef CONFIG_SCHED_AUTOGROUP
279 struct autogroup *autogroup;
280#endif
281};
282
283/* task_group_lock serializes the addition/removal of task groups */
284static DEFINE_SPINLOCK(task_group_lock);
285
286#ifdef CONFIG_FAIR_GROUP_SCHED
287
288# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
289
290/*
291 * A weight of 0 or 1 can cause arithmetics problems.
292 * A weight of a cfs_rq is the sum of weights of which entities
293 * are queued on this cfs_rq, so a weight of a entity should not be
294 * too large, so as the shares value of a task group.
295 * (The default weight is 1024 - so there's no practical
296 * limitation from this.)
297 */
298#define MIN_SHARES (1UL << 1)
299#define MAX_SHARES (1UL << 18)
300
301static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
302#endif
303
304/* Default task group.
305 * Every task in system belong to this group at bootup.
306 */
307struct task_group root_task_group;
308
309#endif /* CONFIG_CGROUP_SCHED */
310
311/* CFS-related fields in a runqueue */
312struct cfs_rq {
313 struct load_weight load;
314 unsigned long nr_running;
315
316 u64 exec_clock;
317 u64 min_vruntime;
318#ifndef CONFIG_64BIT
319 u64 min_vruntime_copy;
320#endif
321
322 struct rb_root tasks_timeline;
323 struct rb_node *rb_leftmost;
324
325 struct list_head tasks;
326 struct list_head *balance_iterator;
327
328 /*
329 * 'curr' points to currently running entity on this cfs_rq.
330 * It is set to NULL otherwise (i.e when none are currently running).
331 */
332 struct sched_entity *curr, *next, *last, *skip;
333
334#ifdef CONFIG_SCHED_DEBUG
335 unsigned int nr_spread_over;
336#endif
337
338#ifdef CONFIG_FAIR_GROUP_SCHED
339 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
340
341 /*
342 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
343 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
344 * (like users, containers etc.)
345 *
346 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
347 * list is used during load balance.
348 */
349 int on_list;
350 struct list_head leaf_cfs_rq_list;
351 struct task_group *tg; /* group that "owns" this runqueue */
352
353#ifdef CONFIG_SMP
354 /*
355 * the part of load.weight contributed by tasks
356 */
357 unsigned long task_weight;
358
359 /*
360 * h_load = weight * f(tg)
361 *
362 * Where f(tg) is the recursive weight fraction assigned to
363 * this group.
364 */
365 unsigned long h_load;
366
367 /*
368 * Maintaining per-cpu shares distribution for group scheduling
369 *
370 * load_stamp is the last time we updated the load average
371 * load_last is the last time we updated the load average and saw load
372 * load_unacc_exec_time is currently unaccounted execution time
373 */
374 u64 load_avg;
375 u64 load_period;
376 u64 load_stamp, load_last, load_unacc_exec_time;
377
378 unsigned long load_contribution;
379#endif
380#endif
381};
382
383/* Real-Time classes' related field in a runqueue: */
384struct rt_rq {
385 struct rt_prio_array active;
386 unsigned long rt_nr_running;
387#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
388 struct {
389 int curr; /* highest queued rt task prio */
390#ifdef CONFIG_SMP
391 int next; /* next highest */
392#endif
393 } highest_prio;
394#endif
395#ifdef CONFIG_SMP
396 unsigned long rt_nr_migratory;
397 unsigned long rt_nr_total;
398 int overloaded;
399 struct plist_head pushable_tasks;
400#endif
401 int rt_throttled;
402 u64 rt_time;
403 u64 rt_runtime;
404 /* Nests inside the rq lock: */
405 raw_spinlock_t rt_runtime_lock;
406
407#ifdef CONFIG_RT_GROUP_SCHED
408 unsigned long rt_nr_boosted;
409
410 struct rq *rq;
411 struct list_head leaf_rt_rq_list;
412 struct task_group *tg;
413#endif
414};
415
416#ifdef CONFIG_SMP
417
418/*
419 * We add the notion of a root-domain which will be used to define per-domain
420 * variables. Each exclusive cpuset essentially defines an island domain by
421 * fully partitioning the member cpus from any other cpuset. Whenever a new
422 * exclusive cpuset is created, we also create and attach a new root-domain
423 * object.
424 *
425 */
426struct root_domain {
427 atomic_t refcount;
428 atomic_t rto_count;
429 struct rcu_head rcu;
430 cpumask_var_t span;
431 cpumask_var_t online;
432
433 /*
434 * The "RT overload" flag: it gets set if a CPU has more than
435 * one runnable RT task.
436 */
437 cpumask_var_t rto_mask;
438 struct cpupri cpupri;
439};
440
441/*
442 * By default the system creates a single root-domain with all cpus as
443 * members (mimicking the global state we have today).
444 */
445static struct root_domain def_root_domain;
446
447#endif /* CONFIG_SMP */
448
449/*
450 * This is the main, per-CPU runqueue data structure.
451 *
452 * Locking rule: those places that want to lock multiple runqueues
453 * (such as the load balancing or the thread migration code), lock
454 * acquire operations must be ordered by ascending &runqueue.
455 */
456struct rq {
457 /* runqueue lock: */
458 raw_spinlock_t lock;
459
460 /*
461 * nr_running and cpu_load should be in the same cacheline because
462 * remote CPUs use both these fields when doing load calculation.
463 */
464 unsigned long nr_running;
465 #define CPU_LOAD_IDX_MAX 5
466 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
467 unsigned long last_load_update_tick;
468#ifdef CONFIG_NO_HZ
469 u64 nohz_stamp;
470 unsigned char nohz_balance_kick;
471#endif
472 int skip_clock_update;
473
474 /* capture load from *all* tasks on this cpu: */
475 struct load_weight load;
476 unsigned long nr_load_updates;
477 u64 nr_switches;
478
479 struct cfs_rq cfs;
480 struct rt_rq rt;
481
482#ifdef CONFIG_FAIR_GROUP_SCHED
483 /* list of leaf cfs_rq on this cpu: */
484 struct list_head leaf_cfs_rq_list;
485#endif
486#ifdef CONFIG_RT_GROUP_SCHED
487 struct list_head leaf_rt_rq_list;
488#endif
489
490 /*
491 * This is part of a global counter where only the total sum
492 * over all CPUs matters. A task can increase this counter on
493 * one CPU and if it got migrated afterwards it may decrease
494 * it on another CPU. Always updated under the runqueue lock:
495 */
496 unsigned long nr_uninterruptible;
497
498 struct task_struct *curr, *idle, *stop;
499 unsigned long next_balance;
500 struct mm_struct *prev_mm;
501
502 u64 clock;
503 u64 clock_task;
504
505 atomic_t nr_iowait;
506
507#ifdef CONFIG_SMP
508 struct root_domain *rd;
509 struct sched_domain *sd;
510
511 unsigned long cpu_power;
512
513 unsigned char idle_at_tick;
514 /* For active balancing */
515 int post_schedule;
516 int active_balance;
517 int push_cpu;
518 struct cpu_stop_work active_balance_work;
519 /* cpu of this runqueue: */
520 int cpu;
521 int online;
522
523 unsigned long avg_load_per_task;
524
525 u64 rt_avg;
526 u64 age_stamp;
527 u64 idle_stamp;
528 u64 avg_idle;
529#endif
530
531#ifdef CONFIG_IRQ_TIME_ACCOUNTING
532 u64 prev_irq_time;
533#endif
534#ifdef CONFIG_PARAVIRT
535 u64 prev_steal_time;
536#endif
537#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
538 u64 prev_steal_time_rq;
539#endif
540
541 /* calc_load related fields */
542 unsigned long calc_load_update;
543 long calc_load_active;
544
545#ifdef CONFIG_SCHED_HRTICK
546#ifdef CONFIG_SMP
547 int hrtick_csd_pending;
548 struct call_single_data hrtick_csd;
549#endif
550 struct hrtimer hrtick_timer;
551#endif
552
553#ifdef CONFIG_SCHEDSTATS
554 /* latency stats */
555 struct sched_info rq_sched_info;
556 unsigned long long rq_cpu_time;
557 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
558
559 /* sys_sched_yield() stats */
560 unsigned int yld_count;
561
562 /* schedule() stats */
563 unsigned int sched_switch;
564 unsigned int sched_count;
565 unsigned int sched_goidle;
566
567 /* try_to_wake_up() stats */
568 unsigned int ttwu_count;
569 unsigned int ttwu_local;
570#endif
571
572#ifdef CONFIG_SMP
573 struct task_struct *wake_list;
574#endif
575};
576
577static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
578
579
580static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
581
582static inline int cpu_of(struct rq *rq)
583{
584#ifdef CONFIG_SMP
585 return rq->cpu;
586#else
587 return 0;
588#endif
589}
590
591#define rcu_dereference_check_sched_domain(p) \
592 rcu_dereference_check((p), \
593 lockdep_is_held(&sched_domains_mutex))
594
595/*
596 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
597 * See detach_destroy_domains: synchronize_sched for details.
598 *
599 * The domain tree of any CPU may only be accessed from within
600 * preempt-disabled sections.
601 */
602#define for_each_domain(cpu, __sd) \
603 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
604
605#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
606#define this_rq() (&__get_cpu_var(runqueues))
607#define task_rq(p) cpu_rq(task_cpu(p))
608#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
609#define raw_rq() (&__raw_get_cpu_var(runqueues))
610
611#ifdef CONFIG_CGROUP_SCHED
612
613/*
614 * Return the group to which this tasks belongs.
615 *
616 * We use task_subsys_state_check() and extend the RCU verification with
617 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
618 * task it moves into the cgroup. Therefore by holding either of those locks,
619 * we pin the task to the current cgroup.
620 */
621static inline struct task_group *task_group(struct task_struct *p)
622{
623 struct task_group *tg;
624 struct cgroup_subsys_state *css;
625
626 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
627 lockdep_is_held(&p->pi_lock) ||
628 lockdep_is_held(&task_rq(p)->lock));
629 tg = container_of(css, struct task_group, css);
630
631 return autogroup_task_group(p, tg);
632}
633
634/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
635static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
636{
637#ifdef CONFIG_FAIR_GROUP_SCHED
638 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
639 p->se.parent = task_group(p)->se[cpu];
640#endif
641
642#ifdef CONFIG_RT_GROUP_SCHED
643 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
644 p->rt.parent = task_group(p)->rt_se[cpu];
645#endif
646}
647
648#else /* CONFIG_CGROUP_SCHED */
649
650static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
651static inline struct task_group *task_group(struct task_struct *p)
652{
653 return NULL;
654}
655
656#endif /* CONFIG_CGROUP_SCHED */
657 109
658static void update_rq_clock_task(struct rq *rq, s64 delta); 110static void update_rq_clock_task(struct rq *rq, s64 delta);
659 111
660static void update_rq_clock(struct rq *rq) 112void update_rq_clock(struct rq *rq)
661{ 113{
662 s64 delta; 114 s64 delta;
663 115
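
In this first kernel/sched.c to kernel/sched/core.c hunk, the large block of removed lines (nice/priority macros, rt_bandwidth, task_group, cfs_rq, rt_rq, struct rq and the context-switch helpers) is not deleted outright: with the split into kernel/sched/, those definitions are evidently moved into the shared "sched.h" header that core.c now includes, which is also why formerly static symbols such as update_rq_clock(), resched_task(), sched_domains_mutex and runqueues lose their static qualifier in this diff. The one piece of genuinely new code, start_bandwidth_timer(), generalises the old start_rt_bandwidth() loop: it repeatedly forwards the period timer past the current time and re-arms it over the soft/hard expiry range. A self-contained sketch of that forwarding arithmetic, using plain nanosecond counters instead of ktime_t and hrtimers:

/*
 * Userspace sketch of what the hrtimer_forward() step inside
 * start_bandwidth_timer() accomplishes: push a periodic timer's expiry
 * forward past "now" by a whole number of periods and report how many
 * periods were skipped.  Times are bare nanosecond counters here.
 */
#include <stdio.h>

static unsigned long timer_forward(unsigned long long *expires,
                                   unsigned long long now,
                                   unsigned long long period)
{
        unsigned long overrun;

        if (now < *expires)
                return 0;       /* expiry still in the future: nothing to do */
        overrun = (now - *expires) / period + 1;
        *expires += (unsigned long long)overrun * period;
        return overrun;
}

int main(void)
{
        unsigned long long expires = 1000, period = 250, now = 1620;

        printf("overrun=%lu, new expiry=%llu\n",
               timer_forward(&expires, now, period), expires);
        /* expect overrun=3 and expiry 1750, the first instant after now */
        return 0;
}
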
@@ -670,44 +122,14 @@ static void update_rq_clock(struct rq *rq)
670} 122}
671 123
672/* 124/*
673 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
674 */
675#ifdef CONFIG_SCHED_DEBUG
676# define const_debug __read_mostly
677#else
678# define const_debug static const
679#endif
680
681/**
682 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
683 * @cpu: the processor in question.
684 *
685 * This interface allows printk to be called with the runqueue lock
686 * held and know whether or not it is OK to wake up the klogd.
687 */
688int runqueue_is_locked(int cpu)
689{
690 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
691}
692
693/*
694 * Debugging: various feature bits 125 * Debugging: various feature bits
695 */ 126 */
696 127
697#define SCHED_FEAT(name, enabled) \ 128#define SCHED_FEAT(name, enabled) \
698 __SCHED_FEAT_##name ,
699
700enum {
701#include "sched_features.h"
702};
703
704#undef SCHED_FEAT
705
706#define SCHED_FEAT(name, enabled) \
707 (1UL << __SCHED_FEAT_##name) * enabled | 129 (1UL << __SCHED_FEAT_##name) * enabled |
708 130
709const_debug unsigned int sysctl_sched_features = 131const_debug unsigned int sysctl_sched_features =
710#include "sched_features.h" 132#include "features.h"
711 0; 133 0;
712 134
713#undef SCHED_FEAT 135#undef SCHED_FEAT
@@ -717,7 +139,7 @@ const_debug unsigned int sysctl_sched_features =
717 #name , 139 #name ,
718 140
719static __read_mostly char *sched_feat_names[] = { 141static __read_mostly char *sched_feat_names[] = {
720#include "sched_features.h" 142#include "features.h"
721 NULL 143 NULL
722}; 144};
723 145
@@ -727,7 +149,7 @@ static int sched_feat_show(struct seq_file *m, void *v)
727{ 149{
728 int i; 150 int i;
729 151
730 for (i = 0; sched_feat_names[i]; i++) { 152 for (i = 0; i < __SCHED_FEAT_NR; i++) {
731 if (!(sysctl_sched_features & (1UL << i))) 153 if (!(sysctl_sched_features & (1UL << i)))
732 seq_puts(m, "NO_"); 154 seq_puts(m, "NO_");
733 seq_printf(m, "%s ", sched_feat_names[i]); 155 seq_printf(m, "%s ", sched_feat_names[i]);
@@ -737,6 +159,36 @@ static int sched_feat_show(struct seq_file *m, void *v)
737 return 0; 159 return 0;
738} 160}
739 161
162#ifdef HAVE_JUMP_LABEL
163
164#define jump_label_key__true jump_label_key_enabled
165#define jump_label_key__false jump_label_key_disabled
166
167#define SCHED_FEAT(name, enabled) \
168 jump_label_key__##enabled ,
169
170struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
171#include "features.h"
172};
173
174#undef SCHED_FEAT
175
176static void sched_feat_disable(int i)
177{
178 if (jump_label_enabled(&sched_feat_keys[i]))
179 jump_label_dec(&sched_feat_keys[i]);
180}
181
182static void sched_feat_enable(int i)
183{
184 if (!jump_label_enabled(&sched_feat_keys[i]))
185 jump_label_inc(&sched_feat_keys[i]);
186}
187#else
188static void sched_feat_disable(int i) { };
189static void sched_feat_enable(int i) { };
190#endif /* HAVE_JUMP_LABEL */
191
740static ssize_t 192static ssize_t
741sched_feat_write(struct file *filp, const char __user *ubuf, 193sched_feat_write(struct file *filp, const char __user *ubuf,
742 size_t cnt, loff_t *ppos) 194 size_t cnt, loff_t *ppos)
@@ -760,17 +212,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
760 cmp += 3; 212 cmp += 3;
761 } 213 }
762 214
763 for (i = 0; sched_feat_names[i]; i++) { 215 for (i = 0; i < __SCHED_FEAT_NR; i++) {
764 if (strcmp(cmp, sched_feat_names[i]) == 0) { 216 if (strcmp(cmp, sched_feat_names[i]) == 0) {
765 if (neg) 217 if (neg) {
766 sysctl_sched_features &= ~(1UL << i); 218 sysctl_sched_features &= ~(1UL << i);
767 else 219 sched_feat_disable(i);
220 } else {
768 sysctl_sched_features |= (1UL << i); 221 sysctl_sched_features |= (1UL << i);
222 sched_feat_enable(i);
223 }
769 break; 224 break;
770 } 225 }
771 } 226 }
772 227
773 if (!sched_feat_names[i]) 228 if (i == __SCHED_FEAT_NR)
774 return -EINVAL; 229 return -EINVAL;
775 230
776 *ppos += cnt; 231 *ppos += cnt;
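
sched_feat_write() above now iterates over the fixed __SCHED_FEAT_NR range and flips the corresponding jump-label key, rather than walking the NULL-terminated name array, but the user-visible syntax is unchanged: writing a feature name sets its bit and a "NO_" prefix clears it. A self-contained sketch of that parse-and-toggle step (the feature list and the default mask here are made up for illustration):

/*
 * Userspace sketch of the name matching and bit toggling performed by
 * sched_feat_write() in the hunk above.  Feature names and defaults are
 * illustrative only.
 */
#include <stdio.h>
#include <string.h>

static const char *feat_names[] = {
        "GENTLE_FAIR_SLEEPERS", "START_DEBIT", "HRTICK", NULL
};
static unsigned long features = 0x3;    /* first two enabled by default */

static int feat_write(const char *cmp)
{
        int neg = 0;

        if (strncmp(cmp, "NO_", 3) == 0) {
                neg = 1;
                cmp += 3;
        }
        for (int i = 0; feat_names[i]; i++) {
                if (strcmp(cmp, feat_names[i]) == 0) {
                        if (neg)
                                features &= ~(1UL << i);
                        else
                                features |= 1UL << i;
                        return 0;
                }
        }
        return -1;      /* the kernel returns -EINVAL here */
}

int main(void)
{
        feat_write("NO_START_DEBIT");
        feat_write("HRTICK");
        printf("features=%#lx\n", features);    /* expect 0x5 */
        return 0;
}
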
@@ -799,10 +254,7 @@ static __init int sched_init_debug(void)
799 return 0; 254 return 0;
800} 255}
801late_initcall(sched_init_debug); 256late_initcall(sched_init_debug);
802 257#endif /* CONFIG_SCHED_DEBUG */
803#endif
804
805#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
806 258
807/* 259/*
808 * Number of tasks to iterate in a single balance run. 260 * Number of tasks to iterate in a single balance run.
@@ -824,7 +276,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
824 */ 276 */
825unsigned int sysctl_sched_rt_period = 1000000; 277unsigned int sysctl_sched_rt_period = 1000000;
826 278
827static __read_mostly int scheduler_running; 279__read_mostly int scheduler_running;
828 280
829/* 281/*
830 * part of the period that we allow rt tasks to run in us. 282 * part of the period that we allow rt tasks to run in us.
@@ -832,112 +284,7 @@ static __read_mostly int scheduler_running;
832 */ 284 */
833int sysctl_sched_rt_runtime = 950000; 285int sysctl_sched_rt_runtime = 950000;
834 286
835static inline u64 global_rt_period(void)
836{
837 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
838}
839
840static inline u64 global_rt_runtime(void)
841{
842 if (sysctl_sched_rt_runtime < 0)
843 return RUNTIME_INF;
844
845 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
846}
847
848#ifndef prepare_arch_switch
849# define prepare_arch_switch(next) do { } while (0)
850#endif
851#ifndef finish_arch_switch
852# define finish_arch_switch(prev) do { } while (0)
853#endif
854
855static inline int task_current(struct rq *rq, struct task_struct *p)
856{
857 return rq->curr == p;
858}
859
860static inline int task_running(struct rq *rq, struct task_struct *p)
861{
862#ifdef CONFIG_SMP
863 return p->on_cpu;
864#else
865 return task_current(rq, p);
866#endif
867}
868
869#ifndef __ARCH_WANT_UNLOCKED_CTXSW
870static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
871{
872#ifdef CONFIG_SMP
873 /*
874 * We can optimise this out completely for !SMP, because the
875 * SMP rebalancing from interrupt is the only thing that cares
876 * here.
877 */
878 next->on_cpu = 1;
879#endif
880}
881 287
882static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
883{
884#ifdef CONFIG_SMP
885 /*
886 * After ->on_cpu is cleared, the task can be moved to a different CPU.
887 * We must ensure this doesn't happen until the switch is completely
888 * finished.
889 */
890 smp_wmb();
891 prev->on_cpu = 0;
892#endif
893#ifdef CONFIG_DEBUG_SPINLOCK
894 /* this is a valid case when another task releases the spinlock */
895 rq->lock.owner = current;
896#endif
897 /*
898 * If we are tracking spinlock dependencies then we have to
899 * fix up the runqueue lock - which gets 'carried over' from
900 * prev into current:
901 */
902 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
903
904 raw_spin_unlock_irq(&rq->lock);
905}
906
907#else /* __ARCH_WANT_UNLOCKED_CTXSW */
908static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
909{
910#ifdef CONFIG_SMP
911 /*
912 * We can optimise this out completely for !SMP, because the
913 * SMP rebalancing from interrupt is the only thing that cares
914 * here.
915 */
916 next->on_cpu = 1;
917#endif
918#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
919 raw_spin_unlock_irq(&rq->lock);
920#else
921 raw_spin_unlock(&rq->lock);
922#endif
923}
924
925static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
926{
927#ifdef CONFIG_SMP
928 /*
929 * After ->on_cpu is cleared, the task can be moved to a different CPU.
930 * We must ensure this doesn't happen until the switch is completely
931 * finished.
932 */
933 smp_wmb();
934 prev->on_cpu = 0;
935#endif
936#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
937 local_irq_enable();
938#endif
939}
940#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
941 288
942/* 289/*
943 * __task_rq_lock - lock the rq @p resides on. 290 * __task_rq_lock - lock the rq @p resides on.
@@ -1020,20 +367,6 @@ static struct rq *this_rq_lock(void)
1020 * rq->lock. 367 * rq->lock.
1021 */ 368 */
1022 369
1023/*
1024 * Use hrtick when:
1025 * - enabled by features
1026 * - hrtimer is actually high res
1027 */
1028static inline int hrtick_enabled(struct rq *rq)
1029{
1030 if (!sched_feat(HRTICK))
1031 return 0;
1032 if (!cpu_active(cpu_of(rq)))
1033 return 0;
1034 return hrtimer_is_hres_active(&rq->hrtick_timer);
1035}
1036
1037static void hrtick_clear(struct rq *rq) 370static void hrtick_clear(struct rq *rq)
1038{ 371{
1039 if (hrtimer_active(&rq->hrtick_timer)) 372 if (hrtimer_active(&rq->hrtick_timer))
@@ -1077,7 +410,7 @@ static void __hrtick_start(void *arg)
1077 * 410 *
1078 * called with rq->lock held and irqs disabled 411 * called with rq->lock held and irqs disabled
1079 */ 412 */
1080static void hrtick_start(struct rq *rq, u64 delay) 413void hrtick_start(struct rq *rq, u64 delay)
1081{ 414{
1082 struct hrtimer *timer = &rq->hrtick_timer; 415 struct hrtimer *timer = &rq->hrtick_timer;
1083 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 416 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
@@ -1121,7 +454,7 @@ static __init void init_hrtick(void)
1121 * 454 *
1122 * called with rq->lock held and irqs disabled 455 * called with rq->lock held and irqs disabled
1123 */ 456 */
1124static void hrtick_start(struct rq *rq, u64 delay) 457void hrtick_start(struct rq *rq, u64 delay)
1125{ 458{
1126 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 459 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1127 HRTIMER_MODE_REL_PINNED, 0); 460 HRTIMER_MODE_REL_PINNED, 0);
@@ -1172,7 +505,7 @@ static inline void init_hrtick(void)
1172#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 505#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1173#endif 506#endif
1174 507
1175static void resched_task(struct task_struct *p) 508void resched_task(struct task_struct *p)
1176{ 509{
1177 int cpu; 510 int cpu;
1178 511
@@ -1193,7 +526,7 @@ static void resched_task(struct task_struct *p)
1193 smp_send_reschedule(cpu); 526 smp_send_reschedule(cpu);
1194} 527}
1195 528
1196static void resched_cpu(int cpu) 529void resched_cpu(int cpu)
1197{ 530{
1198 struct rq *rq = cpu_rq(cpu); 531 struct rq *rq = cpu_rq(cpu);
1199 unsigned long flags; 532 unsigned long flags;
@@ -1272,14 +605,22 @@ void wake_up_idle_cpu(int cpu)
1272 smp_send_reschedule(cpu); 605 smp_send_reschedule(cpu);
1273} 606}
1274 607
1275#endif /* CONFIG_NO_HZ */ 608static inline bool got_nohz_idle_kick(void)
609{
610 int cpu = smp_processor_id();
611 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
612}
1276 613
1277static u64 sched_avg_period(void) 614#else /* CONFIG_NO_HZ */
615
616static inline bool got_nohz_idle_kick(void)
1278{ 617{
1279 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 618 return false;
1280} 619}
1281 620
1282static void sched_avg_update(struct rq *rq) 621#endif /* CONFIG_NO_HZ */
622
623void sched_avg_update(struct rq *rq)
1283{ 624{
1284 s64 period = sched_avg_period(); 625 s64 period = sched_avg_period();
1285 626
@@ -1295,200 +636,34 @@ static void sched_avg_update(struct rq *rq)
1295 } 636 }
1296} 637}
1297 638
1298static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1299{
1300 rq->rt_avg += rt_delta;
1301 sched_avg_update(rq);
1302}
1303
1304#else /* !CONFIG_SMP */ 639#else /* !CONFIG_SMP */
1305static void resched_task(struct task_struct *p) 640void resched_task(struct task_struct *p)
1306{ 641{
1307 assert_raw_spin_locked(&task_rq(p)->lock); 642 assert_raw_spin_locked(&task_rq(p)->lock);
1308 set_tsk_need_resched(p); 643 set_tsk_need_resched(p);
1309} 644}
1310
1311static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1312{
1313}
1314
1315static void sched_avg_update(struct rq *rq)
1316{
1317}
1318#endif /* CONFIG_SMP */ 645#endif /* CONFIG_SMP */
1319 646
1320#if BITS_PER_LONG == 32 647#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1321# define WMULT_CONST (~0UL) 648 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1322#else
1323# define WMULT_CONST (1UL << 32)
1324#endif
1325
1326#define WMULT_SHIFT 32
1327
1328/*
1329 * Shift right and round:
1330 */
1331#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1332
1333/*
1334 * delta *= weight / lw
1335 */
1336static unsigned long
1337calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1338 struct load_weight *lw)
1339{
1340 u64 tmp;
1341
1342 /*
1343 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1344 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1345 * 2^SCHED_LOAD_RESOLUTION.
1346 */
1347 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1348 tmp = (u64)delta_exec * scale_load_down(weight);
1349 else
1350 tmp = (u64)delta_exec;
1351
1352 if (!lw->inv_weight) {
1353 unsigned long w = scale_load_down(lw->weight);
1354
1355 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1356 lw->inv_weight = 1;
1357 else if (unlikely(!w))
1358 lw->inv_weight = WMULT_CONST;
1359 else
1360 lw->inv_weight = WMULT_CONST / w;
1361 }
1362
1363 /*
1364 * Check whether we'd overflow the 64-bit multiplication:
1365 */
1366 if (unlikely(tmp > WMULT_CONST))
1367 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1368 WMULT_SHIFT/2);
1369 else
1370 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1371
1372 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1373}
1374
1375static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1376{
1377 lw->weight += inc;
1378 lw->inv_weight = 0;
1379}
1380
1381static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1382{
1383 lw->weight -= dec;
1384 lw->inv_weight = 0;
1385}
1386
1387static inline void update_load_set(struct load_weight *lw, unsigned long w)
1388{
1389 lw->weight = w;
1390 lw->inv_weight = 0;
1391}
1392
1393/* 649/*
1394 * To aid in avoiding the subversion of "niceness" due to uneven distribution 650 * Iterate task_group tree rooted at *from, calling @down when first entering a
1395 * of tasks with abnormal "nice" values across CPUs the contribution that 651 * node and @up when leaving it for the final time.
1396 * each task makes to its run queue's load is weighted according to its
1397 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1398 * scaled version of the new time slice allocation that they receive on time
1399 * slice expiry etc.
1400 */
1401
1402#define WEIGHT_IDLEPRIO 3
1403#define WMULT_IDLEPRIO 1431655765
1404
1405/*
1406 * Nice levels are multiplicative, with a gentle 10% change for every
1407 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1408 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1409 * that remained on nice 0.
1410 * 652 *
1411 * The "10% effect" is relative and cumulative: from _any_ nice level, 653 * Caller must hold rcu_lock or sufficient equivalent.
1412 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1413 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1414 * If a task goes up by ~10% and another task goes down by ~10% then
1415 * the relative distance between them is ~25%.)
1416 */
1417static const int prio_to_weight[40] = {
1418 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1419 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1420 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1421 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1422 /* 0 */ 1024, 820, 655, 526, 423,
1423 /* 5 */ 335, 272, 215, 172, 137,
1424 /* 10 */ 110, 87, 70, 56, 45,
1425 /* 15 */ 36, 29, 23, 18, 15,
1426};
1427
1428/*
1429 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1430 *
1431 * In cases where the weight does not change often, we can use the
1432 * precalculated inverse to speed up arithmetics by turning divisions
1433 * into multiplications:
1434 */
1435static const u32 prio_to_wmult[40] = {
1436 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1437 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1438 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1439 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1440 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1441 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1442 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1443 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1444};
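
The weight tables being removed here encode the "each nice level is worth about 10% CPU" rule, and calc_delta_mine() above caches a 2^32/weight inverse so the hot path multiplies and rounds (the SRR() macro) instead of dividing. A minimal user-space sketch of that arithmetic, using constants copied from the tables; the 6 ms period is only an assumed example, not a kernel value:

    /* hedged user-space sketch, not kernel code */
    #include <stdio.h>
    #include <stdint.h>

    #define WMULT_SHIFT 32
    #define SRR(x, y) (((x) + (1ULL << ((y) - 1))) >> (y)) /* shift right and round */

    int main(void)
    {
            uint64_t w0 = 1024, w1 = 820;   /* prio_to_weight[] for nice 0 and nice 1 */
            uint64_t total = w0 + w1;
            uint64_t period_ns = 6000000;   /* assumed 6 ms scheduling period */

            /* exact: slice = period * w0 / total */
            printf("exact slice : %llu ns\n",
                   (unsigned long long)(period_ns * w0 / total));

            /* fixed point: precompute inv = 2^32 / total, then multiply and round */
            uint64_t inv = (1ULL << WMULT_SHIFT) / total;
            printf("fixed point : %llu ns\n",
                   (unsigned long long)SRR(period_ns * w0 * inv, WMULT_SHIFT));

            /* the ~10%-per-level rule: nice 0 vs nice 1 splits roughly 55.5 : 44.5 */
            printf("nice0 share : %.1f%%\n", 100.0 * w0 / total);
            return 0;
    }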
1445
1446/* Time spent by the tasks of the cpu accounting group executing in ... */
1447enum cpuacct_stat_index {
1448 CPUACCT_STAT_USER, /* ... user mode */
1449 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1450
1451 CPUACCT_STAT_NSTATS,
1452};
1453
1454#ifdef CONFIG_CGROUP_CPUACCT
1455static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1456static void cpuacct_update_stats(struct task_struct *tsk,
1457 enum cpuacct_stat_index idx, cputime_t val);
1458#else
1459static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1460static inline void cpuacct_update_stats(struct task_struct *tsk,
1461 enum cpuacct_stat_index idx, cputime_t val) {}
1462#endif
1463
1464static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1465{
1466 update_load_add(&rq->load, load);
1467}
1468
1469static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1470{
1471 update_load_sub(&rq->load, load);
1472}
1473
1474#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1475typedef int (*tg_visitor)(struct task_group *, void *);
1476
1477/*
1478 * Iterate the full tree, calling @down when first entering a node and @up when
1479 * leaving it for the final time.
1480 */ 654 */
1481static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 655int walk_tg_tree_from(struct task_group *from,
656 tg_visitor down, tg_visitor up, void *data)
1482{ 657{
1483 struct task_group *parent, *child; 658 struct task_group *parent, *child;
1484 int ret; 659 int ret;
1485 660
1486 rcu_read_lock(); 661 parent = from;
1487 parent = &root_task_group; 662
1488down: 663down:
1489 ret = (*down)(parent, data); 664 ret = (*down)(parent, data);
1490 if (ret) 665 if (ret)
1491 goto out_unlock; 666 goto out;
1492 list_for_each_entry_rcu(child, &parent->children, siblings) { 667 list_for_each_entry_rcu(child, &parent->children, siblings) {
1493 parent = child; 668 parent = child;
1494 goto down; 669 goto down;
@@ -1497,273 +672,24 @@ up:
1497 continue; 672 continue;
1498 } 673 }
1499 ret = (*up)(parent, data); 674 ret = (*up)(parent, data);
1500 if (ret) 675 if (ret || parent == from)
1501 goto out_unlock; 676 goto out;
1502 677
1503 child = parent; 678 child = parent;
1504 parent = parent->parent; 679 parent = parent->parent;
1505 if (parent) 680 if (parent)
1506 goto up; 681 goto up;
1507out_unlock: 682out:
1508 rcu_read_unlock();
1509
1510 return ret; 683 return ret;
1511} 684}
1512 685
1513static int tg_nop(struct task_group *tg, void *data) 686int tg_nop(struct task_group *tg, void *data)
1514{ 687{
1515 return 0; 688 return 0;
1516} 689}
1517#endif 690#endif
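
The rewritten walker above keeps an explicit parent/child cursor and goto labels so it can traverse the task_group tree without recursion, stops as soon as either callback returns non-zero, and never calls @up above the @from root. As a rough mental model only, a recursive user-space equivalent over a toy tree (the node type and callbacks are invented; the real code iterates an RCU-protected list and its caller must hold rcu_read_lock() or equivalent):

    struct node {
            struct node *child[4];
            int nr_children;
    };

    typedef int (*visit_fn)(struct node *n, void *data);

    /* recursive equivalent of walk_tg_tree_from(): @down pre-order, @up post-order */
    static int walk_from(struct node *from, visit_fn down, visit_fn up, void *data)
    {
            int ret = down(from, data);             /* @down on first entry */

            if (ret)
                    return ret;                     /* early termination */

            for (int i = 0; i < from->nr_children; i++) {
                    ret = walk_from(from->child[i], down, up, data);
                    if (ret)
                            return ret;
            }

            return up(from, data);                  /* @up when leaving for the last time */
    }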
1518 691
1519#ifdef CONFIG_SMP 692void update_cpu_load(struct rq *this_rq);
1520/* Used instead of source_load when we know the type == 0 */
1521static unsigned long weighted_cpuload(const int cpu)
1522{
1523 return cpu_rq(cpu)->load.weight;
1524}
1525
1526/*
1527 * Return a low guess at the load of a migration-source cpu weighted
1528 * according to the scheduling class and "nice" value.
1529 *
1530 * We want to under-estimate the load of migration sources, to
1531 * balance conservatively.
1532 */
1533static unsigned long source_load(int cpu, int type)
1534{
1535 struct rq *rq = cpu_rq(cpu);
1536 unsigned long total = weighted_cpuload(cpu);
1537
1538 if (type == 0 || !sched_feat(LB_BIAS))
1539 return total;
1540
1541 return min(rq->cpu_load[type-1], total);
1542}
1543
1544/*
1545 * Return a high guess at the load of a migration-target cpu weighted
1546 * according to the scheduling class and "nice" value.
1547 */
1548static unsigned long target_load(int cpu, int type)
1549{
1550 struct rq *rq = cpu_rq(cpu);
1551 unsigned long total = weighted_cpuload(cpu);
1552
1553 if (type == 0 || !sched_feat(LB_BIAS))
1554 return total;
1555
1556 return max(rq->cpu_load[type-1], total);
1557}
1558
1559static unsigned long power_of(int cpu)
1560{
1561 return cpu_rq(cpu)->cpu_power;
1562}
1563
1564static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1565
1566static unsigned long cpu_avg_load_per_task(int cpu)
1567{
1568 struct rq *rq = cpu_rq(cpu);
1569 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1570
1571 if (nr_running)
1572 rq->avg_load_per_task = rq->load.weight / nr_running;
1573 else
1574 rq->avg_load_per_task = 0;
1575
1576 return rq->avg_load_per_task;
1577}
1578
1579#ifdef CONFIG_PREEMPT
1580
1581static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1582
1583/*
1584 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1585 * way at the expense of forcing extra atomic operations in all
1586 * invocations. This assures that the double_lock is acquired using the
1587 * same underlying policy as the spinlock_t on this architecture, which
1588 * reduces latency compared to the unfair variant below. However, it
1589 * also adds more overhead and therefore may reduce throughput.
1590 */
1591static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1592 __releases(this_rq->lock)
1593 __acquires(busiest->lock)
1594 __acquires(this_rq->lock)
1595{
1596 raw_spin_unlock(&this_rq->lock);
1597 double_rq_lock(this_rq, busiest);
1598
1599 return 1;
1600}
1601
1602#else
1603/*
1604 * Unfair double_lock_balance: Optimizes throughput at the expense of
1605 * latency by eliminating extra atomic operations when the locks are
1606 * already in proper order on entry. This favors lower cpu-ids and will
1607 * grant the double lock to lower cpus over higher ids under contention,
1608 * regardless of entry order into the function.
1609 */
1610static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1611 __releases(this_rq->lock)
1612 __acquires(busiest->lock)
1613 __acquires(this_rq->lock)
1614{
1615 int ret = 0;
1616
1617 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1618 if (busiest < this_rq) {
1619 raw_spin_unlock(&this_rq->lock);
1620 raw_spin_lock(&busiest->lock);
1621 raw_spin_lock_nested(&this_rq->lock,
1622 SINGLE_DEPTH_NESTING);
1623 ret = 1;
1624 } else
1625 raw_spin_lock_nested(&busiest->lock,
1626 SINGLE_DEPTH_NESTING);
1627 }
1628 return ret;
1629}
1630
1631#endif /* CONFIG_PREEMPT */
1632
1633/*
1634 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1635 */
1636static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1637{
1638 if (unlikely(!irqs_disabled())) {
1639  /* printk() doesn't work well under rq->lock */
1640 raw_spin_unlock(&this_rq->lock);
1641 BUG_ON(1);
1642 }
1643
1644 return _double_lock_balance(this_rq, busiest);
1645}
1646
1647static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1648 __releases(busiest->lock)
1649{
1650 raw_spin_unlock(&busiest->lock);
1651 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1652}
1653
1654/*
1655 * double_rq_lock - safely lock two runqueues
1656 *
1657 * Note this does not disable interrupts like task_rq_lock,
1658 * you need to do so manually before calling.
1659 */
1660static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1661 __acquires(rq1->lock)
1662 __acquires(rq2->lock)
1663{
1664 BUG_ON(!irqs_disabled());
1665 if (rq1 == rq2) {
1666 raw_spin_lock(&rq1->lock);
1667 __acquire(rq2->lock); /* Fake it out ;) */
1668 } else {
1669 if (rq1 < rq2) {
1670 raw_spin_lock(&rq1->lock);
1671 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1672 } else {
1673 raw_spin_lock(&rq2->lock);
1674 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1675 }
1676 }
1677}
1678
1679/*
1680 * double_rq_unlock - safely unlock two runqueues
1681 *
1682 * Note this does not restore interrupts like task_rq_unlock,
1683 * you need to do so manually after calling.
1684 */
1685static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1686 __releases(rq1->lock)
1687 __releases(rq2->lock)
1688{
1689 raw_spin_unlock(&rq1->lock);
1690 if (rq1 != rq2)
1691 raw_spin_unlock(&rq2->lock);
1692 else
1693 __release(rq2->lock);
1694}
1695
1696#else /* CONFIG_SMP */
1697
1698/*
1699 * double_rq_lock - safely lock two runqueues
1700 *
1701 * Note this does not disable interrupts like task_rq_lock,
1702 * you need to do so manually before calling.
1703 */
1704static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1705 __acquires(rq1->lock)
1706 __acquires(rq2->lock)
1707{
1708 BUG_ON(!irqs_disabled());
1709 BUG_ON(rq1 != rq2);
1710 raw_spin_lock(&rq1->lock);
1711 __acquire(rq2->lock); /* Fake it out ;) */
1712}
1713
1714/*
1715 * double_rq_unlock - safely unlock two runqueues
1716 *
1717 * Note this does not restore interrupts like task_rq_unlock,
1718 * you need to do so manually after calling.
1719 */
1720static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1721 __releases(rq1->lock)
1722 __releases(rq2->lock)
1723{
1724 BUG_ON(rq1 != rq2);
1725 raw_spin_unlock(&rq1->lock);
1726 __release(rq2->lock);
1727}
1728
1729#endif
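
The balance-locking helpers deleted here (they live on in the new kernel/sched/ files rather than going away) all rely on one invariant: when two runqueue locks must be held, they are taken in ascending address order, and double_lock_balance() may drop this_rq->lock so both can be re-acquired in that order without an ABBA deadlock. A minimal user-space sketch of the ordering rule, with pthread mutexes standing in for raw_spinlock_t:

    #include <pthread.h>

    /* hedged sketch, not kernel code: lock two queues in address order */
    static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
    {
            if (a == b) {                   /* same queue: take it once */
                    pthread_mutex_lock(a);
                    return;
            }
            if (a > b) {                    /* normalize to address order */
                    pthread_mutex_t *tmp = a;
                    a = b;
                    b = tmp;
            }
            pthread_mutex_lock(a);          /* lower address first ... */
            pthread_mutex_lock(b);          /* ... then the higher one */
    }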
1730
1731static void calc_load_account_idle(struct rq *this_rq);
1732static void update_sysctl(void);
1733static int get_update_sysctl_factor(void);
1734static void update_cpu_load(struct rq *this_rq);
1735
1736static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1737{
1738 set_task_rq(p, cpu);
1739#ifdef CONFIG_SMP
1740 /*
1741 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1742  * successfully executed on another CPU. We must ensure that updates of
1743 * per-task data have been completed by this moment.
1744 */
1745 smp_wmb();
1746 task_thread_info(p)->cpu = cpu;
1747#endif
1748}
1749
1750static const struct sched_class rt_sched_class;
1751
1752#define sched_class_highest (&stop_sched_class)
1753#define for_each_class(class) \
1754 for (class = sched_class_highest; class; class = class->next)
1755
1756#include "sched_stats.h"
1757
1758static void inc_nr_running(struct rq *rq)
1759{
1760 rq->nr_running++;
1761}
1762
1763static void dec_nr_running(struct rq *rq)
1764{
1765 rq->nr_running--;
1766}
1767 693
1768static void set_load_weight(struct task_struct *p) 694static void set_load_weight(struct task_struct *p)
1769{ 695{
@@ -1800,25 +726,23 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1800/* 726/*
1801 * activate_task - move a task to the runqueue. 727 * activate_task - move a task to the runqueue.
1802 */ 728 */
1803static void activate_task(struct rq *rq, struct task_struct *p, int flags) 729void activate_task(struct rq *rq, struct task_struct *p, int flags)
1804{ 730{
1805 if (task_contributes_to_load(p)) 731 if (task_contributes_to_load(p))
1806 rq->nr_uninterruptible--; 732 rq->nr_uninterruptible--;
1807 733
1808 enqueue_task(rq, p, flags); 734 enqueue_task(rq, p, flags);
1809 inc_nr_running(rq);
1810} 735}
1811 736
1812/* 737/*
1813 * deactivate_task - remove a task from the runqueue. 738 * deactivate_task - remove a task from the runqueue.
1814 */ 739 */
1815static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 740void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1816{ 741{
1817 if (task_contributes_to_load(p)) 742 if (task_contributes_to_load(p))
1818 rq->nr_uninterruptible++; 743 rq->nr_uninterruptible++;
1819 744
1820 dequeue_task(rq, p, flags); 745 dequeue_task(rq, p, flags);
1821 dec_nr_running(rq);
1822} 746}
1823 747
1824#ifdef CONFIG_IRQ_TIME_ACCOUNTING 748#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -2004,14 +928,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
2004#ifdef CONFIG_IRQ_TIME_ACCOUNTING 928#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2005static int irqtime_account_hi_update(void) 929static int irqtime_account_hi_update(void)
2006{ 930{
2007 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 931 u64 *cpustat = kcpustat_this_cpu->cpustat;
2008 unsigned long flags; 932 unsigned long flags;
2009 u64 latest_ns; 933 u64 latest_ns;
2010 int ret = 0; 934 int ret = 0;
2011 935
2012 local_irq_save(flags); 936 local_irq_save(flags);
2013 latest_ns = this_cpu_read(cpu_hardirq_time); 937 latest_ns = this_cpu_read(cpu_hardirq_time);
2014 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) 938 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
2015 ret = 1; 939 ret = 1;
2016 local_irq_restore(flags); 940 local_irq_restore(flags);
2017 return ret; 941 return ret;
@@ -2019,14 +943,14 @@ static int irqtime_account_hi_update(void)
2019 943
2020static int irqtime_account_si_update(void) 944static int irqtime_account_si_update(void)
2021{ 945{
2022 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 946 u64 *cpustat = kcpustat_this_cpu->cpustat;
2023 unsigned long flags; 947 unsigned long flags;
2024 u64 latest_ns; 948 u64 latest_ns;
2025 int ret = 0; 949 int ret = 0;
2026 950
2027 local_irq_save(flags); 951 local_irq_save(flags);
2028 latest_ns = this_cpu_read(cpu_softirq_time); 952 latest_ns = this_cpu_read(cpu_softirq_time);
2029 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) 953 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
2030 ret = 1; 954 ret = 1;
2031 local_irq_restore(flags); 955 local_irq_restore(flags);
2032 return ret; 956 return ret;
@@ -2038,15 +962,6 @@ static int irqtime_account_si_update(void)
2038 962
2039#endif 963#endif
2040 964
2041#include "sched_idletask.c"
2042#include "sched_fair.c"
2043#include "sched_rt.c"
2044#include "sched_autogroup.c"
2045#include "sched_stoptask.c"
2046#ifdef CONFIG_SCHED_DEBUG
2047# include "sched_debug.c"
2048#endif
2049
2050void sched_set_stop_task(int cpu, struct task_struct *stop) 965void sched_set_stop_task(int cpu, struct task_struct *stop)
2051{ 966{
2052 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 967 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -2144,7 +1059,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2144 p->sched_class->prio_changed(rq, p, oldprio); 1059 p->sched_class->prio_changed(rq, p, oldprio);
2145} 1060}
2146 1061
2147static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 1062void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2148{ 1063{
2149 const struct sched_class *class; 1064 const struct sched_class *class;
2150 1065
@@ -2170,38 +1085,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2170} 1085}
2171 1086
2172#ifdef CONFIG_SMP 1087#ifdef CONFIG_SMP
2173/*
2174 * Is this task likely cache-hot:
2175 */
2176static int
2177task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2178{
2179 s64 delta;
2180
2181 if (p->sched_class != &fair_sched_class)
2182 return 0;
2183
2184 if (unlikely(p->policy == SCHED_IDLE))
2185 return 0;
2186
2187 /*
2188 * Buddy candidates are cache hot:
2189 */
2190 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2191 (&p->se == cfs_rq_of(&p->se)->next ||
2192 &p->se == cfs_rq_of(&p->se)->last))
2193 return 1;
2194
2195 if (sysctl_sched_migration_cost == -1)
2196 return 1;
2197 if (sysctl_sched_migration_cost == 0)
2198 return 0;
2199
2200 delta = now - p->se.exec_start;
2201
2202 return delta < (s64)sysctl_sched_migration_cost;
2203}
2204
2205void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1088void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2206{ 1089{
2207#ifdef CONFIG_SCHED_DEBUG 1090#ifdef CONFIG_SCHED_DEBUG
@@ -2390,11 +1273,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2390 1273
2391 /* Look for allowed, online CPU in same node. */ 1274 /* Look for allowed, online CPU in same node. */
2392 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 1275 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2393 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 1276 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
2394 return dest_cpu; 1277 return dest_cpu;
2395 1278
2396 /* Any allowed, online CPU? */ 1279 /* Any allowed, online CPU? */
2397 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 1280 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
2398 if (dest_cpu < nr_cpu_ids) 1281 if (dest_cpu < nr_cpu_ids)
2399 return dest_cpu; 1282 return dest_cpu;
2400 1283
@@ -2431,7 +1314,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2431 * [ this allows ->select_task() to simply return task_cpu(p) and 1314 * [ this allows ->select_task() to simply return task_cpu(p) and
2432 * not worry about this generic constraint ] 1315 * not worry about this generic constraint ]
2433 */ 1316 */
2434 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || 1317 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
2435 !cpu_online(cpu))) 1318 !cpu_online(cpu)))
2436 cpu = select_fallback_rq(task_cpu(p), p); 1319 cpu = select_fallback_rq(task_cpu(p), p);
2437 1320
@@ -2556,42 +1439,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
2556} 1439}
2557 1440
2558#ifdef CONFIG_SMP 1441#ifdef CONFIG_SMP
2559static void sched_ttwu_do_pending(struct task_struct *list) 1442static void sched_ttwu_pending(void)
2560{ 1443{
2561 struct rq *rq = this_rq(); 1444 struct rq *rq = this_rq();
1445 struct llist_node *llist = llist_del_all(&rq->wake_list);
1446 struct task_struct *p;
2562 1447
2563 raw_spin_lock(&rq->lock); 1448 raw_spin_lock(&rq->lock);
2564 1449
2565 while (list) { 1450 while (llist) {
2566 struct task_struct *p = list; 1451 p = llist_entry(llist, struct task_struct, wake_entry);
2567 list = list->wake_entry; 1452 llist = llist_next(llist);
2568 ttwu_do_activate(rq, p, 0); 1453 ttwu_do_activate(rq, p, 0);
2569 } 1454 }
2570 1455
2571 raw_spin_unlock(&rq->lock); 1456 raw_spin_unlock(&rq->lock);
2572} 1457}
2573 1458
2574#ifdef CONFIG_HOTPLUG_CPU
2575
2576static void sched_ttwu_pending(void)
2577{
2578 struct rq *rq = this_rq();
2579 struct task_struct *list = xchg(&rq->wake_list, NULL);
2580
2581 if (!list)
2582 return;
2583
2584 sched_ttwu_do_pending(list);
2585}
2586
2587#endif /* CONFIG_HOTPLUG_CPU */
2588
2589void scheduler_ipi(void) 1459void scheduler_ipi(void)
2590{ 1460{
2591 struct rq *rq = this_rq(); 1461 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2592 struct task_struct *list = xchg(&rq->wake_list, NULL);
2593
2594 if (!list)
2595 return; 1462 return;
2596 1463
2597 /* 1464 /*
@@ -2608,25 +1475,21 @@ void scheduler_ipi(void)
2608 * somewhat pessimize the simple resched case. 1475 * somewhat pessimize the simple resched case.
2609 */ 1476 */
2610 irq_enter(); 1477 irq_enter();
2611 sched_ttwu_do_pending(list); 1478 sched_ttwu_pending();
1479
1480 /*
1481 * Check if someone kicked us for doing the nohz idle load balance.
1482 */
1483 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
1484 this_rq()->idle_balance = 1;
1485 raise_softirq_irqoff(SCHED_SOFTIRQ);
1486 }
2612 irq_exit(); 1487 irq_exit();
2613} 1488}
2614 1489
2615static void ttwu_queue_remote(struct task_struct *p, int cpu) 1490static void ttwu_queue_remote(struct task_struct *p, int cpu)
2616{ 1491{
2617 struct rq *rq = cpu_rq(cpu); 1492 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
2618 struct task_struct *next = rq->wake_list;
2619
2620 for (;;) {
2621 struct task_struct *old = next;
2622
2623 p->wake_entry = next;
2624 next = cmpxchg(&rq->wake_list, old, p);
2625 if (next == old)
2626 break;
2627 }
2628
2629 if (!next)
2630 smp_send_reschedule(cpu); 1493 smp_send_reschedule(cpu);
2631} 1494}
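
The hand-rolled cmpxchg wake list is replaced by the generic llist primitives: llist_add() reports whether the list was empty, so only the first waker sends the reschedule IPI, and sched_ttwu_pending() later detaches the whole batch with llist_del_all(). A rough user-space rendering of that pattern using C11 atomics (the types and names are invented for illustration, not the kernel llist API):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct wake_node {
            struct wake_node *next;
    };

    /* returns true if the list was empty, i.e. this caller should send the IPI */
    static bool wake_list_add(_Atomic(struct wake_node *) *head, struct wake_node *n)
    {
            struct wake_node *first = atomic_load(head);

            do {
                    n->next = first;
            } while (!atomic_compare_exchange_weak(head, &first, n));

            return first == NULL;
    }

    /* consumer side: detach everything queued so far in one shot */
    static struct wake_node *wake_list_del_all(_Atomic(struct wake_node *) *head)
    {
            return atomic_exchange(head, NULL);
    }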
2632 1495
@@ -2648,6 +1511,11 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2648 1511
2649} 1512}
2650#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1513#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1514
1515static inline int ttwu_share_cache(int this_cpu, int that_cpu)
1516{
1517 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1518}
2651#endif /* CONFIG_SMP */ 1519#endif /* CONFIG_SMP */
2652 1520
2653static void ttwu_queue(struct task_struct *p, int cpu) 1521static void ttwu_queue(struct task_struct *p, int cpu)
@@ -2655,7 +1523,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
2655 struct rq *rq = cpu_rq(cpu); 1523 struct rq *rq = cpu_rq(cpu);
2656 1524
2657#if defined(CONFIG_SMP) 1525#if defined(CONFIG_SMP)
2658 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { 1526 if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
2659 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1527 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2660 ttwu_queue_remote(p, cpu); 1528 ttwu_queue_remote(p, cpu);
2661 return; 1529 return;
@@ -2848,19 +1716,23 @@ void sched_fork(struct task_struct *p)
2848 p->state = TASK_RUNNING; 1716 p->state = TASK_RUNNING;
2849 1717
2850 /* 1718 /*
1719 * Make sure we do not leak PI boosting priority to the child.
1720 */
1721 p->prio = current->normal_prio;
1722
1723 /*
2851 * Revert to default priority/policy on fork if requested. 1724 * Revert to default priority/policy on fork if requested.
2852 */ 1725 */
2853 if (unlikely(p->sched_reset_on_fork)) { 1726 if (unlikely(p->sched_reset_on_fork)) {
2854 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 1727 if (task_has_rt_policy(p)) {
2855 p->policy = SCHED_NORMAL; 1728 p->policy = SCHED_NORMAL;
2856 p->normal_prio = p->static_prio;
2857 }
2858
2859 if (PRIO_TO_NICE(p->static_prio) < 0) {
2860 p->static_prio = NICE_TO_PRIO(0); 1729 p->static_prio = NICE_TO_PRIO(0);
2861 p->normal_prio = p->static_prio; 1730 p->rt_priority = 0;
2862 set_load_weight(p); 1731 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2863 } 1732 p->static_prio = NICE_TO_PRIO(0);
1733
1734 p->prio = p->normal_prio = __normal_prio(p);
1735 set_load_weight(p);
2864 1736
2865 /* 1737 /*
2866 * We don't need the reset flag anymore after the fork. It has 1738 * We don't need the reset flag anymore after the fork. It has
@@ -2869,11 +1741,6 @@ void sched_fork(struct task_struct *p)
2869 p->sched_reset_on_fork = 0; 1741 p->sched_reset_on_fork = 0;
2870 } 1742 }
2871 1743
2872 /*
2873 * Make sure we do not leak PI boosting priority to the child.
2874 */
2875 p->prio = current->normal_prio;
2876
2877 if (!rt_prio(p->prio)) 1744 if (!rt_prio(p->prio))
2878 p->sched_class = &fair_sched_class; 1745 p->sched_class = &fair_sched_class;
2879 1746
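
The reordered fork path above first strips any PI-boosted priority from the child and then, for tasks flagged with sched_reset_on_fork, drops RT policy and negative nice before recomputing p->prio. From userspace the flag is requested by OR-ing SCHED_RESET_ON_FORK into the policy passed to sched_setscheduler(); a small hedged example (it only affects children forked afterwards, and setting SCHED_FIFO needs RT privileges):

    #include <sched.h>
    #include <stdio.h>

    #ifndef SCHED_RESET_ON_FORK
    #define SCHED_RESET_ON_FORK 0x40000000  /* from linux/sched.h if libc lacks it */
    #endif

    int main(void)
    {
            struct sched_param sp = { .sched_priority = 10 };

            /* run as SCHED_FIFO ourselves, but have children revert to SCHED_NORMAL */
            if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp))
                    perror("sched_setscheduler");

            return 0;
    }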
@@ -3070,6 +1937,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3070 local_irq_enable(); 1937 local_irq_enable();
3071#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1938#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3072 finish_lock_switch(rq, prev); 1939 finish_lock_switch(rq, prev);
1940 trace_sched_stat_sleeptime(current, rq->clock);
3073 1941
3074 fire_sched_in_preempt_notifiers(current); 1942 fire_sched_in_preempt_notifiers(current);
3075 if (mm) 1943 if (mm)
@@ -3305,7 +2173,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
3305 */ 2173 */
3306static atomic_long_t calc_load_tasks_idle; 2174static atomic_long_t calc_load_tasks_idle;
3307 2175
3308static void calc_load_account_idle(struct rq *this_rq) 2176void calc_load_account_idle(struct rq *this_rq)
3309{ 2177{
3310 long delta; 2178 long delta;
3311 2179
@@ -3449,7 +2317,7 @@ static void calc_global_nohz(unsigned long ticks)
3449 */ 2317 */
3450} 2318}
3451#else 2319#else
3452static void calc_load_account_idle(struct rq *this_rq) 2320void calc_load_account_idle(struct rq *this_rq)
3453{ 2321{
3454} 2322}
3455 2323
@@ -3592,7 +2460,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3592 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2460 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3593 * every tick. We fix it up based on jiffies. 2461 * every tick. We fix it up based on jiffies.
3594 */ 2462 */
3595static void update_cpu_load(struct rq *this_rq) 2463void update_cpu_load(struct rq *this_rq)
3596{ 2464{
3597 unsigned long this_load = this_rq->load.weight; 2465 unsigned long this_load = this_rq->load.weight;
3598 unsigned long curr_jiffies = jiffies; 2466 unsigned long curr_jiffies = jiffies;
@@ -3670,8 +2538,10 @@ unlock:
3670#endif 2538#endif
3671 2539
3672DEFINE_PER_CPU(struct kernel_stat, kstat); 2540DEFINE_PER_CPU(struct kernel_stat, kstat);
2541DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
3673 2542
3674EXPORT_PER_CPU_SYMBOL(kstat); 2543EXPORT_PER_CPU_SYMBOL(kstat);
2544EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3675 2545
3676/* 2546/*
3677 * Return any ns on the sched_clock that have not yet been accounted in 2547 * Return any ns on the sched_clock that have not yet been accounted in
@@ -3724,6 +2594,42 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3724 return ns; 2594 return ns;
3725} 2595}
3726 2596
2597#ifdef CONFIG_CGROUP_CPUACCT
2598struct cgroup_subsys cpuacct_subsys;
2599struct cpuacct root_cpuacct;
2600#endif
2601
2602static inline void task_group_account_field(struct task_struct *p, int index,
2603 u64 tmp)
2604{
2605#ifdef CONFIG_CGROUP_CPUACCT
2606 struct kernel_cpustat *kcpustat;
2607 struct cpuacct *ca;
2608#endif
2609 /*
2610 * Since all updates are sure to touch the root cgroup, we
2611 * get ourselves ahead and touch it first. If the root cgroup
2612 * is the only cgroup, then nothing else should be necessary.
2613 *
2614 */
2615 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2616
2617#ifdef CONFIG_CGROUP_CPUACCT
2618 if (unlikely(!cpuacct_subsys.active))
2619 return;
2620
2621 rcu_read_lock();
2622 ca = task_ca(p);
2623 while (ca && (ca != &root_cpuacct)) {
2624 kcpustat = this_cpu_ptr(ca->cpustat);
2625 kcpustat->cpustat[index] += tmp;
2626 ca = parent_ca(ca);
2627 }
2628 rcu_read_unlock();
2629#endif
2630}
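
task_group_account_field() folds what used to be a cpustat update plus a separate cpuacct_update_stats() call into one helper: the per-CPU kernel_cpustat (the root view) is charged unconditionally, and only then is the cpuacct ancestry walked under RCU, stopping at the root group. The shape of that pattern, as a hedged stand-alone sketch with invented types:

    struct acct {
            struct acct *parent;
            unsigned long long stat[8];
    };

    static struct acct root_acct;
    static unsigned long long global_stat[8];   /* stand-in for kernel_cpustat */

    static void account_field(struct acct *ca, int index, unsigned long long val)
    {
            global_stat[index] += val;              /* root bucket, always charged */

            for (; ca && ca != &root_acct; ca = ca->parent)
                    ca->stat[index] += val;         /* every non-root ancestor */
    }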
2631
2632
3727/* 2633/*
3728 * Account user cpu time to a process. 2634 * Account user cpu time to a process.
3729 * @p: the process that the cpu time gets accounted to 2635 * @p: the process that the cpu time gets accounted to
@@ -3733,22 +2639,18 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3733void account_user_time(struct task_struct *p, cputime_t cputime, 2639void account_user_time(struct task_struct *p, cputime_t cputime,
3734 cputime_t cputime_scaled) 2640 cputime_t cputime_scaled)
3735{ 2641{
3736 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2642 int index;
3737 cputime64_t tmp;
3738 2643
3739 /* Add user time to process. */ 2644 /* Add user time to process. */
3740 p->utime = cputime_add(p->utime, cputime); 2645 p->utime += cputime;
3741 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 2646 p->utimescaled += cputime_scaled;
3742 account_group_user_time(p, cputime); 2647 account_group_user_time(p, cputime);
3743 2648
2649 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2650
3744 /* Add user time to cpustat. */ 2651 /* Add user time to cpustat. */
3745 tmp = cputime_to_cputime64(cputime); 2652 task_group_account_field(p, index, (__force u64) cputime);
3746 if (TASK_NICE(p) > 0)
3747 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3748 else
3749 cpustat->user = cputime64_add(cpustat->user, tmp);
3750 2653
3751 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3752 /* Account for user time used */ 2654 /* Account for user time used */
3753 acct_update_integrals(p); 2655 acct_update_integrals(p);
3754} 2656}
@@ -3762,24 +2664,21 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
3762static void account_guest_time(struct task_struct *p, cputime_t cputime, 2664static void account_guest_time(struct task_struct *p, cputime_t cputime,
3763 cputime_t cputime_scaled) 2665 cputime_t cputime_scaled)
3764{ 2666{
3765 cputime64_t tmp; 2667 u64 *cpustat = kcpustat_this_cpu->cpustat;
3766 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3767
3768 tmp = cputime_to_cputime64(cputime);
3769 2668
3770 /* Add guest time to process. */ 2669 /* Add guest time to process. */
3771 p->utime = cputime_add(p->utime, cputime); 2670 p->utime += cputime;
3772 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 2671 p->utimescaled += cputime_scaled;
3773 account_group_user_time(p, cputime); 2672 account_group_user_time(p, cputime);
3774 p->gtime = cputime_add(p->gtime, cputime); 2673 p->gtime += cputime;
3775 2674
3776 /* Add guest time to cpustat. */ 2675 /* Add guest time to cpustat. */
3777 if (TASK_NICE(p) > 0) { 2676 if (TASK_NICE(p) > 0) {
3778 cpustat->nice = cputime64_add(cpustat->nice, tmp); 2677 cpustat[CPUTIME_NICE] += (__force u64) cputime;
3779 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 2678 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
3780 } else { 2679 } else {
3781 cpustat->user = cputime64_add(cpustat->user, tmp); 2680 cpustat[CPUTIME_USER] += (__force u64) cputime;
3782 cpustat->guest = cputime64_add(cpustat->guest, tmp); 2681 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
3783 } 2682 }
3784} 2683}
3785 2684
@@ -3792,18 +2691,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3792 */ 2691 */
3793static inline 2692static inline
3794void __account_system_time(struct task_struct *p, cputime_t cputime, 2693void __account_system_time(struct task_struct *p, cputime_t cputime,
3795 cputime_t cputime_scaled, cputime64_t *target_cputime64) 2694 cputime_t cputime_scaled, int index)
3796{ 2695{
3797 cputime64_t tmp = cputime_to_cputime64(cputime);
3798
3799 /* Add system time to process. */ 2696 /* Add system time to process. */
3800 p->stime = cputime_add(p->stime, cputime); 2697 p->stime += cputime;
3801 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 2698 p->stimescaled += cputime_scaled;
3802 account_group_system_time(p, cputime); 2699 account_group_system_time(p, cputime);
3803 2700
3804 /* Add system time to cpustat. */ 2701 /* Add system time to cpustat. */
3805 *target_cputime64 = cputime64_add(*target_cputime64, tmp); 2702 task_group_account_field(p, index, (__force u64) cputime);
3806 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3807 2703
3808 /* Account for system time used */ 2704 /* Account for system time used */
3809 acct_update_integrals(p); 2705 acct_update_integrals(p);
@@ -3819,8 +2715,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
3819void account_system_time(struct task_struct *p, int hardirq_offset, 2715void account_system_time(struct task_struct *p, int hardirq_offset,
3820 cputime_t cputime, cputime_t cputime_scaled) 2716 cputime_t cputime, cputime_t cputime_scaled)
3821{ 2717{
3822 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2718 int index;
3823 cputime64_t *target_cputime64;
3824 2719
3825 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 2720 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3826 account_guest_time(p, cputime, cputime_scaled); 2721 account_guest_time(p, cputime, cputime_scaled);
@@ -3828,13 +2723,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3828 } 2723 }
3829 2724
3830 if (hardirq_count() - hardirq_offset) 2725 if (hardirq_count() - hardirq_offset)
3831 target_cputime64 = &cpustat->irq; 2726 index = CPUTIME_IRQ;
3832 else if (in_serving_softirq()) 2727 else if (in_serving_softirq())
3833 target_cputime64 = &cpustat->softirq; 2728 index = CPUTIME_SOFTIRQ;
3834 else 2729 else
3835 target_cputime64 = &cpustat->system; 2730 index = CPUTIME_SYSTEM;
3836 2731
3837 __account_system_time(p, cputime, cputime_scaled, target_cputime64); 2732 __account_system_time(p, cputime, cputime_scaled, index);
3838} 2733}
3839 2734
3840/* 2735/*
@@ -3843,10 +2738,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3843 */ 2738 */
3844void account_steal_time(cputime_t cputime) 2739void account_steal_time(cputime_t cputime)
3845{ 2740{
3846 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2741 u64 *cpustat = kcpustat_this_cpu->cpustat;
3847 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3848 2742
3849 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 2743 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
3850} 2744}
3851 2745
3852/* 2746/*
@@ -3855,14 +2749,13 @@ void account_steal_time(cputime_t cputime)
3855 */ 2749 */
3856void account_idle_time(cputime_t cputime) 2750void account_idle_time(cputime_t cputime)
3857{ 2751{
3858 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2752 u64 *cpustat = kcpustat_this_cpu->cpustat;
3859 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3860 struct rq *rq = this_rq(); 2753 struct rq *rq = this_rq();
3861 2754
3862 if (atomic_read(&rq->nr_iowait) > 0) 2755 if (atomic_read(&rq->nr_iowait) > 0)
3863 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 2756 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
3864 else 2757 else
3865 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 2758 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
3866} 2759}
3867 2760
3868static __always_inline bool steal_account_process_tick(void) 2761static __always_inline bool steal_account_process_tick(void)
@@ -3912,16 +2805,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3912 struct rq *rq) 2805 struct rq *rq)
3913{ 2806{
3914 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 2807 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3915 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 2808 u64 *cpustat = kcpustat_this_cpu->cpustat;
3916 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3917 2809
3918 if (steal_account_process_tick()) 2810 if (steal_account_process_tick())
3919 return; 2811 return;
3920 2812
3921 if (irqtime_account_hi_update()) { 2813 if (irqtime_account_hi_update()) {
3922 cpustat->irq = cputime64_add(cpustat->irq, tmp); 2814 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
3923 } else if (irqtime_account_si_update()) { 2815 } else if (irqtime_account_si_update()) {
3924 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 2816 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
3925 } else if (this_cpu_ksoftirqd() == p) { 2817 } else if (this_cpu_ksoftirqd() == p) {
3926 /* 2818 /*
3927 * ksoftirqd time do not get accounted in cpu_softirq_time. 2819 * ksoftirqd time do not get accounted in cpu_softirq_time.
@@ -3929,7 +2821,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3929 * Also, p->stime needs to be updated for ksoftirqd. 2821 * Also, p->stime needs to be updated for ksoftirqd.
3930 */ 2822 */
3931 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2823 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3932 &cpustat->softirq); 2824 CPUTIME_SOFTIRQ);
3933 } else if (user_tick) { 2825 } else if (user_tick) {
3934 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 2826 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3935 } else if (p == rq->idle) { 2827 } else if (p == rq->idle) {
@@ -3938,7 +2830,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3938 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 2830 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3939 } else { 2831 } else {
3940 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2832 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3941 &cpustat->system); 2833 CPUTIME_SYSTEM);
3942 } 2834 }
3943} 2835}
3944 2836
@@ -4037,7 +2929,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4037 2929
4038void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 2930void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4039{ 2931{
4040 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 2932 cputime_t rtime, utime = p->utime, total = utime + p->stime;
4041 2933
4042 /* 2934 /*
4043 * Use CFS's precise accounting: 2935 * Use CFS's precise accounting:
@@ -4045,11 +2937,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4045 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 2937 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
4046 2938
4047 if (total) { 2939 if (total) {
4048 u64 temp = rtime; 2940 u64 temp = (__force u64) rtime;
4049 2941
4050 temp *= utime; 2942 temp *= (__force u64) utime;
4051 do_div(temp, total); 2943 do_div(temp, (__force u32) total);
4052 utime = (cputime_t)temp; 2944 utime = (__force cputime_t) temp;
4053 } else 2945 } else
4054 utime = rtime; 2946 utime = rtime;
4055 2947
@@ -4057,7 +2949,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4057 * Compare with previous values, to keep monotonicity: 2949 * Compare with previous values, to keep monotonicity:
4058 */ 2950 */
4059 p->prev_utime = max(p->prev_utime, utime); 2951 p->prev_utime = max(p->prev_utime, utime);
4060 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 2952 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
4061 2953
4062 *ut = p->prev_utime; 2954 *ut = p->prev_utime;
4063 *st = p->prev_stime; 2955 *st = p->prev_stime;
@@ -4074,21 +2966,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4074 2966
4075 thread_group_cputime(p, &cputime); 2967 thread_group_cputime(p, &cputime);
4076 2968
4077 total = cputime_add(cputime.utime, cputime.stime); 2969 total = cputime.utime + cputime.stime;
4078 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 2970 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
4079 2971
4080 if (total) { 2972 if (total) {
4081 u64 temp = rtime; 2973 u64 temp = (__force u64) rtime;
4082 2974
4083 temp *= cputime.utime; 2975 temp *= (__force u64) cputime.utime;
4084 do_div(temp, total); 2976 do_div(temp, (__force u32) total);
4085 utime = (cputime_t)temp; 2977 utime = (__force cputime_t) temp;
4086 } else 2978 } else
4087 utime = rtime; 2979 utime = rtime;
4088 2980
4089 sig->prev_utime = max(sig->prev_utime, utime); 2981 sig->prev_utime = max(sig->prev_utime, utime);
4090 sig->prev_stime = max(sig->prev_stime, 2982 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
4091 cputime_sub(rtime, sig->prev_utime));
4092 2983
4093 *ut = sig->prev_utime; 2984 *ut = sig->prev_utime;
4094 *st = sig->prev_stime; 2985 *st = sig->prev_stime;
@@ -4116,7 +3007,7 @@ void scheduler_tick(void)
4116 perf_event_task_tick(); 3007 perf_event_task_tick();
4117 3008
4118#ifdef CONFIG_SMP 3009#ifdef CONFIG_SMP
4119 rq->idle_at_tick = idle_cpu(cpu); 3010 rq->idle_balance = idle_cpu(cpu);
4120 trigger_load_balance(rq, cpu); 3011 trigger_load_balance(rq, cpu);
4121#endif 3012#endif
4122} 3013}
@@ -4187,6 +3078,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
4187{ 3078{
4188 struct pt_regs *regs = get_irq_regs(); 3079 struct pt_regs *regs = get_irq_regs();
4189 3080
3081 if (oops_in_progress)
3082 return;
3083
4190 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3084 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4191 prev->comm, prev->pid, preempt_count()); 3085 prev->comm, prev->pid, preempt_count());
4192 3086
@@ -4213,6 +3107,7 @@ static inline void schedule_debug(struct task_struct *prev)
4213 */ 3107 */
4214 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 3108 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4215 __schedule_bug(prev); 3109 __schedule_bug(prev);
3110 rcu_sleep_check();
4216 3111
4217 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3112 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4218 3113
@@ -4239,7 +3134,7 @@ pick_next_task(struct rq *rq)
4239 * Optimization: we know that if all tasks are in 3134 * Optimization: we know that if all tasks are in
4240 * the fair class we can call that function directly: 3135 * the fair class we can call that function directly:
4241 */ 3136 */
4242 if (likely(rq->nr_running == rq->cfs.nr_running)) { 3137 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
4243 p = fair_sched_class.pick_next_task(rq); 3138 p = fair_sched_class.pick_next_task(rq);
4244 if (likely(p)) 3139 if (likely(p))
4245 return p; 3140 return p;
@@ -4676,6 +3571,9 @@ EXPORT_SYMBOL(wait_for_completion);
4676 * This waits for either a completion of a specific task to be signaled or for a 3571 * This waits for either a completion of a specific task to be signaled or for a
4677 * specified timeout to expire. The timeout is in jiffies. It is not 3572 * specified timeout to expire. The timeout is in jiffies. It is not
4678 * interruptible. 3573 * interruptible.
3574 *
3575 * The return value is 0 if timed out, and positive (at least 1, or number of
3576 * jiffies left till timeout) if completed.
4679 */ 3577 */
4680unsigned long __sched 3578unsigned long __sched
4681wait_for_completion_timeout(struct completion *x, unsigned long timeout) 3579wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -4690,6 +3588,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
4690 * 3588 *
4691 * This waits for completion of a specific task to be signaled. It is 3589 * This waits for completion of a specific task to be signaled. It is
4692 * interruptible. 3590 * interruptible.
3591 *
3592 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
4693 */ 3593 */
4694int __sched wait_for_completion_interruptible(struct completion *x) 3594int __sched wait_for_completion_interruptible(struct completion *x)
4695{ 3595{
@@ -4707,6 +3607,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4707 * 3607 *
4708 * This waits for either a completion of a specific task to be signaled or for a 3608 * This waits for either a completion of a specific task to be signaled or for a
4709 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 3609 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
3610 *
3611 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
3612 * positive (at least 1, or number of jiffies left till timeout) if completed.
4710 */ 3613 */
4711long __sched 3614long __sched
4712wait_for_completion_interruptible_timeout(struct completion *x, 3615wait_for_completion_interruptible_timeout(struct completion *x,
@@ -4722,6 +3625,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4722 * 3625 *
4723 * This waits to be signaled for completion of a specific task. It can be 3626 * This waits to be signaled for completion of a specific task. It can be
4724 * interrupted by a kill signal. 3627 * interrupted by a kill signal.
3628 *
3629 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
4725 */ 3630 */
4726int __sched wait_for_completion_killable(struct completion *x) 3631int __sched wait_for_completion_killable(struct completion *x)
4727{ 3632{
@@ -4740,6 +3645,9 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4740 * This waits for either a completion of a specific task to be 3645 * This waits for either a completion of a specific task to be
4741 * signaled or for a specified timeout to expire. It can be 3646 * signaled or for a specified timeout to expire. It can be
4742 * interrupted by a kill signal. The timeout is in jiffies. 3647 * interrupted by a kill signal. The timeout is in jiffies.
3648 *
3649 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
3650 * positive (at least 1, or number of jiffies left till timeout) if completed.
4743 */ 3651 */
4744long __sched 3652long __sched
4745wait_for_completion_killable_timeout(struct completion *x, 3653wait_for_completion_killable_timeout(struct completion *x,
@@ -5025,7 +3933,20 @@ EXPORT_SYMBOL(task_nice);
5025 */ 3933 */
5026int idle_cpu(int cpu) 3934int idle_cpu(int cpu)
5027{ 3935{
5028 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 3936 struct rq *rq = cpu_rq(cpu);
3937
3938 if (rq->curr != rq->idle)
3939 return 0;
3940
3941 if (rq->nr_running)
3942 return 0;
3943
3944#ifdef CONFIG_SMP
3945 if (!llist_empty(&rq->wake_list))
3946 return 0;
3947#endif
3948
3949 return 1;
5029} 3950}
5030 3951
5031/** 3952/**
@@ -5691,6 +4612,13 @@ again:
5691 */ 4612 */
5692 if (preempt && rq != p_rq) 4613 if (preempt && rq != p_rq)
5693 resched_task(p_rq->curr); 4614 resched_task(p_rq->curr);
4615 } else {
4616 /*
4617 * We might have set it in task_yield_fair(), but are
4618 * not going to schedule(), so don't want to skip
4619 * the next update.
4620 */
4621 rq->skip_clock_update = 0;
5694 } 4622 }
5695 4623
5696out: 4624out:
@@ -5858,7 +4786,7 @@ void sched_show_task(struct task_struct *p)
5858 free = stack_not_used(p); 4786 free = stack_not_used(p);
5859#endif 4787#endif
5860 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4788 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5861 task_pid_nr(p), task_pid_nr(p->real_parent), 4789 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
5862 (unsigned long)task_thread_info(p)->flags); 4790 (unsigned long)task_thread_info(p)->flags);
5863 4791
5864 show_stack(p, NULL); 4792 show_stack(p, NULL);
@@ -5875,7 +4803,7 @@ void show_state_filter(unsigned long state_filter)
5875 printk(KERN_INFO 4803 printk(KERN_INFO
5876 " task PC stack pid father\n"); 4804 " task PC stack pid father\n");
5877#endif 4805#endif
5878 read_lock(&tasklist_lock); 4806 rcu_read_lock();
5879 do_each_thread(g, p) { 4807 do_each_thread(g, p) {
5880 /* 4808 /*
5881 * reset the NMI-timeout, listing all files on a slow 4809 * reset the NMI-timeout, listing all files on a slow
@@ -5891,7 +4819,7 @@ void show_state_filter(unsigned long state_filter)
5891#ifdef CONFIG_SCHED_DEBUG 4819#ifdef CONFIG_SCHED_DEBUG
5892 sysrq_sched_debug_show(); 4820 sysrq_sched_debug_show();
5893#endif 4821#endif
5894 read_unlock(&tasklist_lock); 4822 rcu_read_unlock();
5895 /* 4823 /*
5896 * Only show locks if all tasks are dumped: 4824 * Only show locks if all tasks are dumped:
5897 */ 4825 */
@@ -5952,62 +4880,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5952 */ 4880 */
5953 idle->sched_class = &idle_sched_class; 4881 idle->sched_class = &idle_sched_class;
5954 ftrace_graph_init_idle_task(idle, cpu); 4882 ftrace_graph_init_idle_task(idle, cpu);
5955} 4883#if defined(CONFIG_SMP)
5956 4884 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
5957/* 4885#endif
5958 * In a system that switches off the HZ timer nohz_cpu_mask
5959 * indicates which cpus entered this state. This is used
5960 * in the rcu update to wait only for active cpus. For system
5961 * which do not switch off the HZ timer nohz_cpu_mask should
5962 * always be CPU_BITS_NONE.
5963 */
5964cpumask_var_t nohz_cpu_mask;
5965
5966/*
5967 * Increase the granularity value when there are more CPUs,
5968 * because with more CPUs the 'effective latency' as visible
5969 * to users decreases. But the relationship is not linear,
5970 * so pick a second-best guess by going with the log2 of the
5971 * number of CPUs.
5972 *
5973 * This idea comes from the SD scheduler of Con Kolivas:
5974 */
5975static int get_update_sysctl_factor(void)
5976{
5977 unsigned int cpus = min_t(int, num_online_cpus(), 8);
5978 unsigned int factor;
5979
5980 switch (sysctl_sched_tunable_scaling) {
5981 case SCHED_TUNABLESCALING_NONE:
5982 factor = 1;
5983 break;
5984 case SCHED_TUNABLESCALING_LINEAR:
5985 factor = cpus;
5986 break;
5987 case SCHED_TUNABLESCALING_LOG:
5988 default:
5989 factor = 1 + ilog2(cpus);
5990 break;
5991 }
5992
5993 return factor;
5994}
5995
5996static void update_sysctl(void)
5997{
5998 unsigned int factor = get_update_sysctl_factor();
5999
6000#define SET_SYSCTL(name) \
6001 (sysctl_##name = (factor) * normalized_sysctl_##name)
6002 SET_SYSCTL(sched_min_granularity);
6003 SET_SYSCTL(sched_latency);
6004 SET_SYSCTL(sched_wakeup_granularity);
6005#undef SET_SYSCTL
6006}
6007
6008static inline void sched_init_granularity(void)
6009{
6010 update_sysctl();
6011} 4886}
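
The sysctl-scaling block deleted here (the logic survives in the new kernel/sched/ files rather than going away) multiplies sched_min_granularity, sched_latency and sched_wakeup_granularity by a CPU-count factor; in the default logarithmic mode that factor is 1 + ilog2(min(online cpus, 8)). A small hedged sketch of the computation:

    #include <stdio.h>

    /* hedged user-space sketch of the removed get_update_sysctl_factor() */
    static unsigned int factor_log(unsigned int ncpus)
    {
            unsigned int cpus = ncpus < 8 ? ncpus : 8;
            unsigned int log2 = 0;

            while (cpus >>= 1)
                    log2++;

            return 1 + log2;
    }

    int main(void)
    {
            /* 1 cpu -> 1, 2 cpus -> 2, 4 cpus -> 3, 8 or more -> 4 */
            for (unsigned int n = 1; n <= 16; n *= 2)
                    printf("%2u cpus -> factor %u\n", n, factor_log(n));
            return 0;
    }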
6012 4887
6013#ifdef CONFIG_SMP 4888#ifdef CONFIG_SMP
@@ -6015,10 +4890,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6015{ 4890{
6016 if (p->sched_class && p->sched_class->set_cpus_allowed) 4891 if (p->sched_class && p->sched_class->set_cpus_allowed)
6017 p->sched_class->set_cpus_allowed(p, new_mask); 4892 p->sched_class->set_cpus_allowed(p, new_mask);
6018 else { 4893
6019 cpumask_copy(&p->cpus_allowed, new_mask); 4894 cpumask_copy(&p->cpus_allowed, new_mask);
6020 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 4895 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6021 }
6022} 4896}
6023 4897
6024/* 4898/*
@@ -6116,7 +4990,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6116 if (task_cpu(p) != src_cpu) 4990 if (task_cpu(p) != src_cpu)
6117 goto done; 4991 goto done;
6118 /* Affinity changed (again). */ 4992 /* Affinity changed (again). */
6119 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 4993 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
6120 goto fail; 4994 goto fail;
6121 4995
6122 /* 4996 /*
@@ -6222,6 +5096,9 @@ static void migrate_tasks(unsigned int dead_cpu)
6222 */ 5096 */
6223 rq->stop = NULL; 5097 rq->stop = NULL;
6224 5098
5099 /* Ensure any throttled groups are reachable by pick_next_task */
5100 unthrottle_offline_cfs_rqs(rq);
5101
6225 for ( ; ; ) { 5102 for ( ; ; ) {
6226 /* 5103 /*
6227 * There's this thread running, bail when that's the only 5104 * There's this thread running, bail when that's the only
@@ -6299,7 +5176,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
6299static void 5176static void
6300set_table_entry(struct ctl_table *entry, 5177set_table_entry(struct ctl_table *entry,
6301 const char *procname, void *data, int maxlen, 5178 const char *procname, void *data, int maxlen,
6302 mode_t mode, proc_handler *proc_handler) 5179 umode_t mode, proc_handler *proc_handler)
6303{ 5180{
6304 entry->procname = procname; 5181 entry->procname = procname;
6305 entry->data = data; 5182 entry->data = data;
@@ -6799,6 +5676,12 @@ out:
6799 return -ENOMEM; 5676 return -ENOMEM;
6800} 5677}
6801 5678
5679/*
5680 * By default the system creates a single root-domain with all cpus as
5681 * members (mimicking the global state we have today).
5682 */
5683struct root_domain def_root_domain;
5684
6802static void init_defrootdomain(void) 5685static void init_defrootdomain(void)
6803{ 5686{
6804 init_rootdomain(&def_root_domain); 5687 init_rootdomain(&def_root_domain);
@@ -6870,6 +5753,31 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6870} 5753}
6871 5754
6872/* 5755/*
 5756 * Keep a special pointer to the highest sched_domain that has
 5757 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
 5758 * allows us to avoid some pointer chasing in select_idle_sibling().
 5759 *
 5760 * Also keep a unique ID per domain (we use the first cpu number in
 5761 * the cpumask of the domain); this allows us to quickly tell if
 5762 * two cpus are in the same cache domain, see ttwu_share_cache().
5763 */
5764DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5765DEFINE_PER_CPU(int, sd_llc_id);
5766
5767static void update_top_cache_domain(int cpu)
5768{
5769 struct sched_domain *sd;
5770 int id = cpu;
5771
5772 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5773 if (sd)
5774 id = cpumask_first(sched_domain_span(sd));
5775
5776 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5777 per_cpu(sd_llc_id, cpu) = id;
5778}
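
update_top_cache_domain() caches, per CPU, both a pointer to the highest domain sharing a last-level cache and an integer id (the first CPU in that domain's span); ttwu_share_cache() then answers "do these two CPUs share an LLC?" by comparing two per-CPU integers instead of chasing sched_domain parents on every wakeup. A deliberately tiny sketch of the consumer side, with invented storage in place of the per-CPU variables:

    #define NR_FAKE_CPUS 8

    /* stand-in for per_cpu(sd_llc_id, cpu); filled in at domain-build time */
    static int llc_id[NR_FAKE_CPUS];

    static int share_cache(int this_cpu, int that_cpu)
    {
            return llc_id[this_cpu] == llc_id[that_cpu];
    }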
5779
5780/*
6873 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5781 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6874 * hold the hotplug lock. 5782 * hold the hotplug lock.
6875 */ 5783 */
@@ -6908,6 +5816,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6908 tmp = rq->sd; 5816 tmp = rq->sd;
6909 rcu_assign_pointer(rq->sd, sd); 5817 rcu_assign_pointer(rq->sd, sd);
6910 destroy_sched_domains(tmp, cpu); 5818 destroy_sched_domains(tmp, cpu);
5819
5820 update_top_cache_domain(cpu);
6911} 5821}
6912 5822
6913/* cpus with isolated domains */ 5823/* cpus with isolated domains */
@@ -6923,8 +5833,6 @@ static int __init isolated_cpu_setup(char *str)
6923 5833
6924__setup("isolcpus=", isolated_cpu_setup); 5834__setup("isolcpus=", isolated_cpu_setup);
6925 5835
6926#define SD_NODES_PER_DOMAIN 16
6927
6928#ifdef CONFIG_NUMA 5836#ifdef CONFIG_NUMA
6929 5837
6930/** 5838/**
@@ -7069,7 +5977,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
7069 continue; 5977 continue;
7070 5978
7071 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5979 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7072 GFP_KERNEL, cpu_to_node(i)); 5980 GFP_KERNEL, cpu_to_node(cpu));
7073 5981
7074 if (!sg) 5982 if (!sg)
7075 goto fail; 5983 goto fail;
@@ -7207,6 +6115,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7207 return; 6115 return;
7208 6116
7209 update_group_power(sd, cpu); 6117 update_group_power(sd, cpu);
6118 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
6119}
6120
6121int __weak arch_sd_sibling_asym_packing(void)
6122{
6123 return 0*SD_ASYM_PACKING;
7210} 6124}
7211 6125
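
The __weak hook above deliberately returns 0*SD_ASYM_PACKING, i.e. asymmetric SMT packing stays off unless an architecture overrides it. A hedged sketch of what such an override could look like; the per-architecture code is not part of this patch and the body below is illustrative only.

/* Illustrative arch override: opt the platform into SMT asym packing. */
int arch_sd_sibling_asym_packing(void)
{
	/*
	 * Returning the flag makes sibling domains prefer loading
	 * lower-numbered SMT threads first.
	 */
	return SD_ASYM_PACKING;
}
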
7212/* 6126/*
@@ -7761,54 +6675,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7761} 6675}
7762 6676
7763#ifdef CONFIG_SCHED_MC 6677#ifdef CONFIG_SCHED_MC
7764static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 6678static ssize_t sched_mc_power_savings_show(struct device *dev,
7765 struct sysdev_class_attribute *attr, 6679 struct device_attribute *attr,
7766 char *page) 6680 char *buf)
7767{ 6681{
7768 return sprintf(page, "%u\n", sched_mc_power_savings); 6682 return sprintf(buf, "%u\n", sched_mc_power_savings);
7769} 6683}
7770static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 6684static ssize_t sched_mc_power_savings_store(struct device *dev,
7771 struct sysdev_class_attribute *attr, 6685 struct device_attribute *attr,
7772 const char *buf, size_t count) 6686 const char *buf, size_t count)
7773{ 6687{
7774 return sched_power_savings_store(buf, count, 0); 6688 return sched_power_savings_store(buf, count, 0);
7775} 6689}
7776static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 6690static DEVICE_ATTR(sched_mc_power_savings, 0644,
7777 sched_mc_power_savings_show, 6691 sched_mc_power_savings_show,
7778 sched_mc_power_savings_store); 6692 sched_mc_power_savings_store);
7779#endif 6693#endif
7780 6694
7781#ifdef CONFIG_SCHED_SMT 6695#ifdef CONFIG_SCHED_SMT
7782static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 6696static ssize_t sched_smt_power_savings_show(struct device *dev,
7783 struct sysdev_class_attribute *attr, 6697 struct device_attribute *attr,
7784 char *page) 6698 char *buf)
7785{ 6699{
7786 return sprintf(page, "%u\n", sched_smt_power_savings); 6700 return sprintf(buf, "%u\n", sched_smt_power_savings);
7787} 6701}
7788static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 6702static ssize_t sched_smt_power_savings_store(struct device *dev,
7789 struct sysdev_class_attribute *attr, 6703 struct device_attribute *attr,
7790 const char *buf, size_t count) 6704 const char *buf, size_t count)
7791{ 6705{
7792 return sched_power_savings_store(buf, count, 1); 6706 return sched_power_savings_store(buf, count, 1);
7793} 6707}
7794static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 6708static DEVICE_ATTR(sched_smt_power_savings, 0644,
7795 sched_smt_power_savings_show, 6709 sched_smt_power_savings_show,
7796 sched_smt_power_savings_store); 6710 sched_smt_power_savings_store);
7797#endif 6711#endif
7798 6712
7799int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 6713int __init sched_create_sysfs_power_savings_entries(struct device *dev)
7800{ 6714{
7801 int err = 0; 6715 int err = 0;
7802 6716
7803#ifdef CONFIG_SCHED_SMT 6717#ifdef CONFIG_SCHED_SMT
7804 if (smt_capable()) 6718 if (smt_capable())
7805 err = sysfs_create_file(&cls->kset.kobj, 6719 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
7806 &attr_sched_smt_power_savings.attr);
7807#endif 6720#endif
7808#ifdef CONFIG_SCHED_MC 6721#ifdef CONFIG_SCHED_MC
7809 if (!err && mc_capable()) 6722 if (!err && mc_capable())
7810 err = sysfs_create_file(&cls->kset.kobj, 6723 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
7811 &attr_sched_mc_power_savings.attr);
7812#endif 6724#endif
7813 return err; 6725 return err;
7814} 6726}
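
The hunk above converts the power-savings knobs from sysdev_class attributes to regular struct device attributes. For reference, a minimal sketch of the same DEVICE_ATTR / device_create_file() pattern outside the scheduler; the attribute name demo_knob and the demo_register() wrapper are made up, and the struct device is assumed to come from elsewhere.

#include <linux/device.h>
#include <linux/kernel.h>

static unsigned int demo_knob;

static ssize_t demo_knob_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", demo_knob);
}

static ssize_t demo_knob_store(struct device *dev,
			       struct device_attribute *attr,
			       const char *buf, size_t count)
{
	if (kstrtouint(buf, 10, &demo_knob))
		return -EINVAL;
	return count;
}

static DEVICE_ATTR(demo_knob, 0644, demo_knob_show, demo_knob_store);

/* Registration mirrors sched_create_sysfs_power_savings_entries(). */
static int demo_register(struct device *dev)
{
	return device_create_file(dev, &dev_attr_demo_knob);
}
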
@@ -7844,29 +6756,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7844 } 6756 }
7845} 6757}
7846 6758
7847static int update_runtime(struct notifier_block *nfb,
7848 unsigned long action, void *hcpu)
7849{
7850 int cpu = (int)(long)hcpu;
7851
7852 switch (action) {
7853 case CPU_DOWN_PREPARE:
7854 case CPU_DOWN_PREPARE_FROZEN:
7855 disable_runtime(cpu_rq(cpu));
7856 return NOTIFY_OK;
7857
7858 case CPU_DOWN_FAILED:
7859 case CPU_DOWN_FAILED_FROZEN:
7860 case CPU_ONLINE:
7861 case CPU_ONLINE_FROZEN:
7862 enable_runtime(cpu_rq(cpu));
7863 return NOTIFY_OK;
7864
7865 default:
7866 return NOTIFY_DONE;
7867 }
7868}
7869
7870void __init sched_init_smp(void) 6759void __init sched_init_smp(void)
7871{ 6760{
7872 cpumask_var_t non_isolated_cpus; 6761 cpumask_var_t non_isolated_cpus;
@@ -7915,103 +6804,11 @@ int in_sched_functions(unsigned long addr)
7915 && addr < (unsigned long)__sched_text_end); 6804 && addr < (unsigned long)__sched_text_end);
7916} 6805}
7917 6806
7918static void init_cfs_rq(struct cfs_rq *cfs_rq) 6807#ifdef CONFIG_CGROUP_SCHED
7919{ 6808struct task_group root_task_group;
7920 cfs_rq->tasks_timeline = RB_ROOT;
7921 INIT_LIST_HEAD(&cfs_rq->tasks);
7922 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7923#ifndef CONFIG_64BIT
7924 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
7925#endif
7926}
7927
7928static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7929{
7930 struct rt_prio_array *array;
7931 int i;
7932
7933 array = &rt_rq->active;
7934 for (i = 0; i < MAX_RT_PRIO; i++) {
7935 INIT_LIST_HEAD(array->queue + i);
7936 __clear_bit(i, array->bitmap);
7937 }
7938 /* delimiter for bitsearch: */
7939 __set_bit(MAX_RT_PRIO, array->bitmap);
7940
7941#if defined CONFIG_SMP
7942 rt_rq->highest_prio.curr = MAX_RT_PRIO;
7943 rt_rq->highest_prio.next = MAX_RT_PRIO;
7944 rt_rq->rt_nr_migratory = 0;
7945 rt_rq->overloaded = 0;
7946 plist_head_init(&rt_rq->pushable_tasks);
7947#endif
7948
7949 rt_rq->rt_time = 0;
7950 rt_rq->rt_throttled = 0;
7951 rt_rq->rt_runtime = 0;
7952 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
7953}
7954
7955#ifdef CONFIG_FAIR_GROUP_SCHED
7956static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7957 struct sched_entity *se, int cpu,
7958 struct sched_entity *parent)
7959{
7960 struct rq *rq = cpu_rq(cpu);
7961
7962 cfs_rq->tg = tg;
7963 cfs_rq->rq = rq;
7964#ifdef CONFIG_SMP
7965 /* allow initial update_cfs_load() to truncate */
7966 cfs_rq->load_stamp = 1;
7967#endif 6809#endif
7968 6810
7969 tg->cfs_rq[cpu] = cfs_rq; 6811DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
7970 tg->se[cpu] = se;
7971
7972 /* se could be NULL for root_task_group */
7973 if (!se)
7974 return;
7975
7976 if (!parent)
7977 se->cfs_rq = &rq->cfs;
7978 else
7979 se->cfs_rq = parent->my_q;
7980
7981 se->my_q = cfs_rq;
7982 update_load_set(&se->load, 0);
7983 se->parent = parent;
7984}
7985#endif
7986
7987#ifdef CONFIG_RT_GROUP_SCHED
7988static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7989 struct sched_rt_entity *rt_se, int cpu,
7990 struct sched_rt_entity *parent)
7991{
7992 struct rq *rq = cpu_rq(cpu);
7993
7994 rt_rq->highest_prio.curr = MAX_RT_PRIO;
7995 rt_rq->rt_nr_boosted = 0;
7996 rt_rq->rq = rq;
7997 rt_rq->tg = tg;
7998
7999 tg->rt_rq[cpu] = rt_rq;
8000 tg->rt_se[cpu] = rt_se;
8001
8002 if (!rt_se)
8003 return;
8004
8005 if (!parent)
8006 rt_se->rt_rq = &rq->rt;
8007 else
8008 rt_se->rt_rq = parent->my_q;
8009
8010 rt_se->my_q = rt_rq;
8011 rt_se->parent = parent;
8012 INIT_LIST_HEAD(&rt_se->run_list);
8013}
8014#endif
8015 6812
8016void __init sched_init(void) 6813void __init sched_init(void)
8017{ 6814{
@@ -8069,9 +6866,17 @@ void __init sched_init(void)
8069#ifdef CONFIG_CGROUP_SCHED 6866#ifdef CONFIG_CGROUP_SCHED
8070 list_add(&root_task_group.list, &task_groups); 6867 list_add(&root_task_group.list, &task_groups);
8071 INIT_LIST_HEAD(&root_task_group.children); 6868 INIT_LIST_HEAD(&root_task_group.children);
6869 INIT_LIST_HEAD(&root_task_group.siblings);
8072 autogroup_init(&init_task); 6870 autogroup_init(&init_task);
6871
8073#endif /* CONFIG_CGROUP_SCHED */ 6872#endif /* CONFIG_CGROUP_SCHED */
8074 6873
6874#ifdef CONFIG_CGROUP_CPUACCT
6875 root_cpuacct.cpustat = &kernel_cpustat;
6876 root_cpuacct.cpuusage = alloc_percpu(u64);
6877 /* Too early, not expected to fail */
6878 BUG_ON(!root_cpuacct.cpuusage);
6879#endif
8075 for_each_possible_cpu(i) { 6880 for_each_possible_cpu(i) {
8076 struct rq *rq; 6881 struct rq *rq;
8077 6882
@@ -8083,7 +6888,7 @@ void __init sched_init(void)
8083 init_cfs_rq(&rq->cfs); 6888 init_cfs_rq(&rq->cfs);
8084 init_rt_rq(&rq->rt, rq); 6889 init_rt_rq(&rq->rt, rq);
8085#ifdef CONFIG_FAIR_GROUP_SCHED 6890#ifdef CONFIG_FAIR_GROUP_SCHED
8086 root_task_group.shares = root_task_group_load; 6891 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8087 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6892 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8088 /* 6893 /*
8089 * How much cpu bandwidth does root_task_group get? 6894 * How much cpu bandwidth does root_task_group get?
@@ -8104,6 +6909,7 @@ void __init sched_init(void)
8104 * We achieve this by letting root_task_group's tasks sit 6909 * We achieve this by letting root_task_group's tasks sit
8105 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 6910 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8106 */ 6911 */
6912 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
8107 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 6913 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8108#endif /* CONFIG_FAIR_GROUP_SCHED */ 6914#endif /* CONFIG_FAIR_GROUP_SCHED */
8109 6915
@@ -8132,8 +6938,7 @@ void __init sched_init(void)
8132 rq->avg_idle = 2*sysctl_sched_migration_cost; 6938 rq->avg_idle = 2*sysctl_sched_migration_cost;
8133 rq_attach_root(rq, &def_root_domain); 6939 rq_attach_root(rq, &def_root_domain);
8134#ifdef CONFIG_NO_HZ 6940#ifdef CONFIG_NO_HZ
8135 rq->nohz_balance_kick = 0; 6941 rq->nohz_flags = 0;
8136 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
8137#endif 6942#endif
8138#endif 6943#endif
8139 init_rq_hrtick(rq); 6944 init_rq_hrtick(rq);
@@ -8146,10 +6951,6 @@ void __init sched_init(void)
8146 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6951 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8147#endif 6952#endif
8148 6953
8149#ifdef CONFIG_SMP
8150 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8151#endif
8152
8153#ifdef CONFIG_RT_MUTEXES 6954#ifdef CONFIG_RT_MUTEXES
8154 plist_head_init(&init_task.pi_waiters); 6955 plist_head_init(&init_task.pi_waiters);
8155#endif 6956#endif
@@ -8175,21 +6976,13 @@ void __init sched_init(void)
8175 */ 6976 */
8176 current->sched_class = &fair_sched_class; 6977 current->sched_class = &fair_sched_class;
8177 6978
8178 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8179 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8180#ifdef CONFIG_SMP 6979#ifdef CONFIG_SMP
8181 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6980 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8182#ifdef CONFIG_NO_HZ
8183 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8184 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8185 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8186 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8187 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8188#endif
8189 /* May be allocated at isolcpus cmdline parse time */ 6981 /* May be allocated at isolcpus cmdline parse time */
8190 if (cpu_isolated_map == NULL) 6982 if (cpu_isolated_map == NULL)
8191 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6983 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8192#endif /* SMP */ 6984#endif
6985 init_sched_fair_class();
8193 6986
8194 scheduler_running = 1; 6987 scheduler_running = 1;
8195} 6988}
@@ -8206,6 +6999,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
8206{ 6999{
8207 static unsigned long prev_jiffy; /* ratelimiting */ 7000 static unsigned long prev_jiffy; /* ratelimiting */
8208 7001
7002 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
8209 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 7003 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
8210 system_state != SYSTEM_RUNNING || oops_in_progress) 7004 system_state != SYSTEM_RUNNING || oops_in_progress)
8211 return; 7005 return;
@@ -8340,165 +7134,10 @@ void set_curr_task(int cpu, struct task_struct *p)
8340 7134
8341#endif 7135#endif
8342 7136
8343#ifdef CONFIG_FAIR_GROUP_SCHED
8344static void free_fair_sched_group(struct task_group *tg)
8345{
8346 int i;
8347
8348 for_each_possible_cpu(i) {
8349 if (tg->cfs_rq)
8350 kfree(tg->cfs_rq[i]);
8351 if (tg->se)
8352 kfree(tg->se[i]);
8353 }
8354
8355 kfree(tg->cfs_rq);
8356 kfree(tg->se);
8357}
8358
8359static
8360int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8361{
8362 struct cfs_rq *cfs_rq;
8363 struct sched_entity *se;
8364 int i;
8365
8366 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8367 if (!tg->cfs_rq)
8368 goto err;
8369 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8370 if (!tg->se)
8371 goto err;
8372
8373 tg->shares = NICE_0_LOAD;
8374
8375 for_each_possible_cpu(i) {
8376 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8377 GFP_KERNEL, cpu_to_node(i));
8378 if (!cfs_rq)
8379 goto err;
8380
8381 se = kzalloc_node(sizeof(struct sched_entity),
8382 GFP_KERNEL, cpu_to_node(i));
8383 if (!se)
8384 goto err_free_rq;
8385
8386 init_cfs_rq(cfs_rq);
8387 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8388 }
8389
8390 return 1;
8391
8392err_free_rq:
8393 kfree(cfs_rq);
8394err:
8395 return 0;
8396}
8397
8398static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8399{
8400 struct rq *rq = cpu_rq(cpu);
8401 unsigned long flags;
8402
8403 /*
8404 * Only empty task groups can be destroyed; so we can speculatively
8405 * check on_list without danger of it being re-added.
8406 */
8407 if (!tg->cfs_rq[cpu]->on_list)
8408 return;
8409
8410 raw_spin_lock_irqsave(&rq->lock, flags);
8411 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8412 raw_spin_unlock_irqrestore(&rq->lock, flags);
8413}
8414#else /* !CONFIG_FAIR_GROUP_SCHED */
8415static inline void free_fair_sched_group(struct task_group *tg)
8416{
8417}
8418
8419static inline
8420int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8421{
8422 return 1;
8423}
8424
8425static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8426{
8427}
8428#endif /* CONFIG_FAIR_GROUP_SCHED */
8429
8430#ifdef CONFIG_RT_GROUP_SCHED
8431static void free_rt_sched_group(struct task_group *tg)
8432{
8433 int i;
8434
8435 if (tg->rt_se)
8436 destroy_rt_bandwidth(&tg->rt_bandwidth);
8437
8438 for_each_possible_cpu(i) {
8439 if (tg->rt_rq)
8440 kfree(tg->rt_rq[i]);
8441 if (tg->rt_se)
8442 kfree(tg->rt_se[i]);
8443 }
8444
8445 kfree(tg->rt_rq);
8446 kfree(tg->rt_se);
8447}
8448
8449static
8450int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8451{
8452 struct rt_rq *rt_rq;
8453 struct sched_rt_entity *rt_se;
8454 int i;
8455
8456 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8457 if (!tg->rt_rq)
8458 goto err;
8459 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8460 if (!tg->rt_se)
8461 goto err;
8462
8463 init_rt_bandwidth(&tg->rt_bandwidth,
8464 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8465
8466 for_each_possible_cpu(i) {
8467 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8468 GFP_KERNEL, cpu_to_node(i));
8469 if (!rt_rq)
8470 goto err;
8471
8472 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8473 GFP_KERNEL, cpu_to_node(i));
8474 if (!rt_se)
8475 goto err_free_rq;
8476
8477 init_rt_rq(rt_rq, cpu_rq(i));
8478 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8479 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8480 }
8481
8482 return 1;
8483
8484err_free_rq:
8485 kfree(rt_rq);
8486err:
8487 return 0;
8488}
8489#else /* !CONFIG_RT_GROUP_SCHED */
8490static inline void free_rt_sched_group(struct task_group *tg)
8491{
8492}
8493
8494static inline
8495int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8496{
8497 return 1;
8498}
8499#endif /* CONFIG_RT_GROUP_SCHED */
8500
8501#ifdef CONFIG_CGROUP_SCHED 7137#ifdef CONFIG_CGROUP_SCHED
7138/* task_group_lock serializes the addition/removal of task groups */
7139static DEFINE_SPINLOCK(task_group_lock);
7140
8502static void free_sched_group(struct task_group *tg) 7141static void free_sched_group(struct task_group *tg)
8503{ 7142{
8504 free_fair_sched_group(tg); 7143 free_fair_sched_group(tg);
@@ -8603,47 +7242,13 @@ void sched_move_task(struct task_struct *tsk)
8603} 7242}
8604#endif /* CONFIG_CGROUP_SCHED */ 7243#endif /* CONFIG_CGROUP_SCHED */
8605 7244
8606#ifdef CONFIG_FAIR_GROUP_SCHED 7245#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
8607static DEFINE_MUTEX(shares_mutex); 7246static unsigned long to_ratio(u64 period, u64 runtime)
8608
8609int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8610{ 7247{
8611 int i; 7248 if (runtime == RUNTIME_INF)
8612 unsigned long flags; 7249 return 1ULL << 20;
8613
8614 /*
8615 * We can't change the weight of the root cgroup.
8616 */
8617 if (!tg->se[0])
8618 return -EINVAL;
8619
8620 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8621
8622 mutex_lock(&shares_mutex);
8623 if (tg->shares == shares)
8624 goto done;
8625
8626 tg->shares = shares;
8627 for_each_possible_cpu(i) {
8628 struct rq *rq = cpu_rq(i);
8629 struct sched_entity *se;
8630
8631 se = tg->se[i];
8632 /* Propagate contribution to hierarchy */
8633 raw_spin_lock_irqsave(&rq->lock, flags);
8634 for_each_sched_entity(se)
8635 update_cfs_shares(group_cfs_rq(se));
8636 raw_spin_unlock_irqrestore(&rq->lock, flags);
8637 }
8638
8639done:
8640 mutex_unlock(&shares_mutex);
8641 return 0;
8642}
8643 7250
8644unsigned long sched_group_shares(struct task_group *tg) 7251 return div64_u64(runtime << 20, period);
8645{
8646 return tg->shares;
8647} 7252}
8648#endif 7253#endif
8649 7254
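
The relocated to_ratio() expresses runtime/period as a fixed-point fraction scaled by 2^20, with RUNTIME_INF mapped to exactly 1.0 (1 << 20). A small user-space model with worked numbers, not kernel code:

#include <stdint.h>
#include <stdio.h>

#define RATIO_SHIFT 20		/* same 2^20 scale factor as to_ratio() */

static uint64_t ratio(uint64_t period_us, uint64_t runtime_us)
{
	return (runtime_us << RATIO_SHIFT) / period_us;
}

int main(void)
{
	/* 950ms of runtime every 1s -> 0.95 of a CPU, i.e. ~0.95 * 2^20 */
	printf("%llu (a full CPU would be %u)\n",
	       (unsigned long long)ratio(1000000, 950000), 1U << RATIO_SHIFT);
	return 0;
}
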
@@ -8653,21 +7258,13 @@ unsigned long sched_group_shares(struct task_group *tg)
8653 */ 7258 */
8654static DEFINE_MUTEX(rt_constraints_mutex); 7259static DEFINE_MUTEX(rt_constraints_mutex);
8655 7260
8656static unsigned long to_ratio(u64 period, u64 runtime)
8657{
8658 if (runtime == RUNTIME_INF)
8659 return 1ULL << 20;
8660
8661 return div64_u64(runtime << 20, period);
8662}
8663
8664/* Must be called with tasklist_lock held */ 7261/* Must be called with tasklist_lock held */
8665static inline int tg_has_rt_tasks(struct task_group *tg) 7262static inline int tg_has_rt_tasks(struct task_group *tg)
8666{ 7263{
8667 struct task_struct *g, *p; 7264 struct task_struct *g, *p;
8668 7265
8669 do_each_thread(g, p) { 7266 do_each_thread(g, p) {
8670 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 7267 if (rt_task(p) && task_rq(p)->rt.tg == tg)
8671 return 1; 7268 return 1;
8672 } while_each_thread(g, p); 7269 } while_each_thread(g, p);
8673 7270
@@ -8680,7 +7277,7 @@ struct rt_schedulable_data {
8680 u64 rt_runtime; 7277 u64 rt_runtime;
8681}; 7278};
8682 7279
8683static int tg_schedulable(struct task_group *tg, void *data) 7280static int tg_rt_schedulable(struct task_group *tg, void *data)
8684{ 7281{
8685 struct rt_schedulable_data *d = data; 7282 struct rt_schedulable_data *d = data;
8686 struct task_group *child; 7283 struct task_group *child;
@@ -8738,16 +7335,22 @@ static int tg_schedulable(struct task_group *tg, void *data)
8738 7335
8739static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 7336static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8740{ 7337{
7338 int ret;
7339
8741 struct rt_schedulable_data data = { 7340 struct rt_schedulable_data data = {
8742 .tg = tg, 7341 .tg = tg,
8743 .rt_period = period, 7342 .rt_period = period,
8744 .rt_runtime = runtime, 7343 .rt_runtime = runtime,
8745 }; 7344 };
8746 7345
8747 return walk_tg_tree(tg_schedulable, tg_nop, &data); 7346 rcu_read_lock();
7347 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7348 rcu_read_unlock();
7349
7350 return ret;
8748} 7351}
8749 7352
8750static int tg_set_bandwidth(struct task_group *tg, 7353static int tg_set_rt_bandwidth(struct task_group *tg,
8751 u64 rt_period, u64 rt_runtime) 7354 u64 rt_period, u64 rt_runtime)
8752{ 7355{
8753 int i, err = 0; 7356 int i, err = 0;
@@ -8786,7 +7389,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8786 if (rt_runtime_us < 0) 7389 if (rt_runtime_us < 0)
8787 rt_runtime = RUNTIME_INF; 7390 rt_runtime = RUNTIME_INF;
8788 7391
8789 return tg_set_bandwidth(tg, rt_period, rt_runtime); 7392 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8790} 7393}
8791 7394
8792long sched_group_rt_runtime(struct task_group *tg) 7395long sched_group_rt_runtime(struct task_group *tg)
@@ -8811,7 +7414,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8811 if (rt_period == 0) 7414 if (rt_period == 0)
8812 return -EINVAL; 7415 return -EINVAL;
8813 7416
8814 return tg_set_bandwidth(tg, rt_period, rt_runtime); 7417 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8815} 7418}
8816 7419
8817long sched_group_rt_period(struct task_group *tg) 7420long sched_group_rt_period(struct task_group *tg)
@@ -8953,24 +7556,31 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8953 sched_destroy_group(tg); 7556 sched_destroy_group(tg);
8954} 7557}
8955 7558
8956static int 7559static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8957cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 7560 struct cgroup_taskset *tset)
8958{ 7561{
7562 struct task_struct *task;
7563
7564 cgroup_taskset_for_each(task, cgrp, tset) {
8959#ifdef CONFIG_RT_GROUP_SCHED 7565#ifdef CONFIG_RT_GROUP_SCHED
8960 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 7566 if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
8961 return -EINVAL; 7567 return -EINVAL;
8962#else 7568#else
8963 /* We don't support RT-tasks being in separate groups */ 7569 /* We don't support RT-tasks being in separate groups */
8964 if (tsk->sched_class != &fair_sched_class) 7570 if (task->sched_class != &fair_sched_class)
8965 return -EINVAL; 7571 return -EINVAL;
8966#endif 7572#endif
7573 }
8967 return 0; 7574 return 0;
8968} 7575}
8969 7576
8970static void 7577static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8971cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 7578 struct cgroup_taskset *tset)
8972{ 7579{
8973 sched_move_task(tsk); 7580 struct task_struct *task;
7581
7582 cgroup_taskset_for_each(task, cgrp, tset)
7583 sched_move_task(task);
8974} 7584}
8975 7585
8976static void 7586static void
@@ -9001,6 +7611,237 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9001 7611
9002 return (u64) scale_load_down(tg->shares); 7612 return (u64) scale_load_down(tg->shares);
9003} 7613}
7614
7615#ifdef CONFIG_CFS_BANDWIDTH
7616static DEFINE_MUTEX(cfs_constraints_mutex);
7617
7618const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7619const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
7620
7621static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7622
7623static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7624{
7625 int i, ret = 0, runtime_enabled, runtime_was_enabled;
7626 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7627
7628 if (tg == &root_task_group)
7629 return -EINVAL;
7630
7631 /*
 7632 * Ensure we have at least some amount of bandwidth every period. This is
7633 * to prevent reaching a state of large arrears when throttled via
7634 * entity_tick() resulting in prolonged exit starvation.
7635 */
7636 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7637 return -EINVAL;
7638
7639 /*
 7640 * Likewise, bound things on the other side by preventing insane quota
7641 * periods. This also allows us to normalize in computing quota
7642 * feasibility.
7643 */
7644 if (period > max_cfs_quota_period)
7645 return -EINVAL;
7646
7647 mutex_lock(&cfs_constraints_mutex);
7648 ret = __cfs_schedulable(tg, period, quota);
7649 if (ret)
7650 goto out_unlock;
7651
7652 runtime_enabled = quota != RUNTIME_INF;
7653 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7654 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
7655 raw_spin_lock_irq(&cfs_b->lock);
7656 cfs_b->period = ns_to_ktime(period);
7657 cfs_b->quota = quota;
7658
7659 __refill_cfs_bandwidth_runtime(cfs_b);
7660 /* restart the period timer (if active) to handle new period expiry */
7661 if (runtime_enabled && cfs_b->timer_active) {
7662 /* force a reprogram */
7663 cfs_b->timer_active = 0;
7664 __start_cfs_bandwidth(cfs_b);
7665 }
7666 raw_spin_unlock_irq(&cfs_b->lock);
7667
7668 for_each_possible_cpu(i) {
7669 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7670 struct rq *rq = cfs_rq->rq;
7671
7672 raw_spin_lock_irq(&rq->lock);
7673 cfs_rq->runtime_enabled = runtime_enabled;
7674 cfs_rq->runtime_remaining = 0;
7675
7676 if (cfs_rq->throttled)
7677 unthrottle_cfs_rq(cfs_rq);
7678 raw_spin_unlock_irq(&rq->lock);
7679 }
7680out_unlock:
7681 mutex_unlock(&cfs_constraints_mutex);
7682
7683 return ret;
7684}
7685
7686int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7687{
7688 u64 quota, period;
7689
7690 period = ktime_to_ns(tg->cfs_bandwidth.period);
7691 if (cfs_quota_us < 0)
7692 quota = RUNTIME_INF;
7693 else
7694 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7695
7696 return tg_set_cfs_bandwidth(tg, period, quota);
7697}
7698
7699long tg_get_cfs_quota(struct task_group *tg)
7700{
7701 u64 quota_us;
7702
7703 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7704 return -1;
7705
7706 quota_us = tg->cfs_bandwidth.quota;
7707 do_div(quota_us, NSEC_PER_USEC);
7708
7709 return quota_us;
7710}
7711
7712int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7713{
7714 u64 quota, period;
7715
7716 period = (u64)cfs_period_us * NSEC_PER_USEC;
7717 quota = tg->cfs_bandwidth.quota;
7718
7719 return tg_set_cfs_bandwidth(tg, period, quota);
7720}
7721
7722long tg_get_cfs_period(struct task_group *tg)
7723{
7724 u64 cfs_period_us;
7725
7726 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7727 do_div(cfs_period_us, NSEC_PER_USEC);
7728
7729 return cfs_period_us;
7730}
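
These setters accept microseconds from the cgroup interface and store nanoseconds internally; a negative quota means "no limit", and a quota larger than the period simply means more than one CPU's worth of bandwidth. A user-space sketch of the unit handling with illustrative numbers only:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL

int main(void)
{
	long cfs_quota_us  = 200000;	/* 200ms of runtime ... */
	long cfs_period_us = 100000;	/* ... every 100ms      */
	uint64_t quota_ns  = (uint64_t)cfs_quota_us * NSEC_PER_USEC;
	uint64_t period_ns = (uint64_t)cfs_period_us * NSEC_PER_USEC;

	/* quota > period means more than one CPU's worth: here 2.0 CPUs */
	printf("quota=%lluns period=%lluns -> %.2f CPUs\n",
	       (unsigned long long)quota_ns, (unsigned long long)period_ns,
	       (double)quota_ns / period_ns);
	return 0;
}
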
7731
7732static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
7733{
7734 return tg_get_cfs_quota(cgroup_tg(cgrp));
7735}
7736
7737static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
7738 s64 cfs_quota_us)
7739{
7740 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
7741}
7742
7743static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
7744{
7745 return tg_get_cfs_period(cgroup_tg(cgrp));
7746}
7747
7748static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7749 u64 cfs_period_us)
7750{
7751 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
7752}
7753
7754struct cfs_schedulable_data {
7755 struct task_group *tg;
7756 u64 period, quota;
7757};
7758
7759/*
7760 * normalize group quota/period to be quota/max_period
7761 * note: units are usecs
7762 */
7763static u64 normalize_cfs_quota(struct task_group *tg,
7764 struct cfs_schedulable_data *d)
7765{
7766 u64 quota, period;
7767
7768 if (tg == d->tg) {
7769 period = d->period;
7770 quota = d->quota;
7771 } else {
7772 period = tg_get_cfs_period(tg);
7773 quota = tg_get_cfs_quota(tg);
7774 }
7775
7776 /* note: these should typically be equivalent */
7777 if (quota == RUNTIME_INF || quota == -1)
7778 return RUNTIME_INF;
7779
7780 return to_ratio(period, quota);
7781}
7782
7783static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7784{
7785 struct cfs_schedulable_data *d = data;
7786 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7787 s64 quota = 0, parent_quota = -1;
7788
7789 if (!tg->parent) {
7790 quota = RUNTIME_INF;
7791 } else {
7792 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7793
7794 quota = normalize_cfs_quota(tg, d);
7795 parent_quota = parent_b->hierarchal_quota;
7796
7797 /*
7798 * ensure max(child_quota) <= parent_quota, inherit when no
7799 * limit is set
7800 */
7801 if (quota == RUNTIME_INF)
7802 quota = parent_quota;
7803 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7804 return -EINVAL;
7805 }
7806 cfs_b->hierarchal_quota = quota;
7807
7808 return 0;
7809}
7810
7811static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7812{
7813 int ret;
7814 struct cfs_schedulable_data data = {
7815 .tg = tg,
7816 .period = period,
7817 .quota = quota,
7818 };
7819
7820 if (quota != RUNTIME_INF) {
7821 do_div(data.period, NSEC_PER_USEC);
7822 do_div(data.quota, NSEC_PER_USEC);
7823 }
7824
7825 rcu_read_lock();
7826 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7827 rcu_read_unlock();
7828
7829 return ret;
7830}
7831
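
tg_cfs_schedulable_down() walks the group tree top-down and rejects any child whose normalized quota/period ratio exceeds its parent's; a child with no limit simply inherits the parent's ratio. A user-space model of that check with made-up numbers (the INF handling is simplified relative to the kernel code):

#include <stdint.h>
#include <stdio.h>

#define RATIO_SHIFT 20
#define RATIO_INF   (~0ULL)	/* stand-in for RUNTIME_INF */

static uint64_t ratio(uint64_t period_us, uint64_t quota_us)
{
	return quota_us == RATIO_INF ? RATIO_INF
				     : (quota_us << RATIO_SHIFT) / period_us;
}

int main(void)
{
	/* parent: 150ms/100ms (1.5 CPUs); child asks for 200ms/100ms (2.0) */
	uint64_t parent = ratio(100000, 150000);
	uint64_t child  = ratio(100000, 200000);

	if (child == RATIO_INF)
		child = parent;		/* inherit when no limit is set */

	printf("child %s\n",
	       (parent != RATIO_INF && child > parent) ? "rejected (-EINVAL)"
						       : "accepted");
	return 0;
}
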
7832static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7833 struct cgroup_map_cb *cb)
7834{
7835 struct task_group *tg = cgroup_tg(cgrp);
7836 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7837
7838 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
7839 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
7840 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
7841
7842 return 0;
7843}
7844#endif /* CONFIG_CFS_BANDWIDTH */
9004#endif /* CONFIG_FAIR_GROUP_SCHED */ 7845#endif /* CONFIG_FAIR_GROUP_SCHED */
9005 7846
9006#ifdef CONFIG_RT_GROUP_SCHED 7847#ifdef CONFIG_RT_GROUP_SCHED
@@ -9035,6 +7876,22 @@ static struct cftype cpu_files[] = {
9035 .write_u64 = cpu_shares_write_u64, 7876 .write_u64 = cpu_shares_write_u64,
9036 }, 7877 },
9037#endif 7878#endif
7879#ifdef CONFIG_CFS_BANDWIDTH
7880 {
7881 .name = "cfs_quota_us",
7882 .read_s64 = cpu_cfs_quota_read_s64,
7883 .write_s64 = cpu_cfs_quota_write_s64,
7884 },
7885 {
7886 .name = "cfs_period_us",
7887 .read_u64 = cpu_cfs_period_read_u64,
7888 .write_u64 = cpu_cfs_period_write_u64,
7889 },
7890 {
7891 .name = "stat",
7892 .read_map = cpu_stats_show,
7893 },
7894#endif
9038#ifdef CONFIG_RT_GROUP_SCHED 7895#ifdef CONFIG_RT_GROUP_SCHED
9039 { 7896 {
9040 .name = "rt_runtime_us", 7897 .name = "rt_runtime_us",
@@ -9058,8 +7915,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9058 .name = "cpu", 7915 .name = "cpu",
9059 .create = cpu_cgroup_create, 7916 .create = cpu_cgroup_create,
9060 .destroy = cpu_cgroup_destroy, 7917 .destroy = cpu_cgroup_destroy,
9061 .can_attach_task = cpu_cgroup_can_attach_task, 7918 .can_attach = cpu_cgroup_can_attach,
9062 .attach_task = cpu_cgroup_attach_task, 7919 .attach = cpu_cgroup_attach,
9063 .exit = cpu_cgroup_exit, 7920 .exit = cpu_cgroup_exit,
9064 .populate = cpu_cgroup_populate, 7921 .populate = cpu_cgroup_populate,
9065 .subsys_id = cpu_cgroup_subsys_id, 7922 .subsys_id = cpu_cgroup_subsys_id,
@@ -9077,38 +7934,16 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9077 * (balbir@in.ibm.com). 7934 * (balbir@in.ibm.com).
9078 */ 7935 */
9079 7936
9080/* track cpu usage of a group of tasks and its child groups */
9081struct cpuacct {
9082 struct cgroup_subsys_state css;
9083 /* cpuusage holds pointer to a u64-type object on every cpu */
9084 u64 __percpu *cpuusage;
9085 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9086 struct cpuacct *parent;
9087};
9088
9089struct cgroup_subsys cpuacct_subsys;
9090
9091/* return cpu accounting group corresponding to this container */
9092static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9093{
9094 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9095 struct cpuacct, css);
9096}
9097
9098/* return cpu accounting group to which this task belongs */
9099static inline struct cpuacct *task_ca(struct task_struct *tsk)
9100{
9101 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9102 struct cpuacct, css);
9103}
9104
9105/* create a new cpu accounting group */ 7937/* create a new cpu accounting group */
9106static struct cgroup_subsys_state *cpuacct_create( 7938static struct cgroup_subsys_state *cpuacct_create(
9107 struct cgroup_subsys *ss, struct cgroup *cgrp) 7939 struct cgroup_subsys *ss, struct cgroup *cgrp)
9108{ 7940{
9109 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 7941 struct cpuacct *ca;
9110 int i; 7942
7943 if (!cgrp->parent)
7944 return &root_cpuacct.css;
9111 7945
7946 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9112 if (!ca) 7947 if (!ca)
9113 goto out; 7948 goto out;
9114 7949
@@ -9116,18 +7951,13 @@ static struct cgroup_subsys_state *cpuacct_create(
9116 if (!ca->cpuusage) 7951 if (!ca->cpuusage)
9117 goto out_free_ca; 7952 goto out_free_ca;
9118 7953
9119 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7954 ca->cpustat = alloc_percpu(struct kernel_cpustat);
9120 if (percpu_counter_init(&ca->cpustat[i], 0)) 7955 if (!ca->cpustat)
9121 goto out_free_counters; 7956 goto out_free_cpuusage;
9122
9123 if (cgrp->parent)
9124 ca->parent = cgroup_ca(cgrp->parent);
9125 7957
9126 return &ca->css; 7958 return &ca->css;
9127 7959
9128out_free_counters: 7960out_free_cpuusage:
9129 while (--i >= 0)
9130 percpu_counter_destroy(&ca->cpustat[i]);
9131 free_percpu(ca->cpuusage); 7961 free_percpu(ca->cpuusage);
9132out_free_ca: 7962out_free_ca:
9133 kfree(ca); 7963 kfree(ca);
@@ -9140,10 +7970,8 @@ static void
9140cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 7970cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9141{ 7971{
9142 struct cpuacct *ca = cgroup_ca(cgrp); 7972 struct cpuacct *ca = cgroup_ca(cgrp);
9143 int i;
9144 7973
9145 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7974 free_percpu(ca->cpustat);
9146 percpu_counter_destroy(&ca->cpustat[i]);
9147 free_percpu(ca->cpuusage); 7975 free_percpu(ca->cpuusage);
9148 kfree(ca); 7976 kfree(ca);
9149} 7977}
@@ -9236,16 +8064,31 @@ static const char *cpuacct_stat_desc[] = {
9236}; 8064};
9237 8065
9238static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8066static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9239 struct cgroup_map_cb *cb) 8067 struct cgroup_map_cb *cb)
9240{ 8068{
9241 struct cpuacct *ca = cgroup_ca(cgrp); 8069 struct cpuacct *ca = cgroup_ca(cgrp);
9242 int i; 8070 int cpu;
8071 s64 val = 0;
8072
8073 for_each_online_cpu(cpu) {
8074 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8075 val += kcpustat->cpustat[CPUTIME_USER];
8076 val += kcpustat->cpustat[CPUTIME_NICE];
8077 }
8078 val = cputime64_to_clock_t(val);
8079 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
9243 8080
9244 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 8081 val = 0;
9245 s64 val = percpu_counter_read(&ca->cpustat[i]); 8082 for_each_online_cpu(cpu) {
9246 val = cputime64_to_clock_t(val); 8083 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
9247 cb->fill(cb, cpuacct_stat_desc[i], val); 8084 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8085 val += kcpustat->cpustat[CPUTIME_IRQ];
8086 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
9248 } 8087 }
8088
8089 val = cputime64_to_clock_t(val);
8090 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8091
9249 return 0; 8092 return 0;
9250} 8093}
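
The rewritten cpuacct_stats_show() now sums the raw per-cpu kernel_cpustat buckets (user+nice into "user", system+irq+softirq into "system") and only converts the totals to clock ticks at the end. A user-space model of that aggregation with fabricated per-CPU numbers:

#include <stdint.h>
#include <stdio.h>

enum { CPUTIME_USER, CPUTIME_NICE, CPUTIME_SYSTEM,
       CPUTIME_IRQ, CPUTIME_SOFTIRQ, NR_STATS };

int main(void)
{
	/* pretend cputime values sampled from two CPUs */
	uint64_t stat[2][NR_STATS] = {
		{ 40000000, 1000000, 9000000, 500000, 250000 },
		{ 35000000, 2000000, 7000000, 300000, 150000 },
	};
	uint64_t user = 0, sys = 0;

	for (int cpu = 0; cpu < 2; cpu++) {
		user += stat[cpu][CPUTIME_USER] + stat[cpu][CPUTIME_NICE];
		sys  += stat[cpu][CPUTIME_SYSTEM] + stat[cpu][CPUTIME_IRQ] +
			stat[cpu][CPUTIME_SOFTIRQ];
	}
	/* the kernel would now feed both totals to cputime64_to_clock_t() */
	printf("user=%llu system=%llu\n",
	       (unsigned long long)user, (unsigned long long)sys);
	return 0;
}
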
9251 8094
@@ -9275,7 +8118,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9275 * 8118 *
9276 * called with rq->lock held. 8119 * called with rq->lock held.
9277 */ 8120 */
9278static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8121void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9279{ 8122{
9280 struct cpuacct *ca; 8123 struct cpuacct *ca;
9281 int cpu; 8124 int cpu;
@@ -9289,7 +8132,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9289 8132
9290 ca = task_ca(tsk); 8133 ca = task_ca(tsk);
9291 8134
9292 for (; ca; ca = ca->parent) { 8135 for (; ca; ca = parent_ca(ca)) {
9293 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8136 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9294 *cpuusage += cputime; 8137 *cpuusage += cputime;
9295 } 8138 }
@@ -9297,45 +8140,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9297 rcu_read_unlock(); 8140 rcu_read_unlock();
9298} 8141}
9299 8142
9300/*
9301 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9302 * in cputime_t units. As a result, cpuacct_update_stats calls
9303 * percpu_counter_add with values large enough to always overflow the
9304 * per cpu batch limit causing bad SMP scalability.
9305 *
9306 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9307 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9308 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9309 */
9310#ifdef CONFIG_SMP
9311#define CPUACCT_BATCH \
9312 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9313#else
9314#define CPUACCT_BATCH 0
9315#endif
9316
9317/*
9318 * Charge the system/user time to the task's accounting group.
9319 */
9320static void cpuacct_update_stats(struct task_struct *tsk,
9321 enum cpuacct_stat_index idx, cputime_t val)
9322{
9323 struct cpuacct *ca;
9324 int batch = CPUACCT_BATCH;
9325
9326 if (unlikely(!cpuacct_subsys.active))
9327 return;
9328
9329 rcu_read_lock();
9330 ca = task_ca(tsk);
9331
9332 do {
9333 __percpu_counter_add(&ca->cpustat[idx], val, batch);
9334 ca = ca->parent;
9335 } while (ca);
9336 rcu_read_unlock();
9337}
9338
9339struct cgroup_subsys cpuacct_subsys = { 8143struct cgroup_subsys cpuacct_subsys = {
9340 .name = "cpuacct", 8144 .name = "cpuacct",
9341 .create = cpuacct_create, 8145 .create = cpuacct_create,
@@ -9344,4 +8148,3 @@ struct cgroup_subsys cpuacct_subsys = {
9344 .subsys_id = cpuacct_subsys_id, 8148 .subsys_id = cpuacct_subsys_id,
9345}; 8149};
9346#endif /* CONFIG_CGROUP_CPUACCT */ 8150#endif /* CONFIG_CGROUP_CPUACCT */
9347
diff --git a/kernel/sched_cpupri.c b/kernel/sched/cpupri.c
index 2722dc1b4138..b0d798eaf130 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched_cpupri.c 2 * kernel/sched/cpupri.c
3 * 3 *
4 * CPU priority management 4 * CPU priority management
5 * 5 *
@@ -28,7 +28,7 @@
28 */ 28 */
29 29
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include "sched_cpupri.h" 31#include "cpupri.h"
32 32
33/* Convert between a 140 based task->prio, and our 102 based cpupri */ 33/* Convert between a 140 based task->prio, and our 102 based cpupri */
34static int convert_prio(int prio) 34static int convert_prio(int prio)
@@ -47,9 +47,6 @@ static int convert_prio(int prio)
47 return cpupri; 47 return cpupri;
48} 48}
49 49
50#define for_each_cpupri_active(array, idx) \
51 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
52
53/** 50/**
54 * cpupri_find - find the best (lowest-pri) CPU in the system 51 * cpupri_find - find the best (lowest-pri) CPU in the system
55 * @cp: The cpupri context 52 * @cp: The cpupri context
@@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
71 int idx = 0; 68 int idx = 0;
72 int task_pri = convert_prio(p->prio); 69 int task_pri = convert_prio(p->prio);
73 70
74 for_each_cpupri_active(cp->pri_active, idx) { 71 if (task_pri >= MAX_RT_PRIO)
75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 72 return 0;
76 73
77 if (idx >= task_pri) 74 for (idx = 0; idx < task_pri; idx++) {
78 break; 75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
76 int skip = 0;
77
78 if (!atomic_read(&(vec)->count))
79 skip = 1;
80 /*
81 * When looking at the vector, we need to read the counter,
82 * do a memory barrier, then read the mask.
83 *
84 * Note: This is still all racey, but we can deal with it.
85 * Ideally, we only want to look at masks that are set.
86 *
87 * If a mask is not set, then the only thing wrong is that we
88 * did a little more work than necessary.
89 *
90 * If we read a zero count but the mask is set, because of the
91 * memory barriers, that can only happen when the highest prio
92 * task for a run queue has left the run queue, in which case,
93 * it will be followed by a pull. If the task we are processing
94 * fails to find a proper place to go, that pull request will
95 * pull this task if the run queue is running at a lower
96 * priority.
97 */
98 smp_rmb();
99
100 /* Need to do the rmb for every iteration */
101 if (skip)
102 continue;
79 103
80 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 104 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
81 continue; 105 continue;
@@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
115{ 139{
116 int *currpri = &cp->cpu_to_pri[cpu]; 140 int *currpri = &cp->cpu_to_pri[cpu];
117 int oldpri = *currpri; 141 int oldpri = *currpri;
118 unsigned long flags; 142 int do_mb = 0;
119 143
120 newpri = convert_prio(newpri); 144 newpri = convert_prio(newpri);
121 145
@@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
128 * If the cpu was currently mapped to a different value, we 152 * If the cpu was currently mapped to a different value, we
129 * need to map it to the new value then remove the old value. 153 * need to map it to the new value then remove the old value.
130 * Note, we must add the new value first, otherwise we risk the 154 * Note, we must add the new value first, otherwise we risk the
131 * cpu being cleared from pri_active, and this cpu could be 155 * cpu being missed by the priority loop in cpupri_find.
132 * missed for a push or pull.
133 */ 156 */
134 if (likely(newpri != CPUPRI_INVALID)) { 157 if (likely(newpri != CPUPRI_INVALID)) {
135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 158 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136 159
137 raw_spin_lock_irqsave(&vec->lock, flags);
138
139 cpumask_set_cpu(cpu, vec->mask); 160 cpumask_set_cpu(cpu, vec->mask);
140 vec->count++; 161 /*
141 if (vec->count == 1) 162 * When adding a new vector, we update the mask first,
142 set_bit(newpri, cp->pri_active); 163 * do a write memory barrier, and then update the count, to
143 164 * make sure the vector is visible when count is set.
144 raw_spin_unlock_irqrestore(&vec->lock, flags); 165 */
166 smp_mb__before_atomic_inc();
167 atomic_inc(&(vec)->count);
168 do_mb = 1;
145 } 169 }
146 if (likely(oldpri != CPUPRI_INVALID)) { 170 if (likely(oldpri != CPUPRI_INVALID)) {
147 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 171 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
148 172
149 raw_spin_lock_irqsave(&vec->lock, flags); 173 /*
150 174 * Because the order of modification of the vec->count
151 vec->count--; 175 * is important, we must make sure that the update
152 if (!vec->count) 176 * of the new prio is seen before we decrement the
153 clear_bit(oldpri, cp->pri_active); 177 * old prio. This makes sure that the loop sees
178 * one or the other when we raise the priority of
179 * the run queue. We don't care about when we lower the
180 * priority, as that will trigger an rt pull anyway.
181 *
182 * We only need to do a memory barrier if we updated
183 * the new priority vec.
184 */
185 if (do_mb)
186 smp_mb__after_atomic_inc();
187
188 /*
189 * When removing from the vector, we decrement the counter first
190 * do a memory barrier and then clear the mask.
191 */
192 atomic_dec(&(vec)->count);
193 smp_mb__after_atomic_inc();
154 cpumask_clear_cpu(cpu, vec->mask); 194 cpumask_clear_cpu(cpu, vec->mask);
155
156 raw_spin_unlock_irqrestore(&vec->lock, flags);
157 } 195 }
158 196
159 *currpri = newpri; 197 *currpri = newpri;
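
The comments in the two hunks above describe a publish/consume protocol: the writer sets the mask before bumping the count (so a non-zero count implies a visible mask) and drops the count before clearing the mask, while cpupri_find() reads the count, issues a read barrier, then looks at the mask and accepts a little wasted work when it races. A runnable user-space analogue using C11 atomics in place of the kernel's smp_mb helpers; the vec_* names are made up and the long-sized mask only models the real cpumask.

#include <stdatomic.h>
#include <stdio.h>

struct vec {
	atomic_int  count;
	atomic_long mask;	/* stands in for the per-priority cpumask */
};

static void vec_add_cpu(struct vec *v, int cpu)
{
	/* mask first, then count: mirrors smp_mb__before_atomic_inc() */
	atomic_fetch_or_explicit(&v->mask, 1L << cpu, memory_order_relaxed);
	atomic_fetch_add_explicit(&v->count, 1, memory_order_release);
}

static void vec_del_cpu(struct vec *v, int cpu)
{
	/* count first, full fence, then mask: mirrors the removal ordering */
	atomic_fetch_sub_explicit(&v->count, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	atomic_fetch_and_explicit(&v->mask, ~(1L << cpu), memory_order_relaxed);
}

static int vec_lookup(struct vec *v, long *mask)
{
	/* reader-side skip on a zero count, as in cpupri_find() */
	if (!atomic_load_explicit(&v->count, memory_order_acquire))
		return 0;
	*mask = atomic_load_explicit(&v->mask, memory_order_relaxed);
	return 1;
}

int main(void)
{
	static struct vec v;
	long mask;

	vec_add_cpu(&v, 3);
	if (vec_lookup(&v, &mask))
		printf("mask=%#lx\n", mask);
	vec_del_cpu(&v, 3);
	return 0;
}
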
@@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp)
175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 213 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
176 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 214 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
177 215
178 raw_spin_lock_init(&vec->lock); 216 atomic_set(&vec->count, 0);
179 vec->count = 0;
180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) 217 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
181 goto cleanup; 218 goto cleanup;
182 } 219 }
diff --git a/kernel/sched_cpupri.h b/kernel/sched/cpupri.h
index 9fc7d386fea4..f6d756173491 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -4,7 +4,6 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5 5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
8 7
9#define CPUPRI_INVALID -1 8#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0 9#define CPUPRI_IDLE 0
@@ -12,14 +11,12 @@
12/* values 2-101 are RT priorities 0-99 */ 11/* values 2-101 are RT priorities 0-99 */
13 12
14struct cpupri_vec { 13struct cpupri_vec {
15 raw_spinlock_t lock; 14 atomic_t count;
16 int count; 15 cpumask_var_t mask;
17 cpumask_var_t mask;
18}; 16};
19 17
20struct cpupri { 18struct cpupri {
21 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
22 long pri_active[CPUPRI_NR_PRI_WORDS];
23 int cpu_to_pri[NR_CPUS]; 20 int cpu_to_pri[NR_CPUS];
24}; 21};
25 22
diff --git a/kernel/sched_debug.c b/kernel/sched/debug.c
index a6710a112b4f..2a075e10004b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched/debug.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/time/sched_debug.c 2 * kernel/sched/debug.c
3 * 3 *
4 * Print the CFS rbtree 4 * Print the CFS rbtree
5 * 5 *
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19#include "sched.h"
20
19static DEFINE_SPINLOCK(sched_debug_lock); 21static DEFINE_SPINLOCK(sched_debug_lock);
20 22
21/* 23/*
@@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
373 return 0; 375 return 0;
374} 376}
375 377
376static void sysrq_sched_debug_show(void) 378void sysrq_sched_debug_show(void)
377{ 379{
378 sched_debug_show(NULL, NULL); 380 sched_debug_show(NULL, NULL);
379} 381}
diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c
index bc8ee9993814..84adb2d66cbd 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,13 @@
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/slab.h>
27#include <linux/profile.h>
28#include <linux/interrupt.h>
29
30#include <trace/events/sched.h>
31
32#include "sched.h"
26 33
27/* 34/*
28 * Targeted preemption latency for CPU-bound tasks: 35 * Targeted preemption latency for CPU-bound tasks:
@@ -89,7 +96,124 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
89 */ 96 */
90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; 97unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
91 98
92static const struct sched_class fair_sched_class; 99#ifdef CONFIG_CFS_BANDWIDTH
100/*
101 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
102 * each time a cfs_rq requests quota.
103 *
104 * Note: in the case that the slice exceeds the runtime remaining (either due
105 * to consumption or the quota being specified to be smaller than the slice)
106 * we will always only issue the remaining available time.
107 *
108 * default: 5 msec, units: microseconds
109 */
110unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
111#endif
112
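
Each cfs_rq pulls runtime from the group-wide pool in sysctl_sched_cfs_bandwidth_slice chunks (5ms by default), and per the note above it never takes more than what remains. A user-space model of that handout with an assumed 12ms of remaining quota:

#include <stdint.h>
#include <stdio.h>

#define SLICE_US 5000ULL	/* default sysctl_sched_cfs_bandwidth_slice */

/* hand out min(slice, remaining) from the global pool */
static uint64_t grab_runtime(uint64_t *pool_us)
{
	uint64_t amount = *pool_us < SLICE_US ? *pool_us : SLICE_US;

	*pool_us -= amount;
	return amount;
}

int main(void)
{
	uint64_t pool = 12000;	/* pretend 12ms of quota is left this period */

	while (pool) {
		uint64_t granted = grab_runtime(&pool);

		/* prints 5000, 5000, 2000: the last request gets the rest */
		printf("granted %llu us, %llu left\n",
		       (unsigned long long)granted, (unsigned long long)pool);
	}
	return 0;
}
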
113/*
114 * Increase the granularity value when there are more CPUs,
115 * because with more CPUs the 'effective latency' as visible
116 * to users decreases. But the relationship is not linear,
117 * so pick a second-best guess by going with the log2 of the
118 * number of CPUs.
119 *
120 * This idea comes from the SD scheduler of Con Kolivas:
121 */
122static int get_update_sysctl_factor(void)
123{
124 unsigned int cpus = min_t(int, num_online_cpus(), 8);
125 unsigned int factor;
126
127 switch (sysctl_sched_tunable_scaling) {
128 case SCHED_TUNABLESCALING_NONE:
129 factor = 1;
130 break;
131 case SCHED_TUNABLESCALING_LINEAR:
132 factor = cpus;
133 break;
134 case SCHED_TUNABLESCALING_LOG:
135 default:
136 factor = 1 + ilog2(cpus);
137 break;
138 }
139
140 return factor;
141}
142
143static void update_sysctl(void)
144{
145 unsigned int factor = get_update_sysctl_factor();
146
147#define SET_SYSCTL(name) \
148 (sysctl_##name = (factor) * normalized_sysctl_##name)
149 SET_SYSCTL(sched_min_granularity);
150 SET_SYSCTL(sched_latency);
151 SET_SYSCTL(sched_wakeup_granularity);
152#undef SET_SYSCTL
153}
154
155void sched_init_granularity(void)
156{
157 update_sysctl();
158}
159
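
With the default SCHED_TUNABLESCALING_LOG policy the factor is 1 + ilog2(min(num_online_cpus, 8)), so the latency tunables grow only logarithmically with CPU count: 1 CPU gives 1, 2 gives 2, 4 gives 3, and 8 or more give 4. A quick user-space check of those values (ilog2 open-coded here):

#include <stdio.h>

static unsigned int ilog2_u(unsigned int x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	for (unsigned int cpus = 1; cpus <= 16; cpus *= 2) {
		unsigned int capped = cpus < 8 ? cpus : 8;

		/* matches SCHED_TUNABLESCALING_LOG in get_update_sysctl_factor() */
		printf("%2u cpus -> factor %u\n", cpus, 1 + ilog2_u(capped));
	}
	return 0;
}
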
160#if BITS_PER_LONG == 32
161# define WMULT_CONST (~0UL)
162#else
163# define WMULT_CONST (1UL << 32)
164#endif
165
166#define WMULT_SHIFT 32
167
168/*
169 * Shift right and round:
170 */
171#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
172
173/*
174 * delta *= weight / lw
175 */
176static unsigned long
177calc_delta_mine(unsigned long delta_exec, unsigned long weight,
178 struct load_weight *lw)
179{
180 u64 tmp;
181
182 /*
183 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
184 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
185 * 2^SCHED_LOAD_RESOLUTION.
186 */
187 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
188 tmp = (u64)delta_exec * scale_load_down(weight);
189 else
190 tmp = (u64)delta_exec;
191
192 if (!lw->inv_weight) {
193 unsigned long w = scale_load_down(lw->weight);
194
195 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
196 lw->inv_weight = 1;
197 else if (unlikely(!w))
198 lw->inv_weight = WMULT_CONST;
199 else
200 lw->inv_weight = WMULT_CONST / w;
201 }
202
203 /*
204 * Check whether we'd overflow the 64-bit multiplication:
205 */
206 if (unlikely(tmp > WMULT_CONST))
207 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
208 WMULT_SHIFT/2);
209 else
210 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
211
212 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
213}
214
215
216const struct sched_class fair_sched_class;
93 217
94/************************************************************** 218/**************************************************************
95 * CFS operations on generic schedulable entities: 219 * CFS operations on generic schedulable entities:
@@ -292,6 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
292 416
293#endif /* CONFIG_FAIR_GROUP_SCHED */ 417#endif /* CONFIG_FAIR_GROUP_SCHED */
294 418
419static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
420 unsigned long delta_exec);
295 421
296/************************************************************** 422/**************************************************************
297 * Scheduling class tree data structure manipulation methods: 423 * Scheduling class tree data structure manipulation methods:
@@ -397,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
397 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 523 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
398} 524}
399 525
400static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) 526struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
401{ 527{
402 struct rb_node *left = cfs_rq->rb_leftmost; 528 struct rb_node *left = cfs_rq->rb_leftmost;
403 529
@@ -418,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
418} 544}
419 545
420#ifdef CONFIG_SCHED_DEBUG 546#ifdef CONFIG_SCHED_DEBUG
421static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 547struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
422{ 548{
423 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 549 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
424 550
@@ -583,6 +709,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
583 cpuacct_charge(curtask, delta_exec); 709 cpuacct_charge(curtask, delta_exec);
584 account_group_exec_runtime(curtask, delta_exec); 710 account_group_exec_runtime(curtask, delta_exec);
585 } 711 }
712
713 account_cfs_rq_runtime(cfs_rq, delta_exec);
586} 714}
587 715
588static inline void 716static inline void
@@ -666,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
666{ 794{
667 update_load_add(&cfs_rq->load, se->load.weight); 795 update_load_add(&cfs_rq->load, se->load.weight);
668 if (!parent_entity(se)) 796 if (!parent_entity(se))
669 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 797 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
670 if (entity_is_task(se)) { 798 if (entity_is_task(se)) {
671 add_cfs_task_weight(cfs_rq, se->load.weight); 799 add_cfs_task_weight(cfs_rq, se->load.weight);
672 list_add(&se->group_node, &cfs_rq->tasks); 800 list_add(&se->group_node, &cfs_rq->tasks);
@@ -679,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
679{ 807{
680 update_load_sub(&cfs_rq->load, se->load.weight); 808 update_load_sub(&cfs_rq->load, se->load.weight);
681 if (!parent_entity(se)) 809 if (!parent_entity(se))
682 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 810 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
683 if (entity_is_task(se)) { 811 if (entity_is_task(se)) {
684 add_cfs_task_weight(cfs_rq, -se->load.weight); 812 add_cfs_task_weight(cfs_rq, -se->load.weight);
685 list_del_init(&se->group_node); 813 list_del_init(&se->group_node);
@@ -688,6 +816,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
688} 816}
689 817
690#ifdef CONFIG_FAIR_GROUP_SCHED 818#ifdef CONFIG_FAIR_GROUP_SCHED
819/* we need this in update_cfs_load and load-balance functions below */
820static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
691# ifdef CONFIG_SMP 821# ifdef CONFIG_SMP
692static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, 822static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
693 int global_update) 823 int global_update)
@@ -710,7 +840,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
710 u64 now, delta; 840 u64 now, delta;
711 unsigned long load = cfs_rq->load.weight; 841 unsigned long load = cfs_rq->load.weight;
712 842
713 if (cfs_rq->tg == &root_task_group) 843 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
714 return; 844 return;
715 845
716 now = rq_of(cfs_rq)->clock_task; 846 now = rq_of(cfs_rq)->clock_task;
@@ -752,19 +882,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
752 list_del_leaf_cfs_rq(cfs_rq); 882 list_del_leaf_cfs_rq(cfs_rq);
753} 883}
754 884
885static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
886{
887 long tg_weight;
888
889 /*
890 * Use this CPU's actual weight instead of the last load_contribution
891 * to gain a more accurate current total weight. See
892 * update_cfs_rq_load_contribution().
893 */
894 tg_weight = atomic_read(&tg->load_weight);
895 tg_weight -= cfs_rq->load_contribution;
896 tg_weight += cfs_rq->load.weight;
897
898 return tg_weight;
899}
900
755static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) 901static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
756{ 902{
757 long load_weight, load, shares; 903 long tg_weight, load, shares;
758 904
905 tg_weight = calc_tg_weight(tg, cfs_rq);
759 load = cfs_rq->load.weight; 906 load = cfs_rq->load.weight;
760 907
761 load_weight = atomic_read(&tg->load_weight);
762 load_weight += load;
763 load_weight -= cfs_rq->load_contribution;
764
765 shares = (tg->shares * load); 908 shares = (tg->shares * load);
766 if (load_weight) 909 if (tg_weight)
767 shares /= load_weight; 910 shares /= tg_weight;
768 911
769 if (shares < MIN_SHARES) 912 if (shares < MIN_SHARES)
770 shares = MIN_SHARES; 913 shares = MIN_SHARES;
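
calc_cfs_shares() hands this CPU's group entity a slice of tg->shares proportional to the cfs_rq's weight within the whole group (tg_weight), clamped between MIN_SHARES and MAX_SHARES. Worked numbers in a small user-space model; the values and the MAX_SHARES constant are illustrative (the kernel scales its bound with the load resolution).

#include <stdio.h>

#define MIN_SHARES 2L
#define MAX_SHARES (1L << 18)	/* illustrative upper clamp */

static long cfs_shares(long tg_shares, long cfs_load, long tg_weight)
{
	long shares = tg_shares * cfs_load;

	if (tg_weight)
		shares /= tg_weight;
	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > MAX_SHARES)
		shares = MAX_SHARES;
	return shares;
}

int main(void)
{
	/*
	 * Group configured with 1024 shares; this CPU holds 512 of the
	 * group's 2048 total weight -> its entity gets 1024 * 512/2048 = 256.
	 */
	printf("%ld\n", cfs_shares(1024, 512, 2048));
	return 0;
}
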
@@ -819,7 +962,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
819 962
820 tg = cfs_rq->tg; 963 tg = cfs_rq->tg;
821 se = tg->se[cpu_of(rq_of(cfs_rq))]; 964 se = tg->se[cpu_of(rq_of(cfs_rq))];
822 if (!se) 965 if (!se || throttled_hierarchy(cfs_rq))
823 return; 966 return;
824#ifndef CONFIG_SMP 967#ifndef CONFIG_SMP
825 if (likely(se->load.weight == tg->shares)) 968 if (likely(se->load.weight == tg->shares))
@@ -860,7 +1003,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
860 if (unlikely(delta > se->statistics.sleep_max)) 1003 if (unlikely(delta > se->statistics.sleep_max))
861 se->statistics.sleep_max = delta; 1004 se->statistics.sleep_max = delta;
862 1005
863 se->statistics.sleep_start = 0;
864 se->statistics.sum_sleep_runtime += delta; 1006 se->statistics.sum_sleep_runtime += delta;
865 1007
866 if (tsk) { 1008 if (tsk) {
@@ -877,7 +1019,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
877 if (unlikely(delta > se->statistics.block_max)) 1019 if (unlikely(delta > se->statistics.block_max))
878 se->statistics.block_max = delta; 1020 se->statistics.block_max = delta;
879 1021
880 se->statistics.block_start = 0;
881 se->statistics.sum_sleep_runtime += delta; 1022 se->statistics.sum_sleep_runtime += delta;
882 1023
883 if (tsk) { 1024 if (tsk) {
@@ -887,6 +1028,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
887 trace_sched_stat_iowait(tsk, delta); 1028 trace_sched_stat_iowait(tsk, delta);
888 } 1029 }
889 1030
1031 trace_sched_stat_blocked(tsk, delta);
1032
890 /* 1033 /*
891 * Blocking time is in units of nanosecs, so shift by 1034 * Blocking time is in units of nanosecs, so shift by
892 * 20 to get a milliseconds-range estimation of the 1035 * 20 to get a milliseconds-range estimation of the
@@ -950,6 +1093,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
950 se->vruntime = vruntime; 1093 se->vruntime = vruntime;
951} 1094}
952 1095
1096static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1097
953static void 1098static void
954enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1099enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
955{ 1100{
@@ -979,8 +1124,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
979 __enqueue_entity(cfs_rq, se); 1124 __enqueue_entity(cfs_rq, se);
980 se->on_rq = 1; 1125 se->on_rq = 1;
981 1126
982 if (cfs_rq->nr_running == 1) 1127 if (cfs_rq->nr_running == 1) {
983 list_add_leaf_cfs_rq(cfs_rq); 1128 list_add_leaf_cfs_rq(cfs_rq);
1129 check_enqueue_throttle(cfs_rq);
1130 }
984} 1131}
985 1132
986static void __clear_buddies_last(struct sched_entity *se) 1133static void __clear_buddies_last(struct sched_entity *se)
@@ -1028,6 +1175,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1028 __clear_buddies_skip(se); 1175 __clear_buddies_skip(se);
1029} 1176}
1030 1177
1178static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1179
1031static void 1180static void
1032dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1181dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1033{ 1182{
@@ -1066,6 +1215,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1066 if (!(flags & DEQUEUE_SLEEP)) 1215 if (!(flags & DEQUEUE_SLEEP))
1067 se->vruntime -= cfs_rq->min_vruntime; 1216 se->vruntime -= cfs_rq->min_vruntime;
1068 1217
1218 /* return excess runtime on last dequeue */
1219 return_cfs_rq_runtime(cfs_rq);
1220
1069 update_min_vruntime(cfs_rq); 1221 update_min_vruntime(cfs_rq);
1070 update_cfs_shares(cfs_rq); 1222 update_cfs_shares(cfs_rq);
1071} 1223}
@@ -1077,6 +1229,8 @@ static void
1077check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1229check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1078{ 1230{
1079 unsigned long ideal_runtime, delta_exec; 1231 unsigned long ideal_runtime, delta_exec;
1232 struct sched_entity *se;
1233 s64 delta;
1080 1234
1081 ideal_runtime = sched_slice(cfs_rq, curr); 1235 ideal_runtime = sched_slice(cfs_rq, curr);
1082 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 1236 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
@@ -1095,22 +1249,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1095 * narrow margin doesn't have to wait for a full slice. 1249 * narrow margin doesn't have to wait for a full slice.
1096 * This also mitigates buddy induced latencies under load. 1250 * This also mitigates buddy induced latencies under load.
1097 */ 1251 */
1098 if (!sched_feat(WAKEUP_PREEMPT))
1099 return;
1100
1101 if (delta_exec < sysctl_sched_min_granularity) 1252 if (delta_exec < sysctl_sched_min_granularity)
1102 return; 1253 return;
1103 1254
1104 if (cfs_rq->nr_running > 1) { 1255 se = __pick_first_entity(cfs_rq);
1105 struct sched_entity *se = __pick_first_entity(cfs_rq); 1256 delta = curr->vruntime - se->vruntime;
1106 s64 delta = curr->vruntime - se->vruntime;
1107 1257
1108 if (delta < 0) 1258 if (delta < 0)
1109 return; 1259 return;
1110 1260
1111 if (delta > ideal_runtime) 1261 if (delta > ideal_runtime)
1112 resched_task(rq_of(cfs_rq)->curr); 1262 resched_task(rq_of(cfs_rq)->curr);
1113 }
1114} 1263}
1115 1264
1116static void 1265static void
@@ -1185,6 +1334,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1185 return se; 1334 return se;
1186} 1335}
1187 1336
1337static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1338
1188static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 1339static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1189{ 1340{
1190 /* 1341 /*
@@ -1194,6 +1345,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1194 if (prev->on_rq) 1345 if (prev->on_rq)
1195 update_curr(cfs_rq); 1346 update_curr(cfs_rq);
1196 1347
1348 /* throttle cfs_rqs exceeding runtime */
1349 check_cfs_rq_runtime(cfs_rq);
1350
1197 check_spread(cfs_rq, prev); 1351 check_spread(cfs_rq, prev);
1198 if (prev->on_rq) { 1352 if (prev->on_rq) {
1199 update_stats_wait_start(cfs_rq, prev); 1353 update_stats_wait_start(cfs_rq, prev);
@@ -1233,10 +1387,742 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1233 return; 1387 return;
1234#endif 1388#endif
1235 1389
1236 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) 1390 if (cfs_rq->nr_running > 1)
1237 check_preempt_tick(cfs_rq, curr); 1391 check_preempt_tick(cfs_rq, curr);
1238} 1392}
1239 1393
1394
1395/**************************************************
1396 * CFS bandwidth control machinery
1397 */
1398
1399#ifdef CONFIG_CFS_BANDWIDTH
1400
1401#ifdef HAVE_JUMP_LABEL
1402static struct jump_label_key __cfs_bandwidth_used;
1403
1404static inline bool cfs_bandwidth_used(void)
1405{
1406 return static_branch(&__cfs_bandwidth_used);
1407}
1408
1409void account_cfs_bandwidth_used(int enabled, int was_enabled)
1410{
1411 /* only need to count groups transitioning between enabled/!enabled */
1412 if (enabled && !was_enabled)
1413 jump_label_inc(&__cfs_bandwidth_used);
1414 else if (!enabled && was_enabled)
1415 jump_label_dec(&__cfs_bandwidth_used);
1416}
1417#else /* HAVE_JUMP_LABEL */
1418static bool cfs_bandwidth_used(void)
1419{
1420 return true;
1421}
1422
1423void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
1424#endif /* HAVE_JUMP_LABEL */
1425
1426/*
1427 * default period for cfs group bandwidth.
1428 * default: 0.1s, units: nanoseconds
1429 */
1430static inline u64 default_cfs_period(void)
1431{
1432 return 100000000ULL;
1433}
1434
1435static inline u64 sched_cfs_bandwidth_slice(void)
1436{
1437 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
1438}
1439
1440/*
1441 * Replenish runtime according to assigned quota and update expiration time.
1442 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
1443 * additional synchronization around rq->lock.
1444 *
1445 * requires cfs_b->lock
1446 */
1447void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1448{
1449 u64 now;
1450
1451 if (cfs_b->quota == RUNTIME_INF)
1452 return;
1453
1454 now = sched_clock_cpu(smp_processor_id());
1455 cfs_b->runtime = cfs_b->quota;
1456 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1457}
1458
1459static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1460{
1461 return &tg->cfs_bandwidth;
1462}
1463
1464/* returns 0 on failure to allocate runtime */
1465static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1466{
1467 struct task_group *tg = cfs_rq->tg;
1468 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
1469 u64 amount = 0, min_amount, expires;
1470
1471 /* note: this is a positive sum as runtime_remaining <= 0 */
1472 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1473
1474 raw_spin_lock(&cfs_b->lock);
1475 if (cfs_b->quota == RUNTIME_INF)
1476 amount = min_amount;
1477 else {
1478 /*
1479 * If the bandwidth pool has become inactive, then at least one
1480 * period must have elapsed since the last consumption.
1481 * Refresh the global state and ensure bandwidth timer becomes
1482 * active.
1483 */
1484 if (!cfs_b->timer_active) {
1485 __refill_cfs_bandwidth_runtime(cfs_b);
1486 __start_cfs_bandwidth(cfs_b);
1487 }
1488
1489 if (cfs_b->runtime > 0) {
1490 amount = min(cfs_b->runtime, min_amount);
1491 cfs_b->runtime -= amount;
1492 cfs_b->idle = 0;
1493 }
1494 }
1495 expires = cfs_b->runtime_expires;
1496 raw_spin_unlock(&cfs_b->lock);
1497
1498 cfs_rq->runtime_remaining += amount;
1499 /*
1500 * we may have advanced our local expiration to account for allowed
1501 * spread between our sched_clock and the one on which runtime was
1502 * issued.
1503 */
1504 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1505 cfs_rq->runtime_expires = expires;
1506
1507 return cfs_rq->runtime_remaining > 0;
1508}
1509
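
assign_cfs_rq_runtime() above tops the local runtime_remaining back up to one sched_cfs_bandwidth_slice() worth of runtime by drawing from the group-wide pool, and the cfs_rq may keep running only if that leaves it positive. A minimal user-space model of that hand-out, assuming a fixed 5ms slice and ignoring the quota-refresh, expiry and locking details:

#include <stdio.h>
#include <stdint.h>

#define SLICE_NS 5000000LL              /* assumed 5ms bandwidth slice */

struct pool { int64_t runtime; };       /* stand-in for the per-group cfs_b->runtime */

/* top the local budget up to one slice; returns 1 if the queue may keep running */
static int assign_runtime(struct pool *p, int64_t *local_remaining)
{
        int64_t want = SLICE_NS - *local_remaining;     /* local_remaining <= 0 here */
        int64_t got  = want < p->runtime ? want : p->runtime;

        p->runtime       -= got;
        *local_remaining += got;
        return *local_remaining > 0;
}

int main(void)
{
        struct pool global = { .runtime = 8000000 };    /* 8ms left in this period */
        int64_t local = -1000000;                       /* queue is 1ms overdrawn */
        int runnable = assign_runtime(&global, &local);

        printf("runnable=%d local=%lld global=%lld\n",
               runnable, (long long)local, (long long)global.runtime);
        return 0;
}
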
1510/*
1511 * Note: This depends on the synchronization provided by sched_clock and the
1512 * fact that rq->clock snapshots this value.
1513 */
1514static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1515{
1516 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1517 struct rq *rq = rq_of(cfs_rq);
1518
1519 /* if the deadline is ahead of our clock, nothing to do */
1520 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
1521 return;
1522
1523 if (cfs_rq->runtime_remaining < 0)
1524 return;
1525
1526 /*
1527 * If the local deadline has passed we have to consider the
1528 * possibility that our sched_clock is 'fast' and the global deadline
1529 * has not truly expired.
1530 *
 1531 * Fortunately we can determine whether this is the case by checking
1532 * whether the global deadline has advanced.
1533 */
1534
1535 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1536 /* extend local deadline, drift is bounded above by 2 ticks */
1537 cfs_rq->runtime_expires += TICK_NSEC;
1538 } else {
1539 /* global deadline is ahead, expiration has passed */
1540 cfs_rq->runtime_remaining = 0;
1541 }
1542}
1543
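
The expiry checks above rely on the signed-difference idiom, (s64)(a - b) < 0 meaning "a is before b", which stays correct even if the u64 clock wraps. The same idiom in a standalone user-space form:

#include <stdio.h>
#include <stdint.h>

/* "a is before b", robust to u64 wraparound, as in the expiry checks above */
static int time_before64(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

int main(void)
{
        uint64_t near_wrap = UINT64_MAX - 100;

        printf("%d\n", time_before64(100, 200));        /* 1 */
        printf("%d\n", time_before64(near_wrap, 50));   /* 1: 50 lies just past the wrap */
        printf("%d\n", time_before64(50, near_wrap));   /* 0 */
        return 0;
}
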
1544static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1545 unsigned long delta_exec)
1546{
1547 /* dock delta_exec before expiring quota (as it could span periods) */
1548 cfs_rq->runtime_remaining -= delta_exec;
1549 expire_cfs_rq_runtime(cfs_rq);
1550
1551 if (likely(cfs_rq->runtime_remaining > 0))
1552 return;
1553
1554 /*
1555 * if we're unable to extend our runtime we resched so that the active
1556 * hierarchy can be throttled
1557 */
1558 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1559 resched_task(rq_of(cfs_rq)->curr);
1560}
1561
1562static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1563 unsigned long delta_exec)
1564{
1565 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1566 return;
1567
1568 __account_cfs_rq_runtime(cfs_rq, delta_exec);
1569}
1570
1571static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1572{
1573 return cfs_bandwidth_used() && cfs_rq->throttled;
1574}
1575
1576/* check whether cfs_rq, or any parent, is throttled */
1577static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1578{
1579 return cfs_bandwidth_used() && cfs_rq->throttle_count;
1580}
1581
1582/*
1583 * Ensure that neither of the group entities corresponding to src_cpu or
1584 * dest_cpu are members of a throttled hierarchy when performing group
1585 * load-balance operations.
1586 */
1587static inline int throttled_lb_pair(struct task_group *tg,
1588 int src_cpu, int dest_cpu)
1589{
1590 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1591
1592 src_cfs_rq = tg->cfs_rq[src_cpu];
1593 dest_cfs_rq = tg->cfs_rq[dest_cpu];
1594
1595 return throttled_hierarchy(src_cfs_rq) ||
1596 throttled_hierarchy(dest_cfs_rq);
1597}
1598
1599/* updated child weight may affect parent so we have to do this bottom up */
1600static int tg_unthrottle_up(struct task_group *tg, void *data)
1601{
1602 struct rq *rq = data;
1603 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1604
1605 cfs_rq->throttle_count--;
1606#ifdef CONFIG_SMP
1607 if (!cfs_rq->throttle_count) {
1608 u64 delta = rq->clock_task - cfs_rq->load_stamp;
1609
1610 /* leaving throttled state, advance shares averaging windows */
1611 cfs_rq->load_stamp += delta;
1612 cfs_rq->load_last += delta;
1613
1614 /* update entity weight now that we are on_rq again */
1615 update_cfs_shares(cfs_rq);
1616 }
1617#endif
1618
1619 return 0;
1620}
1621
1622static int tg_throttle_down(struct task_group *tg, void *data)
1623{
1624 struct rq *rq = data;
1625 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1626
1627 /* group is entering throttled state, record last load */
1628 if (!cfs_rq->throttle_count)
1629 update_cfs_load(cfs_rq, 0);
1630 cfs_rq->throttle_count++;
1631
1632 return 0;
1633}
1634
1635static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1636{
1637 struct rq *rq = rq_of(cfs_rq);
1638 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1639 struct sched_entity *se;
1640 long task_delta, dequeue = 1;
1641
1642 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1643
1644 /* account load preceding throttle */
1645 rcu_read_lock();
1646 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1647 rcu_read_unlock();
1648
1649 task_delta = cfs_rq->h_nr_running;
1650 for_each_sched_entity(se) {
1651 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
1652 /* throttled entity or throttle-on-deactivate */
1653 if (!se->on_rq)
1654 break;
1655
1656 if (dequeue)
1657 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
1658 qcfs_rq->h_nr_running -= task_delta;
1659
1660 if (qcfs_rq->load.weight)
1661 dequeue = 0;
1662 }
1663
1664 if (!se)
1665 rq->nr_running -= task_delta;
1666
1667 cfs_rq->throttled = 1;
1668 cfs_rq->throttled_timestamp = rq->clock;
1669 raw_spin_lock(&cfs_b->lock);
1670 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1671 raw_spin_unlock(&cfs_b->lock);
1672}
1673
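
throttle_cfs_rq() above dequeues the throttled group's entity level by level, but stops dequeueing as soon as an ancestor still has other weight queued (higher levels only see their h_nr_running adjusted). A toy sketch of that walk, simplified to a single weight for the departing entity at every level:

#include <stdio.h>

/* walk from the throttled group toward the root; stop dequeueing once a level
 * keeps other weight queued */
static void throttle_walk(long level_weight[], int levels, long my_weight)
{
        int dequeue = 1;

        for (int lvl = 0; lvl < levels; lvl++) {
                if (dequeue)
                        level_weight[lvl] -= my_weight; /* our entity leaves this level */
                if (level_weight[lvl])                  /* a sibling is still queued here */
                        dequeue = 0;
        }
}

int main(void)
{
        /* level 0 holds only us (1024); level 1 also carries a 512-weight sibling */
        long level_weight[2] = { 1024, 1024 + 512 };

        throttle_walk(level_weight, 2, 1024);
        printf("level0=%ld level1=%ld\n", level_weight[0], level_weight[1]); /* 0 and 512 */
        return 0;
}
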
1674void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1675{
1676 struct rq *rq = rq_of(cfs_rq);
1677 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1678 struct sched_entity *se;
1679 int enqueue = 1;
1680 long task_delta;
1681
1682 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1683
1684 cfs_rq->throttled = 0;
1685 raw_spin_lock(&cfs_b->lock);
1686 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
1687 list_del_rcu(&cfs_rq->throttled_list);
1688 raw_spin_unlock(&cfs_b->lock);
1689 cfs_rq->throttled_timestamp = 0;
1690
1691 update_rq_clock(rq);
1692 /* update hierarchical throttle state */
1693 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
1694
1695 if (!cfs_rq->load.weight)
1696 return;
1697
1698 task_delta = cfs_rq->h_nr_running;
1699 for_each_sched_entity(se) {
1700 if (se->on_rq)
1701 enqueue = 0;
1702
1703 cfs_rq = cfs_rq_of(se);
1704 if (enqueue)
1705 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
1706 cfs_rq->h_nr_running += task_delta;
1707
1708 if (cfs_rq_throttled(cfs_rq))
1709 break;
1710 }
1711
1712 if (!se)
1713 rq->nr_running += task_delta;
1714
1715 /* determine whether we need to wake up potentially idle cpu */
1716 if (rq->curr == rq->idle && rq->cfs.nr_running)
1717 resched_task(rq->curr);
1718}
1719
1720static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
1721 u64 remaining, u64 expires)
1722{
1723 struct cfs_rq *cfs_rq;
1724 u64 runtime = remaining;
1725
1726 rcu_read_lock();
1727 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
1728 throttled_list) {
1729 struct rq *rq = rq_of(cfs_rq);
1730
1731 raw_spin_lock(&rq->lock);
1732 if (!cfs_rq_throttled(cfs_rq))
1733 goto next;
1734
1735 runtime = -cfs_rq->runtime_remaining + 1;
1736 if (runtime > remaining)
1737 runtime = remaining;
1738 remaining -= runtime;
1739
1740 cfs_rq->runtime_remaining += runtime;
1741 cfs_rq->runtime_expires = expires;
1742
1743 /* we check whether we're throttled above */
1744 if (cfs_rq->runtime_remaining > 0)
1745 unthrottle_cfs_rq(cfs_rq);
1746
1747next:
1748 raw_spin_unlock(&rq->lock);
1749
1750 if (!remaining)
1751 break;
1752 }
1753 rcu_read_unlock();
1754
1755 return remaining;
1756}
1757
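
distribute_cfs_runtime() hands each throttled cfs_rq just enough runtime to bring runtime_remaining above zero, stops once the refill is exhausted, and only queues that actually went positive get unthrottled. A small user-space sketch of that allocation loop, with a plain array standing in for the RCU list and no locking:

#include <stdio.h>
#include <stdint.h>

/* give each overdrawn queue just enough to go positive, stop when the pool is dry */
static int64_t distribute(int64_t remaining_of[], int n, int64_t pool)
{
        for (int i = 0; i < n && pool; i++) {
                int64_t grant = -remaining_of[i] + 1;   /* amount needed to exceed 0 */

                if (grant > pool)
                        grant = pool;
                remaining_of[i] += grant;
                pool            -= grant;
        }
        return pool;
}

int main(void)
{
        int64_t remaining_of[3] = { -3, -5, -2 };       /* three throttled queues */
        int64_t left = distribute(remaining_of, 3, 8);

        /* prints: left=0 queues: 1 -1 -2  (only the first went positive) */
        printf("left=%lld queues: %lld %lld %lld\n", (long long)left,
               (long long)remaining_of[0], (long long)remaining_of[1],
               (long long)remaining_of[2]);
        return 0;
}
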
1758/*
1759 * Responsible for refilling a task_group's bandwidth and unthrottling its
1760 * cfs_rqs as appropriate. If there has been no activity within the last
1761 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
1762 * used to track this state.
1763 */
1764static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
1765{
1766 u64 runtime, runtime_expires;
1767 int idle = 1, throttled;
1768
1769 raw_spin_lock(&cfs_b->lock);
1770 /* no need to continue the timer with no bandwidth constraint */
1771 if (cfs_b->quota == RUNTIME_INF)
1772 goto out_unlock;
1773
1774 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1775 /* idle depends on !throttled (for the case of a large deficit) */
1776 idle = cfs_b->idle && !throttled;
1777 cfs_b->nr_periods += overrun;
1778
1779 /* if we're going inactive then everything else can be deferred */
1780 if (idle)
1781 goto out_unlock;
1782
1783 __refill_cfs_bandwidth_runtime(cfs_b);
1784
1785 if (!throttled) {
1786 /* mark as potentially idle for the upcoming period */
1787 cfs_b->idle = 1;
1788 goto out_unlock;
1789 }
1790
1791 /* account preceding periods in which throttling occurred */
1792 cfs_b->nr_throttled += overrun;
1793
1794 /*
1795 * There are throttled entities so we must first use the new bandwidth
1796 * to unthrottle them before making it generally available. This
1797 * ensures that all existing debts will be paid before a new cfs_rq is
1798 * allowed to run.
1799 */
1800 runtime = cfs_b->runtime;
1801 runtime_expires = cfs_b->runtime_expires;
1802 cfs_b->runtime = 0;
1803
1804 /*
1805 * This check is repeated as we are holding onto the new bandwidth
1806 * while we unthrottle. This can potentially race with an unthrottled
1807 * group trying to acquire new bandwidth from the global pool.
1808 */
1809 while (throttled && runtime > 0) {
1810 raw_spin_unlock(&cfs_b->lock);
1811 /* we can't nest cfs_b->lock while distributing bandwidth */
1812 runtime = distribute_cfs_runtime(cfs_b, runtime,
1813 runtime_expires);
1814 raw_spin_lock(&cfs_b->lock);
1815
1816 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1817 }
1818
1819 /* return (any) remaining runtime */
1820 cfs_b->runtime = runtime;
1821 /*
1822 * While we are ensured activity in the period following an
1823 * unthrottle, this also covers the case in which the new bandwidth is
1824 * insufficient to cover the existing bandwidth deficit. (Forcing the
1825 * timer to remain active while there are any throttled entities.)
1826 */
1827 cfs_b->idle = 0;
1828out_unlock:
1829 if (idle)
1830 cfs_b->timer_active = 0;
1831 raw_spin_unlock(&cfs_b->lock);
1832
1833 return idle;
1834}
1835
1836/* a cfs_rq won't donate quota below this amount */
1837static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
1838/* minimum remaining period time to redistribute slack quota */
1839static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
1840/* how long we wait to gather additional slack before distributing */
1841static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
1842
1843/* are we near the end of the current quota period? */
1844static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1845{
1846 struct hrtimer *refresh_timer = &cfs_b->period_timer;
1847 u64 remaining;
1848
1849 /* if the call-back is running a quota refresh is already occurring */
1850 if (hrtimer_callback_running(refresh_timer))
1851 return 1;
1852
1853 /* is a quota refresh about to occur? */
1854 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1855 if (remaining < min_expire)
1856 return 1;
1857
1858 return 0;
1859}
1860
1861static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1862{
1863 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1864
1865 /* if there's a quota refresh soon don't bother with slack */
1866 if (runtime_refresh_within(cfs_b, min_left))
1867 return;
1868
1869 start_bandwidth_timer(&cfs_b->slack_timer,
1870 ns_to_ktime(cfs_bandwidth_slack_period));
1871}
1872
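
With the constants above, slack redistribution is only worth arming when the period timer is not about to fire anyway: at least cfs_bandwidth_slack_period + min_bandwidth_expiration (5ms + 2ms) must remain before the next refresh. A trivial sketch of that decision, assuming the remaining time is already known:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC 1000000ULL

static const uint64_t min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
static const uint64_t slack_period             = 5 * NSEC_PER_MSEC;

/* arm the slack timer only if the period refresh is not imminent */
static int should_start_slack(uint64_t ns_until_refresh)
{
        return ns_until_refresh >= slack_period + min_bandwidth_expiration;
}

int main(void)
{
        printf("%d\n", should_start_slack(20 * NSEC_PER_MSEC)); /* 1: plenty of period left */
        printf("%d\n", should_start_slack(4 * NSEC_PER_MSEC));  /* 0: refresh is about to run */
        return 0;
}
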
1873/* we know any runtime found here is valid as update_curr() precedes return */
1874static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1875{
1876 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1877 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1878
1879 if (slack_runtime <= 0)
1880 return;
1881
1882 raw_spin_lock(&cfs_b->lock);
1883 if (cfs_b->quota != RUNTIME_INF &&
1884 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1885 cfs_b->runtime += slack_runtime;
1886
1887 /* we are under rq->lock, defer unthrottling using a timer */
1888 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1889 !list_empty(&cfs_b->throttled_cfs_rq))
1890 start_cfs_slack_bandwidth(cfs_b);
1891 }
1892 raw_spin_unlock(&cfs_b->lock);
1893
1894 /* even if it's not valid for return we don't want to try again */
1895 cfs_rq->runtime_remaining -= slack_runtime;
1896}
1897
1898static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1899{
1900 if (!cfs_bandwidth_used())
1901 return;
1902
1903 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1904 return;
1905
1906 __return_cfs_rq_runtime(cfs_rq);
1907}
1908
1909/*
1910 * This is done with a timer (instead of inline with bandwidth return) since
1911 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
1912 */
1913static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1914{
1915 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1916 u64 expires;
1917
1918 /* confirm we're still not at a refresh boundary */
1919 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
1920 return;
1921
1922 raw_spin_lock(&cfs_b->lock);
1923 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
1924 runtime = cfs_b->runtime;
1925 cfs_b->runtime = 0;
1926 }
1927 expires = cfs_b->runtime_expires;
1928 raw_spin_unlock(&cfs_b->lock);
1929
1930 if (!runtime)
1931 return;
1932
1933 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1934
1935 raw_spin_lock(&cfs_b->lock);
1936 if (expires == cfs_b->runtime_expires)
1937 cfs_b->runtime = runtime;
1938 raw_spin_unlock(&cfs_b->lock);
1939}
1940
1941/*
1942 * When a group wakes up we want to make sure that its quota is not already
1943 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 1944 * runtime, as update_curr() throttling cannot trigger until it's on-rq.
1945 */
1946static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1947{
1948 if (!cfs_bandwidth_used())
1949 return;
1950
1951 /* an active group must be handled by the update_curr()->put() path */
1952 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1953 return;
1954
1955 /* ensure the group is not already throttled */
1956 if (cfs_rq_throttled(cfs_rq))
1957 return;
1958
1959 /* update runtime allocation */
1960 account_cfs_rq_runtime(cfs_rq, 0);
1961 if (cfs_rq->runtime_remaining <= 0)
1962 throttle_cfs_rq(cfs_rq);
1963}
1964
1965/* conditionally throttle active cfs_rq's from put_prev_entity() */
1966static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1967{
1968 if (!cfs_bandwidth_used())
1969 return;
1970
1971 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1972 return;
1973
1974 /*
1975 * it's possible for a throttled entity to be forced into a running
 1976 * state (e.g. set_curr_task); in this case we're finished.
1977 */
1978 if (cfs_rq_throttled(cfs_rq))
1979 return;
1980
1981 throttle_cfs_rq(cfs_rq);
1982}
1983
1984static inline u64 default_cfs_period(void);
1985static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
1986static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
1987
1988static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
1989{
1990 struct cfs_bandwidth *cfs_b =
1991 container_of(timer, struct cfs_bandwidth, slack_timer);
1992 do_sched_cfs_slack_timer(cfs_b);
1993
1994 return HRTIMER_NORESTART;
1995}
1996
1997static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
1998{
1999 struct cfs_bandwidth *cfs_b =
2000 container_of(timer, struct cfs_bandwidth, period_timer);
2001 ktime_t now;
2002 int overrun;
2003 int idle = 0;
2004
2005 for (;;) {
2006 now = hrtimer_cb_get_time(timer);
2007 overrun = hrtimer_forward(timer, now, cfs_b->period);
2008
2009 if (!overrun)
2010 break;
2011
2012 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2013 }
2014
2015 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2016}
2017
2018void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2019{
2020 raw_spin_lock_init(&cfs_b->lock);
2021 cfs_b->runtime = 0;
2022 cfs_b->quota = RUNTIME_INF;
2023 cfs_b->period = ns_to_ktime(default_cfs_period());
2024
2025 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2026 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2027 cfs_b->period_timer.function = sched_cfs_period_timer;
2028 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2029 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2030}
2031
2032static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2033{
2034 cfs_rq->runtime_enabled = 0;
2035 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2036}
2037
2038/* requires cfs_b->lock, may release to reprogram timer */
2039void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2040{
2041 /*
2042 * The timer may be active because we're trying to set a new bandwidth
2043 * period or because we're racing with the tear-down path
2044 * (timer_active==0 becomes visible before the hrtimer call-back
2045 * terminates). In either case we ensure that it's re-programmed
2046 */
2047 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2048 raw_spin_unlock(&cfs_b->lock);
2049 /* ensure cfs_b->lock is available while we wait */
2050 hrtimer_cancel(&cfs_b->period_timer);
2051
2052 raw_spin_lock(&cfs_b->lock);
2053 /* if someone else restarted the timer then we're done */
2054 if (cfs_b->timer_active)
2055 return;
2056 }
2057
2058 cfs_b->timer_active = 1;
2059 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2060}
2061
2062static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2063{
2064 hrtimer_cancel(&cfs_b->period_timer);
2065 hrtimer_cancel(&cfs_b->slack_timer);
2066}
2067
2068void unthrottle_offline_cfs_rqs(struct rq *rq)
2069{
2070 struct cfs_rq *cfs_rq;
2071
2072 for_each_leaf_cfs_rq(rq, cfs_rq) {
2073 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2074
2075 if (!cfs_rq->runtime_enabled)
2076 continue;
2077
2078 /*
2079 * clock_task is not advancing so we just need to make sure
2080 * there's some valid quota amount
2081 */
2082 cfs_rq->runtime_remaining = cfs_b->quota;
2083 if (cfs_rq_throttled(cfs_rq))
2084 unthrottle_cfs_rq(cfs_rq);
2085 }
2086}
2087
2088#else /* CONFIG_CFS_BANDWIDTH */
2089static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2090 unsigned long delta_exec) {}
2091static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2092static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2093static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2094
2095static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2096{
2097 return 0;
2098}
2099
2100static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2101{
2102 return 0;
2103}
2104
2105static inline int throttled_lb_pair(struct task_group *tg,
2106 int src_cpu, int dest_cpu)
2107{
2108 return 0;
2109}
2110
2111void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2112
2113#ifdef CONFIG_FAIR_GROUP_SCHED
2114static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2115#endif
2116
2117static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2118{
2119 return NULL;
2120}
2121static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2122void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2123
2124#endif /* CONFIG_CFS_BANDWIDTH */
2125
1240/************************************************** 2126/**************************************************
1241 * CFS operations on tasks: 2127 * CFS operations on tasks:
1242 */ 2128 */
@@ -1249,7 +2135,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
1249 2135
1250 WARN_ON(task_rq(p) != rq); 2136 WARN_ON(task_rq(p) != rq);
1251 2137
1252 if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { 2138 if (cfs_rq->nr_running > 1) {
1253 u64 slice = sched_slice(cfs_rq, se); 2139 u64 slice = sched_slice(cfs_rq, se);
1254 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 2140 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1255 s64 delta = slice - ran; 2141 s64 delta = slice - ran;
@@ -1280,7 +2166,7 @@ static void hrtick_update(struct rq *rq)
1280{ 2166{
1281 struct task_struct *curr = rq->curr; 2167 struct task_struct *curr = rq->curr;
1282 2168
1283 if (curr->sched_class != &fair_sched_class) 2169 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
1284 return; 2170 return;
1285 2171
1286 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) 2172 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
@@ -1313,16 +2199,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1313 break; 2199 break;
1314 cfs_rq = cfs_rq_of(se); 2200 cfs_rq = cfs_rq_of(se);
1315 enqueue_entity(cfs_rq, se, flags); 2201 enqueue_entity(cfs_rq, se, flags);
2202
2203 /*
2204 * end evaluation on encountering a throttled cfs_rq
2205 *
2206 * note: in the case of encountering a throttled cfs_rq we will
2207 * post the final h_nr_running increment below.
2208 */
2209 if (cfs_rq_throttled(cfs_rq))
2210 break;
2211 cfs_rq->h_nr_running++;
2212
1316 flags = ENQUEUE_WAKEUP; 2213 flags = ENQUEUE_WAKEUP;
1317 } 2214 }
1318 2215
1319 for_each_sched_entity(se) { 2216 for_each_sched_entity(se) {
1320 cfs_rq = cfs_rq_of(se); 2217 cfs_rq = cfs_rq_of(se);
2218 cfs_rq->h_nr_running++;
2219
2220 if (cfs_rq_throttled(cfs_rq))
2221 break;
1321 2222
1322 update_cfs_load(cfs_rq, 0); 2223 update_cfs_load(cfs_rq, 0);
1323 update_cfs_shares(cfs_rq); 2224 update_cfs_shares(cfs_rq);
1324 } 2225 }
1325 2226
2227 if (!se)
2228 inc_nr_running(rq);
1326 hrtick_update(rq); 2229 hrtick_update(rq);
1327} 2230}
1328 2231
@@ -1343,6 +2246,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1343 cfs_rq = cfs_rq_of(se); 2246 cfs_rq = cfs_rq_of(se);
1344 dequeue_entity(cfs_rq, se, flags); 2247 dequeue_entity(cfs_rq, se, flags);
1345 2248
2249 /*
2250 * end evaluation on encountering a throttled cfs_rq
2251 *
2252 * note: in the case of encountering a throttled cfs_rq we will
2253 * post the final h_nr_running decrement below.
2254 */
2255 if (cfs_rq_throttled(cfs_rq))
2256 break;
2257 cfs_rq->h_nr_running--;
2258
1346 /* Don't dequeue parent if it has other entities besides us */ 2259 /* Don't dequeue parent if it has other entities besides us */
1347 if (cfs_rq->load.weight) { 2260 if (cfs_rq->load.weight) {
1348 /* 2261 /*
@@ -1361,15 +2274,76 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1361 2274
1362 for_each_sched_entity(se) { 2275 for_each_sched_entity(se) {
1363 cfs_rq = cfs_rq_of(se); 2276 cfs_rq = cfs_rq_of(se);
2277 cfs_rq->h_nr_running--;
2278
2279 if (cfs_rq_throttled(cfs_rq))
2280 break;
1364 2281
1365 update_cfs_load(cfs_rq, 0); 2282 update_cfs_load(cfs_rq, 0);
1366 update_cfs_shares(cfs_rq); 2283 update_cfs_shares(cfs_rq);
1367 } 2284 }
1368 2285
2286 if (!se)
2287 dec_nr_running(rq);
1369 hrtick_update(rq); 2288 hrtick_update(rq);
1370} 2289}
1371 2290
1372#ifdef CONFIG_SMP 2291#ifdef CONFIG_SMP
2292/* Used instead of source_load when we know the type == 0 */
2293static unsigned long weighted_cpuload(const int cpu)
2294{
2295 return cpu_rq(cpu)->load.weight;
2296}
2297
2298/*
2299 * Return a low guess at the load of a migration-source cpu weighted
2300 * according to the scheduling class and "nice" value.
2301 *
2302 * We want to under-estimate the load of migration sources, to
2303 * balance conservatively.
2304 */
2305static unsigned long source_load(int cpu, int type)
2306{
2307 struct rq *rq = cpu_rq(cpu);
2308 unsigned long total = weighted_cpuload(cpu);
2309
2310 if (type == 0 || !sched_feat(LB_BIAS))
2311 return total;
2312
2313 return min(rq->cpu_load[type-1], total);
2314}
2315
2316/*
2317 * Return a high guess at the load of a migration-target cpu weighted
2318 * according to the scheduling class and "nice" value.
2319 */
2320static unsigned long target_load(int cpu, int type)
2321{
2322 struct rq *rq = cpu_rq(cpu);
2323 unsigned long total = weighted_cpuload(cpu);
2324
2325 if (type == 0 || !sched_feat(LB_BIAS))
2326 return total;
2327
2328 return max(rq->cpu_load[type-1], total);
2329}
2330
2331static unsigned long power_of(int cpu)
2332{
2333 return cpu_rq(cpu)->cpu_power;
2334}
2335
2336static unsigned long cpu_avg_load_per_task(int cpu)
2337{
2338 struct rq *rq = cpu_rq(cpu);
2339 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2340
2341 if (nr_running)
2342 return rq->load.weight / nr_running;
2343
2344 return 0;
2345}
2346
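
source_load() and target_load() deliberately under- and over-estimate a cpu's load by taking the min (respectively max) of the instantaneous runqueue weight and the decayed cpu_load[] sample, which biases the balancer toward leaving things alone in ambiguous cases. A user-space sketch of that biasing with the load-index history collapsed to a single sample:

#include <stdio.h>

/* conservative low estimate for a migration source */
static unsigned long src_load(unsigned long now, unsigned long decayed)
{
        return now < decayed ? now : decayed;
}

/* conservative high estimate for a migration target */
static unsigned long dst_load(unsigned long now, unsigned long decayed)
{
        return now > decayed ? now : decayed;
}

int main(void)
{
        /* instantaneous weight 1800, decayed history says 2300 */
        printf("source=%lu target=%lu\n",
               src_load(1800, 2300), dst_load(1800, 2300));     /* source=1800 target=2300 */
        return 0;
}
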
1373 2347
1374static void task_waking_fair(struct task_struct *p) 2348static void task_waking_fair(struct task_struct *p)
1375{ 2349{
@@ -1399,42 +2373,105 @@ static void task_waking_fair(struct task_struct *p)
1399 * Adding load to a group doesn't make a group heavier, but can cause movement 2373 * Adding load to a group doesn't make a group heavier, but can cause movement
1400 * of group shares between cpus. Assuming the shares were perfectly aligned one 2374 * of group shares between cpus. Assuming the shares were perfectly aligned one
1401 * can calculate the shift in shares. 2375 * can calculate the shift in shares.
2376 *
2377 * Calculate the effective load difference if @wl is added (subtracted) to @tg
2378 * on this @cpu and results in a total addition (subtraction) of @wg to the
2379 * total group weight.
2380 *
2381 * Given a runqueue weight distribution (rw_i) we can compute a shares
2382 * distribution (s_i) using:
2383 *
2384 * s_i = rw_i / \Sum rw_j (1)
2385 *
2386 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
2387 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
2388 * shares distribution (s_i):
2389 *
2390 * rw_i = { 2, 4, 1, 0 }
2391 * s_i = { 2/7, 4/7, 1/7, 0 }
2392 *
2393 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
 2394 * task used to run on and the CPU the waker is running on), so we need to
2395 * compute the effect of waking a task on either CPU and, in case of a sync
2396 * wakeup, compute the effect of the current task going to sleep.
2397 *
2398 * So for a change of @wl to the local @cpu with an overall group weight change
 2399 * of @wg we can compute the new shares distribution (s'_i) using:
2400 *
2401 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
2402 *
2403 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
2404 * differences in waking a task to CPU 0. The additional task changes the
2405 * weight and shares distributions like:
2406 *
2407 * rw'_i = { 3, 4, 1, 0 }
2408 * s'_i = { 3/8, 4/8, 1/8, 0 }
2409 *
2410 * We can then compute the difference in effective weight by using:
2411 *
2412 * dw_i = S * (s'_i - s_i) (3)
2413 *
2414 * Where 'S' is the group weight as seen by its parent.
2415 *
2416 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
2417 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
2418 * 4/7) times the weight of the group.
1402 */ 2419 */
1403static long effective_load(struct task_group *tg, int cpu, long wl, long wg) 2420static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1404{ 2421{
1405 struct sched_entity *se = tg->se[cpu]; 2422 struct sched_entity *se = tg->se[cpu];
1406 2423
1407 if (!tg->parent) 2424 if (!tg->parent) /* the trivial, non-cgroup case */
1408 return wl; 2425 return wl;
1409 2426
1410 for_each_sched_entity(se) { 2427 for_each_sched_entity(se) {
1411 long lw, w; 2428 long w, W;
1412 2429
1413 tg = se->my_q->tg; 2430 tg = se->my_q->tg;
1414 w = se->my_q->load.weight;
1415 2431
1416 /* use this cpu's instantaneous contribution */ 2432 /*
1417 lw = atomic_read(&tg->load_weight); 2433 * W = @wg + \Sum rw_j
1418 lw -= se->my_q->load_contribution; 2434 */
1419 lw += w + wg; 2435 W = wg + calc_tg_weight(tg, se->my_q);
1420 2436
1421 wl += w; 2437 /*
2438 * w = rw_i + @wl
2439 */
2440 w = se->my_q->load.weight + wl;
1422 2441
1423 if (lw > 0 && wl < lw) 2442 /*
1424 wl = (wl * tg->shares) / lw; 2443 * wl = S * s'_i; see (2)
2444 */
2445 if (W > 0 && w < W)
2446 wl = (w * tg->shares) / W;
1425 else 2447 else
1426 wl = tg->shares; 2448 wl = tg->shares;
1427 2449
1428 /* zero point is MIN_SHARES */ 2450 /*
2451 * Per the above, wl is the new se->load.weight value; since
2452 * those are clipped to [MIN_SHARES, ...) do so now. See
2453 * calc_cfs_shares().
2454 */
1429 if (wl < MIN_SHARES) 2455 if (wl < MIN_SHARES)
1430 wl = MIN_SHARES; 2456 wl = MIN_SHARES;
2457
2458 /*
2459 * wl = dw_i = S * (s'_i - s_i); see (3)
2460 */
1431 wl -= se->load.weight; 2461 wl -= se->load.weight;
2462
2463 /*
2464 * Recursively apply this logic to all parent groups to compute
2465 * the final effective load change on the root group. Since
2466 * only the @tg group gets extra weight, all parent groups can
2467 * only redistribute existing shares. @wl is the shift in shares
2468 * resulting from this level per the above.
2469 */
1432 wg = 0; 2470 wg = 0;
1433 } 2471 }
1434 2472
1435 return wl; 2473 return wl;
1436} 2474}
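
The worked example in the comment above can be checked numerically: with rw_i = {2, 4, 1, 0} and one extra task on CPU 0, the share deltas are 3/8 - 2/7 = 5/56 and 4/8 - 4/7 = -4/56. A short user-space check of formulas (1)-(3), treating every task as weight 1 and picking S = 56 so the per-cpu deltas print as whole 56ths:

#include <stdio.h>

int main(void)
{
        double rw[4]  = { 2, 4, 1, 0 };         /* per-cpu runqueue weights, in tasks */
        double sum    = 2 + 4 + 1 + 0;          /* \Sum rw_j = 7 */
        double wl = 1, wg = 1;                  /* one task of weight 1 arrives on cpu 0 */
        double S  = 56;                         /* group weight as seen by the parent */

        for (int i = 0; i < 4; i++) {
                double s  = rw[i] / sum;                                /* (1) */
                double sp = (rw[i] + (i == 0 ? wl : 0)) / (sum + wg);   /* (2) */

                printf("cpu%d: dw = %+.0f/56\n", i, S * (sp - s));      /* (3) */
        }
        return 0;
}

This prints +5/56 for CPU 0 and -4/56 for CPU 1, matching the fractions quoted in the comment.
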
1437
1438#else 2475#else
1439 2476
1440static inline unsigned long effective_load(struct task_group *tg, int cpu, 2477static inline unsigned long effective_load(struct task_group *tg, int cpu,
@@ -1547,7 +2584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1547 2584
1548 /* Skip over this group if it has no CPUs allowed */ 2585 /* Skip over this group if it has no CPUs allowed */
1549 if (!cpumask_intersects(sched_group_cpus(group), 2586 if (!cpumask_intersects(sched_group_cpus(group),
1550 &p->cpus_allowed)) 2587 tsk_cpus_allowed(p)))
1551 continue; 2588 continue;
1552 2589
1553 local_group = cpumask_test_cpu(this_cpu, 2590 local_group = cpumask_test_cpu(this_cpu,
@@ -1593,7 +2630,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1593 int i; 2630 int i;
1594 2631
1595 /* Traverse only the allowed CPUs */ 2632 /* Traverse only the allowed CPUs */
1596 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { 2633 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
1597 load = weighted_cpuload(i); 2634 load = weighted_cpuload(i);
1598 2635
1599 if (load < min_load || (load == min_load && i == this_cpu)) { 2636 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1613,6 +2650,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
1613 int cpu = smp_processor_id(); 2650 int cpu = smp_processor_id();
1614 int prev_cpu = task_cpu(p); 2651 int prev_cpu = task_cpu(p);
1615 struct sched_domain *sd; 2652 struct sched_domain *sd;
2653 struct sched_group *sg;
1616 int i; 2654 int i;
1617 2655
1618 /* 2656 /*
@@ -1633,25 +2671,28 @@ static int select_idle_sibling(struct task_struct *p, int target)
 1633 * Otherwise, iterate the domains and find an eligible idle cpu. 2671 * Otherwise, iterate the domains and find an eligible idle cpu.
1634 */ 2672 */
1635 rcu_read_lock(); 2673 rcu_read_lock();
1636 for_each_domain(target, sd) {
1637 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1638 break;
1639 2674
1640 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 2675 sd = rcu_dereference(per_cpu(sd_llc, target));
1641 if (idle_cpu(i)) { 2676 for_each_lower_domain(sd) {
1642 target = i; 2677 sg = sd->groups;
1643 break; 2678 do {
2679 if (!cpumask_intersects(sched_group_cpus(sg),
2680 tsk_cpus_allowed(p)))
2681 goto next;
2682
2683 for_each_cpu(i, sched_group_cpus(sg)) {
2684 if (!idle_cpu(i))
2685 goto next;
1644 } 2686 }
1645 }
1646 2687
1647 /* 2688 target = cpumask_first_and(sched_group_cpus(sg),
1648 * Lets stop looking for an idle sibling when we reached 2689 tsk_cpus_allowed(p));
1649 * the domain that spans the current cpu and prev_cpu. 2690 goto done;
1650 */ 2691next:
1651 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && 2692 sg = sg->next;
1652 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 2693 } while (sg != sd->groups);
1653 break;
1654 } 2694 }
2695done:
1655 rcu_read_unlock(); 2696 rcu_read_unlock();
1656 2697
1657 return target; 2698 return target;
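
The rewritten select_idle_sibling() now walks the groups of the last-level-cache domain and settles on a group only when every one of its cpus is idle, returning the first cpu of that group (the real code also filters by the task's allowed mask, omitted here). A toy user-space version of that scan with groups and idle state flattened into arrays:

#include <stdio.h>

#define CPUS_PER_GROUP 2

/* first cpu of the first group whose cpus are all idle; -1 if there is none */
static int pick_idle_group_cpu(int groups[][CPUS_PER_GROUP], int ngroups,
                               const int idle[])
{
        for (int g = 0; g < ngroups; g++) {
                int all_idle = 1;

                for (int i = 0; i < CPUS_PER_GROUP; i++)
                        if (!idle[groups[g][i]])
                                all_idle = 0;
                if (all_idle)
                        return groups[g][0];
        }
        return -1;
}

int main(void)
{
        int groups[3][CPUS_PER_GROUP] = { {0, 1}, {2, 3}, {4, 5} }; /* SMT pairs in one LLC */
        int idle[6]                   = { 0, 1, 1, 1, 0, 1 };       /* cpu0 and cpu4 are busy */

        printf("target=%d\n", pick_idle_group_cpu(groups, 3, idle)); /* target=2 */
        return 0;
}
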
@@ -1679,8 +2720,11 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1679 int want_sd = 1; 2720 int want_sd = 1;
1680 int sync = wake_flags & WF_SYNC; 2721 int sync = wake_flags & WF_SYNC;
1681 2722
2723 if (p->rt.nr_cpus_allowed == 1)
2724 return prev_cpu;
2725
1682 if (sd_flag & SD_BALANCE_WAKE) { 2726 if (sd_flag & SD_BALANCE_WAKE) {
1683 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) 2727 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1684 want_affine = 1; 2728 want_affine = 1;
1685 new_cpu = prev_cpu; 2729 new_cpu = prev_cpu;
1686 } 2730 }
@@ -1875,6 +2919,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1875 if (unlikely(se == pse)) 2919 if (unlikely(se == pse))
1876 return; 2920 return;
1877 2921
2922 /*
2923 * This is possible from callers such as pull_task(), in which we
 2922 * unconditionally check_preempt_curr() after an enqueue (which may have
 2923 * led to a throttle). This both saves work and prevents false
2926 * next-buddy nomination below.
2927 */
2928 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
2929 return;
2930
1878 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { 2931 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1879 set_next_buddy(pse); 2932 set_next_buddy(pse);
1880 next_buddy_marked = 1; 2933 next_buddy_marked = 1;
@@ -1883,6 +2936,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1883 /* 2936 /*
1884 * We can come here with TIF_NEED_RESCHED already set from new task 2937 * We can come here with TIF_NEED_RESCHED already set from new task
1885 * wake up path. 2938 * wake up path.
2939 *
2940 * Note: this also catches the edge-case of curr being in a throttled
2941 * group (e.g. via set_curr_task), since update_curr() (in the
2942 * enqueue of curr) will have resulted in resched being set. This
2943 * prevents us from potentially nominating it as a false LAST_BUDDY
2944 * below.
1886 */ 2945 */
1887 if (test_tsk_need_resched(curr)) 2946 if (test_tsk_need_resched(curr))
1888 return; 2947 return;
@@ -1899,10 +2958,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1899 if (unlikely(p->policy != SCHED_NORMAL)) 2958 if (unlikely(p->policy != SCHED_NORMAL))
1900 return; 2959 return;
1901 2960
1902
1903 if (!sched_feat(WAKEUP_PREEMPT))
1904 return;
1905
1906 find_matching_se(&se, &pse); 2961 find_matching_se(&se, &pse);
1907 update_curr(cfs_rq_of(se)); 2962 update_curr(cfs_rq_of(se));
1908 BUG_ON(!pse); 2963 BUG_ON(!pse);
@@ -1952,7 +3007,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1952 } while (cfs_rq); 3007 } while (cfs_rq);
1953 3008
1954 p = task_of(se); 3009 p = task_of(se);
1955 hrtick_start_fair(rq, p); 3010 if (hrtick_enabled(rq))
3011 hrtick_start_fair(rq, p);
1956 3012
1957 return p; 3013 return p;
1958} 3014}
@@ -1996,6 +3052,12 @@ static void yield_task_fair(struct rq *rq)
1996 * Update run-time statistics of the 'current'. 3052 * Update run-time statistics of the 'current'.
1997 */ 3053 */
1998 update_curr(cfs_rq); 3054 update_curr(cfs_rq);
3055 /*
3056 * Tell update_rq_clock() that we've just updated,
3057 * so we don't do microscopic update in schedule()
3058 * and double the fastpath cost.
3059 */
3060 rq->skip_clock_update = 1;
1999 } 3061 }
2000 3062
2001 set_skip_buddy(se); 3063 set_skip_buddy(se);
@@ -2005,7 +3067,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
2005{ 3067{
2006 struct sched_entity *se = &p->se; 3068 struct sched_entity *se = &p->se;
2007 3069
2008 if (!se->on_rq) 3070 /* throttled hierarchies are not runnable */
3071 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
2009 return false; 3072 return false;
2010 3073
2011 /* Tell the scheduler that we'd really like pse to run next. */ 3074 /* Tell the scheduler that we'd really like pse to run next. */
@@ -2035,12 +3098,50 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2035} 3098}
2036 3099
2037/* 3100/*
3101 * Is this task likely cache-hot:
3102 */
3103static int
3104task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3105{
3106 s64 delta;
3107
3108 if (p->sched_class != &fair_sched_class)
3109 return 0;
3110
3111 if (unlikely(p->policy == SCHED_IDLE))
3112 return 0;
3113
3114 /*
3115 * Buddy candidates are cache hot:
3116 */
3117 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3118 (&p->se == cfs_rq_of(&p->se)->next ||
3119 &p->se == cfs_rq_of(&p->se)->last))
3120 return 1;
3121
3122 if (sysctl_sched_migration_cost == -1)
3123 return 1;
3124 if (sysctl_sched_migration_cost == 0)
3125 return 0;
3126
3127 delta = now - p->se.exec_start;
3128
3129 return delta < (s64)sysctl_sched_migration_cost;
3130}
3131
3132#define LBF_ALL_PINNED 0x01
3133#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */
3134#define LBF_HAD_BREAK 0x04
3135#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */
3136#define LBF_ABORT 0x10
3137
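
The NEED_BREAK/HAD_BREAK encoding lets the retry path in load_balance() (further down in this patch) turn a pending break into a counted one with a single add: lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK clears the request bit and bumps the two-bit HAD_BREAK counter, and once LBF_HAD_BREAKS is saturated the next add carries into LBF_ABORT. A standalone demonstration of that flag arithmetic:

#include <stdio.h>

#define LBF_ALL_PINNED  0x01
#define LBF_NEED_BREAK  0x02
#define LBF_HAD_BREAK   0x04
#define LBF_HAD_BREAKS  0x0C
#define LBF_ABORT       0x10

int main(void)
{
        int lb_flags = 0;

        for (int pass = 1; pass <= 4; pass++) {
                lb_flags |= LBF_NEED_BREAK;                     /* balancing hit sysctl_sched_nr_migrate */
                lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;     /* record the break and retry */
                printf("pass %d: flags=0x%02x abort=%d\n",
                       pass, lb_flags, !!(lb_flags & LBF_ABORT));
        }
        return 0;
}

Three recorded breaks fill LBF_HAD_BREAKS (0x0c); the fourth carries into LBF_ABORT and the balance attempt gives up.
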
3138/*
2038 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3139 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2039 */ 3140 */
2040static 3141static
2041int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 3142int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2042 struct sched_domain *sd, enum cpu_idle_type idle, 3143 struct sched_domain *sd, enum cpu_idle_type idle,
2043 int *all_pinned) 3144 int *lb_flags)
2044{ 3145{
2045 int tsk_cache_hot = 0; 3146 int tsk_cache_hot = 0;
2046 /* 3147 /*
@@ -2049,11 +3150,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2049 * 2) cannot be migrated to this CPU due to cpus_allowed, or 3150 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2050 * 3) are cache-hot on their current CPU. 3151 * 3) are cache-hot on their current CPU.
2051 */ 3152 */
2052 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 3153 if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
2053 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3154 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
2054 return 0; 3155 return 0;
2055 } 3156 }
2056 *all_pinned = 0; 3157 *lb_flags &= ~LBF_ALL_PINNED;
2057 3158
2058 if (task_running(rq, p)) { 3159 if (task_running(rq, p)) {
2059 schedstat_inc(p, se.statistics.nr_failed_migrations_running); 3160 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
@@ -2102,6 +3203,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2102 3203
2103 for_each_leaf_cfs_rq(busiest, cfs_rq) { 3204 for_each_leaf_cfs_rq(busiest, cfs_rq) {
2104 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { 3205 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
3206 if (throttled_lb_pair(task_group(p),
3207 busiest->cpu, this_cpu))
3208 break;
2105 3209
2106 if (!can_migrate_task(p, busiest, this_cpu, 3210 if (!can_migrate_task(p, busiest, this_cpu,
2107 sd, idle, &pinned)) 3211 sd, idle, &pinned))
@@ -2124,7 +3228,7 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2124static unsigned long 3228static unsigned long
2125balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3229balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2126 unsigned long max_load_move, struct sched_domain *sd, 3230 unsigned long max_load_move, struct sched_domain *sd,
2127 enum cpu_idle_type idle, int *all_pinned, 3231 enum cpu_idle_type idle, int *lb_flags,
2128 struct cfs_rq *busiest_cfs_rq) 3232 struct cfs_rq *busiest_cfs_rq)
2129{ 3233{
2130 int loops = 0, pulled = 0; 3234 int loops = 0, pulled = 0;
@@ -2135,12 +3239,14 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2135 goto out; 3239 goto out;
2136 3240
2137 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 3241 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
2138 if (loops++ > sysctl_sched_nr_migrate) 3242 if (loops++ > sysctl_sched_nr_migrate) {
3243 *lb_flags |= LBF_NEED_BREAK;
2139 break; 3244 break;
3245 }
2140 3246
2141 if ((p->se.load.weight >> 1) > rem_load_move || 3247 if ((p->se.load.weight >> 1) > rem_load_move ||
2142 !can_migrate_task(p, busiest, this_cpu, sd, idle, 3248 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2143 all_pinned)) 3249 lb_flags))
2144 continue; 3250 continue;
2145 3251
2146 pull_task(busiest, p, this_rq, this_cpu); 3252 pull_task(busiest, p, this_rq, this_cpu);
@@ -2153,8 +3259,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2153 * kernels will stop after the first task is pulled to minimize 3259 * kernels will stop after the first task is pulled to minimize
2154 * the critical section. 3260 * the critical section.
2155 */ 3261 */
2156 if (idle == CPU_NEWLY_IDLE) 3262 if (idle == CPU_NEWLY_IDLE) {
3263 *lb_flags |= LBF_ABORT;
2157 break; 3264 break;
3265 }
2158#endif 3266#endif
2159 3267
2160 /* 3268 /*
@@ -2217,8 +3325,13 @@ static void update_shares(int cpu)
2217 * Iterates the task_group tree in a bottom up fashion, see 3325 * Iterates the task_group tree in a bottom up fashion, see
2218 * list_add_leaf_cfs_rq() for details. 3326 * list_add_leaf_cfs_rq() for details.
2219 */ 3327 */
2220 for_each_leaf_cfs_rq(rq, cfs_rq) 3328 for_each_leaf_cfs_rq(rq, cfs_rq) {
3329 /* throttled entities do not contribute to load */
3330 if (throttled_hierarchy(cfs_rq))
3331 continue;
3332
2221 update_shares_cpu(cfs_rq->tg, cpu); 3333 update_shares_cpu(cfs_rq->tg, cpu);
3334 }
2222 rcu_read_unlock(); 3335 rcu_read_unlock();
2223} 3336}
2224 3337
@@ -2254,7 +3367,7 @@ static unsigned long
2254load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 3367load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2255 unsigned long max_load_move, 3368 unsigned long max_load_move,
2256 struct sched_domain *sd, enum cpu_idle_type idle, 3369 struct sched_domain *sd, enum cpu_idle_type idle,
2257 int *all_pinned) 3370 int *lb_flags)
2258{ 3371{
2259 long rem_load_move = max_load_move; 3372 long rem_load_move = max_load_move;
2260 struct cfs_rq *busiest_cfs_rq; 3373 struct cfs_rq *busiest_cfs_rq;
@@ -2267,17 +3380,21 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2267 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 3380 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
2268 u64 rem_load, moved_load; 3381 u64 rem_load, moved_load;
2269 3382
3383 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3384 break;
3385
2270 /* 3386 /*
2271 * empty group 3387 * empty group or part of a throttled hierarchy
2272 */ 3388 */
2273 if (!busiest_cfs_rq->task_weight) 3389 if (!busiest_cfs_rq->task_weight ||
3390 throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
2274 continue; 3391 continue;
2275 3392
2276 rem_load = (u64)rem_load_move * busiest_weight; 3393 rem_load = (u64)rem_load_move * busiest_weight;
2277 rem_load = div_u64(rem_load, busiest_h_load + 1); 3394 rem_load = div_u64(rem_load, busiest_h_load + 1);
2278 3395
2279 moved_load = balance_tasks(this_rq, this_cpu, busiest, 3396 moved_load = balance_tasks(this_rq, this_cpu, busiest,
2280 rem_load, sd, idle, all_pinned, 3397 rem_load, sd, idle, lb_flags,
2281 busiest_cfs_rq); 3398 busiest_cfs_rq);
2282 3399
2283 if (!moved_load) 3400 if (!moved_load)
@@ -2303,10 +3420,10 @@ static unsigned long
2303load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 3420load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2304 unsigned long max_load_move, 3421 unsigned long max_load_move,
2305 struct sched_domain *sd, enum cpu_idle_type idle, 3422 struct sched_domain *sd, enum cpu_idle_type idle,
2306 int *all_pinned) 3423 int *lb_flags)
2307{ 3424{
2308 return balance_tasks(this_rq, this_cpu, busiest, 3425 return balance_tasks(this_rq, this_cpu, busiest,
2309 max_load_move, sd, idle, all_pinned, 3426 max_load_move, sd, idle, lb_flags,
2310 &busiest->cfs); 3427 &busiest->cfs);
2311} 3428}
2312#endif 3429#endif
@@ -2321,29 +3438,30 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2321static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3438static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2322 unsigned long max_load_move, 3439 unsigned long max_load_move,
2323 struct sched_domain *sd, enum cpu_idle_type idle, 3440 struct sched_domain *sd, enum cpu_idle_type idle,
2324 int *all_pinned) 3441 int *lb_flags)
2325{ 3442{
2326 unsigned long total_load_moved = 0, load_moved; 3443 unsigned long total_load_moved = 0, load_moved;
2327 3444
2328 do { 3445 do {
2329 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 3446 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2330 max_load_move - total_load_moved, 3447 max_load_move - total_load_moved,
2331 sd, idle, all_pinned); 3448 sd, idle, lb_flags);
2332 3449
2333 total_load_moved += load_moved; 3450 total_load_moved += load_moved;
2334 3451
3452 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3453 break;
3454
2335#ifdef CONFIG_PREEMPT 3455#ifdef CONFIG_PREEMPT
2336 /* 3456 /*
2337 * NEWIDLE balancing is a source of latency, so preemptible 3457 * NEWIDLE balancing is a source of latency, so preemptible
2338 * kernels will stop after the first task is pulled to minimize 3458 * kernels will stop after the first task is pulled to minimize
2339 * the critical section. 3459 * the critical section.
2340 */ 3460 */
2341 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3461 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
2342 break; 3462 *lb_flags |= LBF_ABORT;
2343
2344 if (raw_spin_is_contended(&this_rq->lock) ||
2345 raw_spin_is_contended(&busiest->lock))
2346 break; 3463 break;
3464 }
2347#endif 3465#endif
2348 } while (load_moved && max_load_move > total_load_moved); 3466 } while (load_moved && max_load_move > total_load_moved);
2349 3467
@@ -2405,15 +3523,6 @@ struct sg_lb_stats {
2405}; 3523};
2406 3524
2407/** 3525/**
2408 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
2409 * @group: The group whose first cpu is to be returned.
2410 */
2411static inline unsigned int group_first_cpu(struct sched_group *group)
2412{
2413 return cpumask_first(sched_group_cpus(group));
2414}
2415
2416/**
2417 * get_sd_load_idx - Obtain the load index for a given sched domain. 3526 * get_sd_load_idx - Obtain the load index for a given sched domain.
2418 * @sd: The sched_domain whose load_idx is to be obtained. 3527 * @sd: The sched_domain whose load_idx is to be obtained.
 2419 * @idle: The Idle status of the CPU for whose sd load_idx is obtained. 3528 * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
@@ -2662,7 +3771,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2662 sdg->sgp->power = power; 3771 sdg->sgp->power = power;
2663} 3772}
2664 3773
2665static void update_group_power(struct sched_domain *sd, int cpu) 3774void update_group_power(struct sched_domain *sd, int cpu)
2666{ 3775{
2667 struct sched_domain *child = sd->child; 3776 struct sched_domain *child = sd->child;
2668 struct sched_group *group, *sdg = sd->groups; 3777 struct sched_group *group, *sdg = sd->groups;
@@ -2854,7 +3963,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
2854} 3963}
2855 3964
2856/** 3965/**
2857 * update_sd_lb_stats - Update sched_group's statistics for load balancing. 3966 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
2858 * @sd: sched_domain whose statistics are to be updated. 3967 * @sd: sched_domain whose statistics are to be updated.
2859 * @this_cpu: Cpu for which load balance is currently performed. 3968 * @this_cpu: Cpu for which load balance is currently performed.
2860 * @idle: Idle status of this_cpu 3969 * @idle: Idle status of this_cpu
@@ -2928,11 +4037,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2928 } while (sg != sd->groups); 4037 } while (sg != sd->groups);
2929} 4038}
2930 4039
2931int __weak arch_sd_sibling_asym_packing(void)
2932{
2933 return 0*SD_ASYM_PACKING;
2934}
2935
2936/** 4040/**
2937 * check_asym_packing - Check to see if the group is packed into the 4041 * check_asym_packing - Check to see if the group is packed into the
2938 * sched domain. 4042 * sched domain.
@@ -3296,7 +4400,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3296#define MAX_PINNED_INTERVAL 512 4400#define MAX_PINNED_INTERVAL 512
3297 4401
3298/* Working cpumask for load_balance and load_balance_newidle. */ 4402/* Working cpumask for load_balance and load_balance_newidle. */
3299static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4403DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3300 4404
3301static int need_active_balance(struct sched_domain *sd, int idle, 4405static int need_active_balance(struct sched_domain *sd, int idle,
3302 int busiest_cpu, int this_cpu) 4406 int busiest_cpu, int this_cpu)
@@ -3347,7 +4451,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3347 struct sched_domain *sd, enum cpu_idle_type idle, 4451 struct sched_domain *sd, enum cpu_idle_type idle,
3348 int *balance) 4452 int *balance)
3349{ 4453{
3350 int ld_moved, all_pinned = 0, active_balance = 0; 4454 int ld_moved, lb_flags = 0, active_balance = 0;
3351 struct sched_group *group; 4455 struct sched_group *group;
3352 unsigned long imbalance; 4456 unsigned long imbalance;
3353 struct rq *busiest; 4457 struct rq *busiest;
@@ -3388,11 +4492,11 @@ redo:
3388 * still unbalanced. ld_moved simply stays zero, so it is 4492 * still unbalanced. ld_moved simply stays zero, so it is
3389 * correctly treated as an imbalance. 4493 * correctly treated as an imbalance.
3390 */ 4494 */
3391 all_pinned = 1; 4495 lb_flags |= LBF_ALL_PINNED;
3392 local_irq_save(flags); 4496 local_irq_save(flags);
3393 double_rq_lock(this_rq, busiest); 4497 double_rq_lock(this_rq, busiest);
3394 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4498 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3395 imbalance, sd, idle, &all_pinned); 4499 imbalance, sd, idle, &lb_flags);
3396 double_rq_unlock(this_rq, busiest); 4500 double_rq_unlock(this_rq, busiest);
3397 local_irq_restore(flags); 4501 local_irq_restore(flags);
3398 4502
@@ -3402,8 +4506,18 @@ redo:
3402 if (ld_moved && this_cpu != smp_processor_id()) 4506 if (ld_moved && this_cpu != smp_processor_id())
3403 resched_cpu(this_cpu); 4507 resched_cpu(this_cpu);
3404 4508
4509 if (lb_flags & LBF_ABORT)
4510 goto out_balanced;
4511
4512 if (lb_flags & LBF_NEED_BREAK) {
4513 lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
4514 if (lb_flags & LBF_ABORT)
4515 goto out_balanced;
4516 goto redo;
4517 }
4518
3405 /* All tasks on this runqueue were pinned by CPU affinity */ 4519 /* All tasks on this runqueue were pinned by CPU affinity */
3406 if (unlikely(all_pinned)) { 4520 if (unlikely(lb_flags & LBF_ALL_PINNED)) {
3407 cpumask_clear_cpu(cpu_of(busiest), cpus); 4521 cpumask_clear_cpu(cpu_of(busiest), cpus);
3408 if (!cpumask_empty(cpus)) 4522 if (!cpumask_empty(cpus))
3409 goto redo; 4523 goto redo;
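
The lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK statement above clears the break request and counts it in the same addition; once enough breaks have accumulated, the carry lands in LBF_ABORT, which is presumably why the flag is re-tested immediately afterwards. The sketch below demonstrates the arithmetic with assumed flag values chosen so that adjacent bits form a small counter; the real constants live in the scheduler sources and may differ.

/*
 * Sketch of the "had break" accounting done in load_balance() above.
 * Flag values are assumptions chosen so the arithmetic works: the
 * addition clears NEED_BREAK, bumps a two-bit break counter, and
 * eventually carries into LBF_ABORT.
 */
#include <stdio.h>

#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02     /* request: drop the locks and retry */
#define LBF_HAD_BREAK  0x04     /* one retry already taken */
#define LBF_HAD_BREAKS 0x0C     /* two-bit counter of retries taken */
#define LBF_ABORT      0x10     /* counter overflow: stop balancing */

int main(void)
{
    int lb_flags = 0;
    int round;

    for (round = 1; round <= 5; round++) {
        lb_flags |= LBF_NEED_BREAK;                  /* the move loop asked for a break */

        lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;  /* clear the request, count it */

        printf("round %d: flags=%#x%s\n", round, lb_flags,
               (lb_flags & LBF_ABORT) ? " -> abort" : " -> redo");

        if (lb_flags & LBF_ABORT)
            break;
    }
    return 0;
}
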
@@ -3430,10 +4544,10 @@ redo:
3430 * moved to this_cpu 4544 * moved to this_cpu
3431 */ 4545 */
3432 if (!cpumask_test_cpu(this_cpu, 4546 if (!cpumask_test_cpu(this_cpu,
3433 &busiest->curr->cpus_allowed)) { 4547 tsk_cpus_allowed(busiest->curr))) {
3434 raw_spin_unlock_irqrestore(&busiest->lock, 4548 raw_spin_unlock_irqrestore(&busiest->lock,
3435 flags); 4549 flags);
3436 all_pinned = 1; 4550 lb_flags |= LBF_ALL_PINNED;
3437 goto out_one_pinned; 4551 goto out_one_pinned;
3438 } 4552 }
3439 4553
@@ -3486,7 +4600,8 @@ out_balanced:
3486 4600
3487out_one_pinned: 4601out_one_pinned:
3488 /* tune up the balancing interval */ 4602 /* tune up the balancing interval */
3489 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 4603 if (((lb_flags & LBF_ALL_PINNED) &&
4604 sd->balance_interval < MAX_PINNED_INTERVAL) ||
3490 (sd->balance_interval < sd->max_interval)) 4605 (sd->balance_interval < sd->max_interval))
3491 sd->balance_interval *= 2; 4606 sd->balance_interval *= 2;
3492 4607
@@ -3499,7 +4614,7 @@ out:
3499 * idle_balance is called by schedule() if this_cpu is about to become 4614 * idle_balance is called by schedule() if this_cpu is about to become
3500 * idle. Attempts to pull tasks from other CPUs. 4615 * idle. Attempts to pull tasks from other CPUs.
3501 */ 4616 */
3502static void idle_balance(int this_cpu, struct rq *this_rq) 4617void idle_balance(int this_cpu, struct rq *this_rq)
3503{ 4618{
3504 struct sched_domain *sd; 4619 struct sched_domain *sd;
3505 int pulled_task = 0; 4620 int pulled_task = 0;
@@ -3612,46 +4727,18 @@ out_unlock:
3612} 4727}
3613 4728
3614#ifdef CONFIG_NO_HZ 4729#ifdef CONFIG_NO_HZ
3615
3616static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3617
3618static void trigger_sched_softirq(void *data)
3619{
3620 raise_softirq_irqoff(SCHED_SOFTIRQ);
3621}
3622
3623static inline void init_sched_softirq_csd(struct call_single_data *csd)
3624{
3625 csd->func = trigger_sched_softirq;
3626 csd->info = NULL;
3627 csd->flags = 0;
3628 csd->priv = 0;
3629}
3630
3631/* 4730/*
3632 * idle load balancing details 4731 * idle load balancing details
3633 * - One of the idle CPUs nominates itself as idle load_balancer, while
3634 * entering idle.
3635 * - This idle load balancer CPU will also go into tickless mode when
3636 * it is idle, just like all other idle CPUs
3637 * - When one of the busy CPUs notice that there may be an idle rebalancing 4732 * - When one of the busy CPUs notice that there may be an idle rebalancing
3638 * needed, they will kick the idle load balancer, which then does idle 4733 * needed, they will kick the idle load balancer, which then does idle
3639 * load balancing for all the idle CPUs. 4734 * load balancing for all the idle CPUs.
3640 */ 4735 */
3641static struct { 4736static struct {
3642 atomic_t load_balancer;
3643 atomic_t first_pick_cpu;
3644 atomic_t second_pick_cpu;
3645 cpumask_var_t idle_cpus_mask; 4737 cpumask_var_t idle_cpus_mask;
3646 cpumask_var_t grp_idle_mask; 4738 atomic_t nr_cpus;
3647 unsigned long next_balance; /* in jiffy units */ 4739 unsigned long next_balance; /* in jiffy units */
3648} nohz ____cacheline_aligned; 4740} nohz ____cacheline_aligned;
3649 4741
3650int get_nohz_load_balancer(void)
3651{
3652 return atomic_read(&nohz.load_balancer);
3653}
3654
3655#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4742#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3656/** 4743/**
3657 * lowest_flag_domain - Return lowest sched_domain containing flag. 4744 * lowest_flag_domain - Return lowest sched_domain containing flag.
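
The nohz tracker in the hunk above shrinks to a mask of tickless-idle CPUs plus an atomic count, instead of electing first/second pick CPUs and a dedicated load_balancer. Below is a userspace sketch of that bookkeeping, with a plain 64-bit mask and C11 atomics standing in for cpumask_var_t; the names and the 64-CPU limit are illustrative assumptions, not the kernel API.

/*
 * Userspace sketch of the slimmed-down nohz bookkeeping above: just a
 * mask of tickless-idle CPUs and an atomic count of them.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static struct {
    _Atomic uint64_t idle_cpus_mask;    /* bit n set => cpu n is tickless idle */
    atomic_int       nr_cpus;           /* how many bits are set */
} nohz;

static void cpu_enters_tickless_idle(int cpu)
{
    atomic_fetch_or(&nohz.idle_cpus_mask, UINT64_C(1) << cpu);
    atomic_fetch_add(&nohz.nr_cpus, 1);
}

static void cpu_leaves_tickless_idle(int cpu)
{
    atomic_fetch_and(&nohz.idle_cpus_mask, ~(UINT64_C(1) << cpu));
    atomic_fetch_sub(&nohz.nr_cpus, 1);
}

int main(void)
{
    cpu_enters_tickless_idle(2);
    cpu_enters_tickless_idle(5);
    cpu_leaves_tickless_idle(2);

    printf("idle mask=%#llx, nr_cpus=%d\n",
           (unsigned long long)atomic_load(&nohz.idle_cpus_mask),
           atomic_load(&nohz.nr_cpus));
    return 0;
}
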
@@ -3667,7 +4754,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3667 struct sched_domain *sd; 4754 struct sched_domain *sd;
3668 4755
3669 for_each_domain(cpu, sd) 4756 for_each_domain(cpu, sd)
3670 if (sd && (sd->flags & flag)) 4757 if (sd->flags & flag)
3671 break; 4758 break;
3672 4759
3673 return sd; 4760 return sd;
@@ -3688,33 +4775,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3688 (sd && (sd->flags & flag)); sd = sd->parent) 4775 (sd && (sd->flags & flag)); sd = sd->parent)
3689 4776
3690/** 4777/**
3691 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3692 * @ilb_group: group to be checked for semi-idleness
3693 *
3694 * Returns: 1 if the group is semi-idle. 0 otherwise.
3695 *
3696 * We define a sched_group to be semi idle if it has atleast one idle-CPU
3697 * and atleast one non-idle CPU. This helper function checks if the given
3698 * sched_group is semi-idle or not.
3699 */
3700static inline int is_semi_idle_group(struct sched_group *ilb_group)
3701{
3702 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
3703 sched_group_cpus(ilb_group));
3704
3705 /*
3706 * A sched_group is semi-idle when it has atleast one busy cpu
3707 * and atleast one idle cpu.
3708 */
3709 if (cpumask_empty(nohz.grp_idle_mask))
3710 return 0;
3711
3712 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
3713 return 0;
3714
3715 return 1;
3716}
3717/**
3718 * find_new_ilb - Finds the optimum idle load balancer for nomination. 4778 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3719 * @cpu: The cpu which is nominating a new idle_load_balancer. 4779 * @cpu: The cpu which is nominating a new idle_load_balancer.
3720 * 4780 *
@@ -3728,9 +4788,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group)
3728 */ 4788 */
3729static int find_new_ilb(int cpu) 4789static int find_new_ilb(int cpu)
3730{ 4790{
4791 int ilb = cpumask_first(nohz.idle_cpus_mask);
4792 struct sched_group *ilbg;
3731 struct sched_domain *sd; 4793 struct sched_domain *sd;
3732 struct sched_group *ilb_group;
3733 int ilb = nr_cpu_ids;
3734 4794
3735 /* 4795 /*
3736 * Have idle load balancer selection from semi-idle packages only 4796 * Have idle load balancer selection from semi-idle packages only
@@ -3748,23 +4808,28 @@ static int find_new_ilb(int cpu)
3748 4808
3749 rcu_read_lock(); 4809 rcu_read_lock();
3750 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 4810 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3751 ilb_group = sd->groups; 4811 ilbg = sd->groups;
3752 4812
3753 do { 4813 do {
3754 if (is_semi_idle_group(ilb_group)) { 4814 if (ilbg->group_weight !=
3755 ilb = cpumask_first(nohz.grp_idle_mask); 4815 atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4816 ilb = cpumask_first_and(nohz.idle_cpus_mask,
4817 sched_group_cpus(ilbg));
3756 goto unlock; 4818 goto unlock;
3757 } 4819 }
3758 4820
3759 ilb_group = ilb_group->next; 4821 ilbg = ilbg->next;
3760 4822
3761 } while (ilb_group != sd->groups); 4823 } while (ilbg != sd->groups);
3762 } 4824 }
3763unlock: 4825unlock:
3764 rcu_read_unlock(); 4826 rcu_read_unlock();
3765 4827
3766out_done: 4828out_done:
3767 return ilb; 4829 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4830 return ilb;
4831
4832 return nr_cpu_ids;
3768} 4833}
3769#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 4834#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3770static inline int find_new_ilb(int call_cpu) 4835static inline int find_new_ilb(int call_cpu)
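
find_new_ilb() above now treats a group as a candidate whenever its weight differs from its nr_busy_cpus count, and then picks the first CPU that is both idle and inside the group. Here is a reduced sketch of that test using plain bitmasks; the 64-CPU limit and the helper names are assumptions for illustration only.

/*
 * Sketch of the group test used in find_new_ilb() above, done with
 * plain 64-bit masks instead of cpumasks.
 */
#include <stdint.h>
#include <stdio.h>

/* First set bit, or 64 if the mask is empty (stand-in for nr_cpu_ids). */
static int first_cpu(uint64_t mask)
{
    return mask ? __builtin_ctzll(mask) : 64;
}

static int pick_ilb(uint64_t idle_mask, uint64_t group_mask, int nr_busy)
{
    int group_weight = __builtin_popcountll(group_mask);

    /* The group is not fully busy, so it has an idle CPU to offer. */
    if (group_weight != nr_busy)
        return first_cpu(idle_mask & group_mask);

    return 64;      /* fully busy group: no candidate here */
}

int main(void)
{
    uint64_t group = 0x0F;      /* CPUs 0-3 share a package */
    uint64_t idle  = 0x0C;      /* CPUs 2 and 3 are tickless idle */

    printf("ilb candidate: cpu %d\n", pick_ilb(idle, group, 2));
    return 0;
}
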
@@ -3784,94 +4849,68 @@ static void nohz_balancer_kick(int cpu)
3784 4849
3785 nohz.next_balance++; 4850 nohz.next_balance++;
3786 4851
3787 ilb_cpu = get_nohz_load_balancer(); 4852 ilb_cpu = find_new_ilb(cpu);
3788
3789 if (ilb_cpu >= nr_cpu_ids) {
3790 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
3791 if (ilb_cpu >= nr_cpu_ids)
3792 return;
3793 }
3794 4853
3795 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { 4854 if (ilb_cpu >= nr_cpu_ids)
3796 struct call_single_data *cp; 4855 return;
3797 4856
3798 cpu_rq(ilb_cpu)->nohz_balance_kick = 1; 4857 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
3799 cp = &per_cpu(remote_sched_softirq_cb, cpu); 4858 return;
3800 __smp_call_function_single(ilb_cpu, cp, 0); 4859 /*
3801 } 4860 * Use smp_send_reschedule() instead of resched_cpu().
4861 * This way we generate a sched IPI on the target cpu which
4862 * is idle. And the softirq performing nohz idle load balance
4863 * will be run before returning from the IPI.
4864 */
4865 smp_send_reschedule(ilb_cpu);
3802 return; 4866 return;
3803} 4867}
3804 4868
3805/* 4869static inline void set_cpu_sd_state_busy(void)
3806 * This routine will try to nominate the ilb (idle load balancing)
3807 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3808 * load balancing on behalf of all those cpus.
3809 *
3810 * When the ilb owner becomes busy, we will not have new ilb owner until some
3811 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
3812 * idle load balancing by kicking one of the idle CPUs.
3813 *
3814 * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
3815 * ilb owner CPU in future (when there is a need for idle load balancing on
3816 * behalf of all idle CPUs).
3817 */
3818void select_nohz_load_balancer(int stop_tick)
3819{ 4870{
4871 struct sched_domain *sd;
3820 int cpu = smp_processor_id(); 4872 int cpu = smp_processor_id();
3821 4873
3822 if (stop_tick) { 4874 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
3823 if (!cpu_active(cpu)) { 4875 return;
3824 if (atomic_read(&nohz.load_balancer) != cpu) 4876 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
3825 return;
3826
3827 /*
3828 * If we are going offline and still the leader,
3829 * give up!
3830 */
3831 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3832 nr_cpu_ids) != cpu)
3833 BUG();
3834 4877
3835 return; 4878 rcu_read_lock();
3836 } 4879 for_each_domain(cpu, sd)
4880 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
4881 rcu_read_unlock();
4882}
3837 4883
3838 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 4884void set_cpu_sd_state_idle(void)
4885{
4886 struct sched_domain *sd;
4887 int cpu = smp_processor_id();
3839 4888
3840 if (atomic_read(&nohz.first_pick_cpu) == cpu) 4889 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
3841 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); 4890 return;
3842 if (atomic_read(&nohz.second_pick_cpu) == cpu) 4891 set_bit(NOHZ_IDLE, nohz_flags(cpu));
3843 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3844 4892
3845 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { 4893 rcu_read_lock();
3846 int new_ilb; 4894 for_each_domain(cpu, sd)
4895 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
4896 rcu_read_unlock();
4897}
3847 4898
3848 /* make me the ilb owner */ 4899/*
3849 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, 4900 * This routine will record that this cpu is going idle with tick stopped.
3850 cpu) != nr_cpu_ids) 4901 * This info will be used in performing idle load balancing in the future.
3851 return; 4902 */
4903void select_nohz_load_balancer(int stop_tick)
4904{
4905 int cpu = smp_processor_id();
3852 4906
3853 /* 4907 if (stop_tick) {
3854 * Check to see if there is a more power-efficient 4908 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
3855 * ilb.
3856 */
3857 new_ilb = find_new_ilb(cpu);
3858 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3859 atomic_set(&nohz.load_balancer, nr_cpu_ids);
3860 resched_cpu(new_ilb);
3861 return;
3862 }
3863 return;
3864 }
3865 } else {
3866 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
3867 return; 4909 return;
3868 4910
3869 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 4911 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
3870 4912 atomic_inc(&nohz.nr_cpus);
3871 if (atomic_read(&nohz.load_balancer) == cpu) 4913 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
3872 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3873 nr_cpu_ids) != cpu)
3874 BUG();
3875 } 4914 }
3876 return; 4915 return;
3877} 4916}
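
nohz_balancer_kick() above arms a per-cpu NOHZ_BALANCE_KICK bit with test_and_set_bit() before sending the reschedule IPI, so a target that has already been kicked is not kicked again until its balancing softirq clears the bit. The sketch below reproduces that guard with C11 atomics; the array of flag words and the printf stand-in for the IPI are illustrative assumptions.

/*
 * Sketch of the duplicate-kick guard used in nohz_balancer_kick() above:
 * an atomic test-and-set on a per-cpu flag word so the target is kicked
 * at most once per pending balance.
 */
#include <stdatomic.h>
#include <stdio.h>

enum { NOHZ_BALANCE_KICK_BIT = 0 };

static atomic_uint nohz_flags[4];           /* one flag word per "cpu" */

static void send_kick_ipi(int cpu)          /* stand-in for smp_send_reschedule() */
{
    printf("IPI -> cpu %d\n", cpu);
}

static void kick_ilb(int ilb_cpu)
{
    unsigned int bit = 1u << NOHZ_BALANCE_KICK_BIT;

    /* Already kicked and not yet serviced: nothing to do. */
    if (atomic_fetch_or(&nohz_flags[ilb_cpu], bit) & bit)
        return;

    send_kick_ipi(ilb_cpu);
}

static void idle_balance_done(int cpu)      /* what the softirq would do at the end */
{
    atomic_fetch_and(&nohz_flags[cpu], ~(1u << NOHZ_BALANCE_KICK_BIT));
}

int main(void)
{
    kick_ilb(2);            /* first kick sends the IPI */
    kick_ilb(2);            /* second one is absorbed   */
    idle_balance_done(2);
    kick_ilb(2);            /* armed again after the balance ran */
    return 0;
}
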
@@ -3885,7 +4924,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3885 * Scale the max load_balance interval with the number of CPUs in the system. 4924 * Scale the max load_balance interval with the number of CPUs in the system.
3886 * This trades load-balance latency on larger machines for less cross talk. 4925 * This trades load-balance latency on larger machines for less cross talk.
3887 */ 4926 */
3888static void update_max_interval(void) 4927void update_max_interval(void)
3889{ 4928{
3890 max_load_balance_interval = HZ*num_online_cpus()/10; 4929 max_load_balance_interval = HZ*num_online_cpus()/10;
3891} 4930}
@@ -3977,11 +5016,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
3977 struct rq *rq; 5016 struct rq *rq;
3978 int balance_cpu; 5017 int balance_cpu;
3979 5018
3980 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) 5019 if (idle != CPU_IDLE ||
3981 return; 5020 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5021 goto end;
3982 5022
3983 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { 5023 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
3984 if (balance_cpu == this_cpu) 5024 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
3985 continue; 5025 continue;
3986 5026
3987 /* 5027 /*
@@ -3989,10 +5029,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
3989 * work being done for other cpus. Next load 5029 * work being done for other cpus. Next load
3990 * balancing owner will pick it up. 5030 * balancing owner will pick it up.
3991 */ 5031 */
3992 if (need_resched()) { 5032 if (need_resched())
3993 this_rq->nohz_balance_kick = 0;
3994 break; 5033 break;
3995 }
3996 5034
3997 raw_spin_lock_irq(&this_rq->lock); 5035 raw_spin_lock_irq(&this_rq->lock);
3998 update_rq_clock(this_rq); 5036 update_rq_clock(this_rq);
@@ -4006,53 +5044,75 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4006 this_rq->next_balance = rq->next_balance; 5044 this_rq->next_balance = rq->next_balance;
4007 } 5045 }
4008 nohz.next_balance = this_rq->next_balance; 5046 nohz.next_balance = this_rq->next_balance;
4009 this_rq->nohz_balance_kick = 0; 5047end:
5048 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
4010} 5049}
4011 5050
4012/* 5051/*
4013 * Current heuristic for kicking the idle load balancer 5052 * Current heuristic for kicking the idle load balancer in the presence
4014 * - first_pick_cpu is the one of the busy CPUs. It will kick 5053 * of an idle cpu in the system.
4015 * idle load balancer when it has more than one process active. This 5054 * - This rq has more than one task.
4016 * eliminates the need for idle load balancing altogether when we have 5055 * - At any scheduler domain level, this cpu's scheduler group has multiple
4017 * only one running process in the system (common case). 5056 * busy cpu's exceeding the group's power.
4018 * - If there are more than one busy CPU, idle load balancer may have 5057 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
4019 * to run for active_load_balance to happen (i.e., two busy CPUs are 5058 * domain span are idle.
4020 * SMT or core siblings and can run better if they move to different
4021 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
4022 * which will kick idle load balancer as soon as it has any load.
4023 */ 5059 */
4024static inline int nohz_kick_needed(struct rq *rq, int cpu) 5060static inline int nohz_kick_needed(struct rq *rq, int cpu)
4025{ 5061{
4026 unsigned long now = jiffies; 5062 unsigned long now = jiffies;
4027 int ret; 5063 struct sched_domain *sd;
4028 int first_pick_cpu, second_pick_cpu;
4029 5064
4030 if (time_before(now, nohz.next_balance)) 5065 if (unlikely(idle_cpu(cpu)))
4031 return 0; 5066 return 0;
4032 5067
4033 if (rq->idle_at_tick) 5068 /*
4034 return 0; 5069 * We may be recently in ticked or tickless idle mode. At the first
5070 * busy tick after returning from idle, we will update the busy stats.
5071 */
5072 set_cpu_sd_state_busy();
5073 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5074 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5075 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5076 atomic_dec(&nohz.nr_cpus);
5077 }
4035 5078
4036 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 5079 /*
4037 second_pick_cpu = atomic_read(&nohz.second_pick_cpu); 5080 * None are in tickless mode and hence no need for NOHZ idle load
5081 * balancing.
5082 */
5083 if (likely(!atomic_read(&nohz.nr_cpus)))
5084 return 0;
4038 5085
4039 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && 5086 if (time_before(now, nohz.next_balance))
4040 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
4041 return 0; 5087 return 0;
4042 5088
4043 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); 5089 if (rq->nr_running >= 2)
4044 if (ret == nr_cpu_ids || ret == cpu) { 5090 goto need_kick;
4045 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); 5091
4046 if (rq->nr_running > 1) 5092 rcu_read_lock();
4047 return 1; 5093 for_each_domain(cpu, sd) {
4048 } else { 5094 struct sched_group *sg = sd->groups;
4049 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); 5095 struct sched_group_power *sgp = sg->sgp;
4050 if (ret == nr_cpu_ids || ret == cpu) { 5096 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
4051 if (rq->nr_running) 5097
4052 return 1; 5098 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
4053 } 5099 goto need_kick_unlock;
5100
5101 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5102 && (cpumask_first_and(nohz.idle_cpus_mask,
5103 sched_domain_span(sd)) < cpu))
5104 goto need_kick_unlock;
5105
5106 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5107 break;
4054 } 5108 }
5109 rcu_read_unlock();
4055 return 0; 5110 return 0;
5111
5112need_kick_unlock:
5113 rcu_read_unlock();
5114need_kick:
5115 return 1;
4056} 5116}
4057#else 5117#else
4058static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } 5118static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
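
The rewritten nohz_kick_needed() above boils down to a few cheap tests: no kick when this CPU is itself idle, when no CPU is tickless, or when the nohz.next_balance rate limit has not expired; a kick when this runqueue holds two or more tasks; and a kick when a shared-cache domain level reports more than one busy CPU. The sketch below expresses that as a pure decision function; the struct fields are assumptions, and the SD_ASYM_PACKING branch is omitted for brevity.

/*
 * Sketch of the nohz kick heuristic above, reduced to a decision
 * function over a couple of plain inputs. The kernel walks real
 * sched domains instead of this toy array.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_domain {
    bool shares_pkg_resources;      /* SD_SHARE_PKG_RESOURCES-like level */
    int  nr_busy;                   /* busy CPUs in this cpu's group     */
};

static bool nohz_kick_needed(int nr_running, int nr_tickless_idle,
                             const struct toy_domain *sd, int nr_levels)
{
    int i;

    if (nr_tickless_idle == 0)      /* nobody to balance on our behalf */
        return false;

    if (nr_running >= 2)            /* this cpu is overloaded */
        return true;

    /* A shared-cache level with more than one busy CPU also wants help. */
    for (i = 0; i < nr_levels; i++)
        if (sd[i].shares_pkg_resources && sd[i].nr_busy > 1)
            return true;

    return false;
}

int main(void)
{
    struct toy_domain levels[] = { { true, 2 }, { false, 5 } };

    printf("kick: %d\n", nohz_kick_needed(1, 3, levels, 2));
    return 0;
}
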
@@ -4066,7 +5126,7 @@ static void run_rebalance_domains(struct softirq_action *h)
4066{ 5126{
4067 int this_cpu = smp_processor_id(); 5127 int this_cpu = smp_processor_id();
4068 struct rq *this_rq = cpu_rq(this_cpu); 5128 struct rq *this_rq = cpu_rq(this_cpu);
4069 enum cpu_idle_type idle = this_rq->idle_at_tick ? 5129 enum cpu_idle_type idle = this_rq->idle_balance ?
4070 CPU_IDLE : CPU_NOT_IDLE; 5130 CPU_IDLE : CPU_NOT_IDLE;
4071 5131
4072 rebalance_domains(this_cpu, idle); 5132 rebalance_domains(this_cpu, idle);
@@ -4087,14 +5147,14 @@ static inline int on_null_domain(int cpu)
4087/* 5147/*
4088 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 5148 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4089 */ 5149 */
4090static inline void trigger_load_balance(struct rq *rq, int cpu) 5150void trigger_load_balance(struct rq *rq, int cpu)
4091{ 5151{
4092 /* Don't need to rebalance while attached to NULL domain */ 5152 /* Don't need to rebalance while attached to NULL domain */
4093 if (time_after_eq(jiffies, rq->next_balance) && 5153 if (time_after_eq(jiffies, rq->next_balance) &&
4094 likely(!on_null_domain(cpu))) 5154 likely(!on_null_domain(cpu)))
4095 raise_softirq(SCHED_SOFTIRQ); 5155 raise_softirq(SCHED_SOFTIRQ);
4096#ifdef CONFIG_NO_HZ 5156#ifdef CONFIG_NO_HZ
4097 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 5157 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
4098 nohz_balancer_kick(cpu); 5158 nohz_balancer_kick(cpu);
4099#endif 5159#endif
4100} 5160}
@@ -4109,15 +5169,6 @@ static void rq_offline_fair(struct rq *rq)
4109 update_sysctl(); 5169 update_sysctl();
4110} 5170}
4111 5171
4112#else /* CONFIG_SMP */
4113
4114/*
4115 * on UP we do not need to balance between CPUs:
4116 */
4117static inline void idle_balance(int cpu, struct rq *rq)
4118{
4119}
4120
4121#endif /* CONFIG_SMP */ 5172#endif /* CONFIG_SMP */
4122 5173
4123/* 5174/*
@@ -4141,8 +5192,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4141 */ 5192 */
4142static void task_fork_fair(struct task_struct *p) 5193static void task_fork_fair(struct task_struct *p)
4143{ 5194{
4144 struct cfs_rq *cfs_rq = task_cfs_rq(current); 5195 struct cfs_rq *cfs_rq;
4145 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 5196 struct sched_entity *se = &p->se, *curr;
4146 int this_cpu = smp_processor_id(); 5197 int this_cpu = smp_processor_id();
4147 struct rq *rq = this_rq(); 5198 struct rq *rq = this_rq();
4148 unsigned long flags; 5199 unsigned long flags;
@@ -4151,6 +5202,9 @@ static void task_fork_fair(struct task_struct *p)
4151 5202
4152 update_rq_clock(rq); 5203 update_rq_clock(rq);
4153 5204
5205 cfs_rq = task_cfs_rq(current);
5206 curr = cfs_rq->curr;
5207
4154 if (unlikely(task_cpu(p) != this_cpu)) { 5208 if (unlikely(task_cpu(p) != this_cpu)) {
4155 rcu_read_lock(); 5209 rcu_read_lock();
4156 __set_task_cpu(p, this_cpu); 5210 __set_task_cpu(p, this_cpu);
@@ -4251,8 +5305,23 @@ static void set_curr_task_fair(struct rq *rq)
4251{ 5305{
4252 struct sched_entity *se = &rq->curr->se; 5306 struct sched_entity *se = &rq->curr->se;
4253 5307
4254 for_each_sched_entity(se) 5308 for_each_sched_entity(se) {
4255 set_next_entity(cfs_rq_of(se), se); 5309 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5310
5311 set_next_entity(cfs_rq, se);
5312 /* ensure bandwidth has been allocated on our new cfs_rq */
5313 account_cfs_rq_runtime(cfs_rq, 0);
5314 }
5315}
5316
5317void init_cfs_rq(struct cfs_rq *cfs_rq)
5318{
5319 cfs_rq->tasks_timeline = RB_ROOT;
5320 INIT_LIST_HEAD(&cfs_rq->tasks);
5321 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5322#ifndef CONFIG_64BIT
5323 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5324#endif
4256} 5325}
4257 5326
4258#ifdef CONFIG_FAIR_GROUP_SCHED 5327#ifdef CONFIG_FAIR_GROUP_SCHED
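
init_cfs_rq() above seeds min_vruntime with (u64)(-(1LL << 20)), a value sitting roughly a millisecond's worth of nanoseconds below the 64-bit wrap point, presumably so that wraparound bugs in vruntime comparisons surface soon after boot rather than after weeks of uptime. A small sketch of that sentinel and of the signed-difference comparison idiom it relies on:

/*
 * Sketch of the min_vruntime sentinel used in init_cfs_rq() above.
 * Starting just below the u64 wrap exercises the wraparound path early.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* True if a is "after" b, even across a u64 wrap. */
static int vruntime_after(uint64_t a, uint64_t b)
{
    return (int64_t)(a - b) > 0;
}

int main(void)
{
    uint64_t min_vruntime = (uint64_t)(-(1LL << 20));   /* ~1ms of ns below wrap */
    uint64_t later = min_vruntime + (2ULL << 20);       /* past the wrap point   */

    printf("start: %" PRIu64 "\n", min_vruntime);
    printf("later is after start: %d\n", vruntime_after(later, min_vruntime));
    return 0;
}
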
@@ -4271,13 +5340,182 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
4271 * to another cgroup's rq. This does somewhat interfere with the 5340 * to another cgroup's rq. This does somewhat interfere with the
4272 * fair sleeper stuff for the first placement, but who cares. 5341 * fair sleeper stuff for the first placement, but who cares.
4273 */ 5342 */
5343 /*
5344 * When !on_rq, vruntime of the task has usually NOT been normalized.
5345 * But there are some cases where it has already been normalized:
5346 *
5347 * - Moving a forked child which is waiting for being woken up by
5348 * wake_up_new_task().
5349 * - Moving a task which has been woken up by try_to_wake_up() and
5350 * waiting for actually being woken up by sched_ttwu_pending().
5351 *
5352 * To prevent boost or penalty in the new cfs_rq caused by delta
5353 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
5354 */
5355 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5356 on_rq = 1;
5357
4274 if (!on_rq) 5358 if (!on_rq)
4275 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 5359 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
4276 set_task_rq(p, task_cpu(p)); 5360 set_task_rq(p, task_cpu(p));
4277 if (!on_rq) 5361 if (!on_rq)
4278 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; 5362 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
4279} 5363}
5364
5365void free_fair_sched_group(struct task_group *tg)
5366{
5367 int i;
5368
5369 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5370
5371 for_each_possible_cpu(i) {
5372 if (tg->cfs_rq)
5373 kfree(tg->cfs_rq[i]);
5374 if (tg->se)
5375 kfree(tg->se[i]);
5376 }
5377
5378 kfree(tg->cfs_rq);
5379 kfree(tg->se);
5380}
5381
5382int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5383{
5384 struct cfs_rq *cfs_rq;
5385 struct sched_entity *se;
5386 int i;
5387
5388 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5389 if (!tg->cfs_rq)
5390 goto err;
5391 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5392 if (!tg->se)
5393 goto err;
5394
5395 tg->shares = NICE_0_LOAD;
5396
5397 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5398
5399 for_each_possible_cpu(i) {
5400 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5401 GFP_KERNEL, cpu_to_node(i));
5402 if (!cfs_rq)
5403 goto err;
5404
5405 se = kzalloc_node(sizeof(struct sched_entity),
5406 GFP_KERNEL, cpu_to_node(i));
5407 if (!se)
5408 goto err_free_rq;
5409
5410 init_cfs_rq(cfs_rq);
5411 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5412 }
5413
5414 return 1;
5415
5416err_free_rq:
5417 kfree(cfs_rq);
5418err:
5419 return 0;
5420}
5421
5422void unregister_fair_sched_group(struct task_group *tg, int cpu)
5423{
5424 struct rq *rq = cpu_rq(cpu);
5425 unsigned long flags;
5426
5427 /*
5428 * Only empty task groups can be destroyed; so we can speculatively
5429 * check on_list without danger of it being re-added.
5430 */
5431 if (!tg->cfs_rq[cpu]->on_list)
5432 return;
5433
5434 raw_spin_lock_irqsave(&rq->lock, flags);
5435 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
5436 raw_spin_unlock_irqrestore(&rq->lock, flags);
5437}
5438
5439void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5440 struct sched_entity *se, int cpu,
5441 struct sched_entity *parent)
5442{
5443 struct rq *rq = cpu_rq(cpu);
5444
5445 cfs_rq->tg = tg;
5446 cfs_rq->rq = rq;
5447#ifdef CONFIG_SMP
5448 /* allow initial update_cfs_load() to truncate */
5449 cfs_rq->load_stamp = 1;
4280#endif 5450#endif
5451 init_cfs_rq_runtime(cfs_rq);
5452
5453 tg->cfs_rq[cpu] = cfs_rq;
5454 tg->se[cpu] = se;
5455
5456 /* se could be NULL for root_task_group */
5457 if (!se)
5458 return;
5459
5460 if (!parent)
5461 se->cfs_rq = &rq->cfs;
5462 else
5463 se->cfs_rq = parent->my_q;
5464
5465 se->my_q = cfs_rq;
5466 update_load_set(&se->load, 0);
5467 se->parent = parent;
5468}
5469
5470static DEFINE_MUTEX(shares_mutex);
5471
5472int sched_group_set_shares(struct task_group *tg, unsigned long shares)
5473{
5474 int i;
5475 unsigned long flags;
5476
5477 /*
5478 * We can't change the weight of the root cgroup.
5479 */
5480 if (!tg->se[0])
5481 return -EINVAL;
5482
5483 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
5484
5485 mutex_lock(&shares_mutex);
5486 if (tg->shares == shares)
5487 goto done;
5488
5489 tg->shares = shares;
5490 for_each_possible_cpu(i) {
5491 struct rq *rq = cpu_rq(i);
5492 struct sched_entity *se;
5493
5494 se = tg->se[i];
5495 /* Propagate contribution to hierarchy */
5496 raw_spin_lock_irqsave(&rq->lock, flags);
5497 for_each_sched_entity(se)
5498 update_cfs_shares(group_cfs_rq(se));
5499 raw_spin_unlock_irqrestore(&rq->lock, flags);
5500 }
5501
5502done:
5503 mutex_unlock(&shares_mutex);
5504 return 0;
5505}
5506#else /* CONFIG_FAIR_GROUP_SCHED */
5507
5508void free_fair_sched_group(struct task_group *tg) { }
5509
5510int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5511{
5512 return 1;
5513}
5514
5515void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
5516
5517#endif /* CONFIG_FAIR_GROUP_SCHED */
5518
4281 5519
4282static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 5520static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
4283{ 5521{
@@ -4297,7 +5535,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
4297/* 5535/*
4298 * All the scheduling class methods: 5536 * All the scheduling class methods:
4299 */ 5537 */
4300static const struct sched_class fair_sched_class = { 5538const struct sched_class fair_sched_class = {
4301 .next = &idle_sched_class, 5539 .next = &idle_sched_class,
4302 .enqueue_task = enqueue_task_fair, 5540 .enqueue_task = enqueue_task_fair,
4303 .dequeue_task = dequeue_task_fair, 5541 .dequeue_task = dequeue_task_fair,
@@ -4334,7 +5572,7 @@ static const struct sched_class fair_sched_class = {
4334}; 5572};
4335 5573
4336#ifdef CONFIG_SCHED_DEBUG 5574#ifdef CONFIG_SCHED_DEBUG
4337static void print_cfs_stats(struct seq_file *m, int cpu) 5575void print_cfs_stats(struct seq_file *m, int cpu)
4338{ 5576{
4339 struct cfs_rq *cfs_rq; 5577 struct cfs_rq *cfs_rq;
4340 5578
@@ -4344,3 +5582,15 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
4344 rcu_read_unlock(); 5582 rcu_read_unlock();
4345} 5583}
4346#endif 5584#endif
5585
5586__init void init_sched_fair_class(void)
5587{
5588#ifdef CONFIG_SMP
5589 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5590
5591#ifdef CONFIG_NO_HZ
5592 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5593#endif
5594#endif /* SMP */
5595
5596}
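
alloc_fair_sched_group() above follows the usual per-cpu allocation shape: pointer arrays first, then one cfs_rq and one sched_entity per possible CPU, with a goto-based unwind on failure. The userspace sketch below mirrors that shape with calloc/free and a single cleanup helper; it simplifies the kernel's partial-unwind labels, and its types and CPU count are illustrative assumptions.

/*
 * Userspace sketch of the per-cpu allocation/unwind pattern used by
 * alloc_fair_sched_group() above.
 */
#include <stdlib.h>
#include <stdio.h>

struct toy_rq { int cpu; };
struct toy_se { int cpu; };
struct toy_group {
    struct toy_rq **rq;
    struct toy_se **se;
};

static void free_group(struct toy_group *tg, int nr_cpus)
{
    int i;

    for (i = 0; i < nr_cpus; i++) {
        if (tg->rq) free(tg->rq[i]);
        if (tg->se) free(tg->se[i]);
    }
    free(tg->rq);
    free(tg->se);
}

static int alloc_group(struct toy_group *tg, int nr_cpus)
{
    int i;

    tg->rq = calloc(nr_cpus, sizeof(*tg->rq));      /* per-cpu pointer arrays */
    tg->se = calloc(nr_cpus, sizeof(*tg->se));
    if (!tg->rq || !tg->se)
        goto err;

    for (i = 0; i < nr_cpus; i++) {                 /* one object per cpu */
        tg->rq[i] = calloc(1, sizeof(**tg->rq));
        tg->se[i] = calloc(1, sizeof(**tg->se));
        if (!tg->rq[i] || !tg->se[i])
            goto err;
        tg->rq[i]->cpu = tg->se[i]->cpu = i;
    }
    return 1;

err:
    free_group(tg, nr_cpus);                        /* free whatever exists */
    return 0;
}

int main(void)
{
    struct toy_group tg = { 0 };

    if (alloc_group(&tg, 4)) {
        printf("alloc ok\n");
        free_group(&tg, 4);
    } else {
        printf("alloc failed\n");
    }
    return 0;
}
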
diff --git a/kernel/sched_features.h b/kernel/sched/features.h
index 2e74677cb040..e61fd73913d0 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched/features.h
@@ -3,18 +3,13 @@
3 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
4 * rip the spread apart. 4 * rip the spread apart.
5 */ 5 */
6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
7 7
8/* 8/*
9 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
10 * tasks 10 * tasks
11 */ 11 */
12SCHED_FEAT(START_DEBIT, 1) 12SCHED_FEAT(START_DEBIT, true)
13
14/*
15 * Should wakeups try to preempt running tasks.
16 */
17SCHED_FEAT(WAKEUP_PREEMPT, 1)
18 13
19/* 14/*
20 * Based on load and program behaviour, see if it makes sense to place 15 * Based on load and program behaviour, see if it makes sense to place
@@ -22,53 +17,54 @@ SCHED_FEAT(WAKEUP_PREEMPT, 1)
22 * improve cache locality. Typically used with SYNC wakeups as 17 * improve cache locality. Typically used with SYNC wakeups as
23 * generated by pipes and the like, see also SYNC_WAKEUPS. 18 * generated by pipes and the like, see also SYNC_WAKEUPS.
24 */ 19 */
25SCHED_FEAT(AFFINE_WAKEUPS, 1) 20SCHED_FEAT(AFFINE_WAKEUPS, true)
26 21
27/* 22/*
28 * Prefer to schedule the task we woke last (assuming it failed 23 * Prefer to schedule the task we woke last (assuming it failed
29 * wakeup-preemption), since its likely going to consume data we 24 * wakeup-preemption), since its likely going to consume data we
30 * touched, increases cache locality. 25 * touched, increases cache locality.
31 */ 26 */
32SCHED_FEAT(NEXT_BUDDY, 0) 27SCHED_FEAT(NEXT_BUDDY, false)
33 28
34/* 29/*
35 * Prefer to schedule the task that ran last (when we did 30 * Prefer to schedule the task that ran last (when we did
36 * wake-preempt) as that likely will touch the same data, increases 31 * wake-preempt) as that likely will touch the same data, increases
37 * cache locality. 32 * cache locality.
38 */ 33 */
39SCHED_FEAT(LAST_BUDDY, 1) 34SCHED_FEAT(LAST_BUDDY, true)
40 35
41/* 36/*
42 * Consider buddies to be cache hot, decreases the likelyness of a 37 * Consider buddies to be cache hot, decreases the likelyness of a
43 * cache buddy being migrated away, increases cache locality. 38 * cache buddy being migrated away, increases cache locality.
44 */ 39 */
45SCHED_FEAT(CACHE_HOT_BUDDY, 1) 40SCHED_FEAT(CACHE_HOT_BUDDY, true)
46 41
47/* 42/*
48 * Use arch dependent cpu power functions 43 * Use arch dependent cpu power functions
49 */ 44 */
50SCHED_FEAT(ARCH_POWER, 0) 45SCHED_FEAT(ARCH_POWER, false)
51 46
52SCHED_FEAT(HRTICK, 0) 47SCHED_FEAT(HRTICK, false)
53SCHED_FEAT(DOUBLE_TICK, 0) 48SCHED_FEAT(DOUBLE_TICK, false)
54SCHED_FEAT(LB_BIAS, 1) 49SCHED_FEAT(LB_BIAS, true)
55 50
56/* 51/*
57 * Spin-wait on mutex acquisition when the mutex owner is running on 52 * Spin-wait on mutex acquisition when the mutex owner is running on
58 * another cpu -- assumes that when the owner is running, it will soon 53 * another cpu -- assumes that when the owner is running, it will soon
59 * release the lock. Decreases scheduling overhead. 54 * release the lock. Decreases scheduling overhead.
60 */ 55 */
61SCHED_FEAT(OWNER_SPIN, 1) 56SCHED_FEAT(OWNER_SPIN, true)
62 57
63/* 58/*
64 * Decrement CPU power based on time not spent running tasks 59 * Decrement CPU power based on time not spent running tasks
65 */ 60 */
66SCHED_FEAT(NONTASK_POWER, 1) 61SCHED_FEAT(NONTASK_POWER, true)
67 62
68/* 63/*
69 * Queue remote wakeups on the target CPU and process them 64 * Queue remote wakeups on the target CPU and process them
70 * using the scheduler IPI. Reduces rq->lock contention/bounces. 65 * using the scheduler IPI. Reduces rq->lock contention/bounces.
71 */ 66 */
72SCHED_FEAT(TTWU_QUEUE, 1) 67SCHED_FEAT(TTWU_QUEUE, true)
73 68
74SCHED_FEAT(FORCE_SD_OVERLAP, 0) 69SCHED_FEAT(FORCE_SD_OVERLAP, false)
70SCHED_FEAT(RT_RUNTIME_SHARE, true)
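
The features file above is an X-macro list: the scheduler consumes each SCHED_FEAT(name, default) line through more than one expansion of SCHED_FEAT(), which is part of why boolean defaults read better than bare 0/1. Below is a toy version of that expansion, generating an enum of feature indices and a default-enabled mask from a single list; the feature subset and macro names are illustrative, not the kernel's.

/*
 * Toy X-macro expansion of a SCHED_FEAT()-style list into an enum of
 * indices and a default-enabled bitmask.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_FEATURES(F)              \
    F(GENTLE_FAIR_SLEEPERS, true)    \
    F(START_DEBIT,          true)    \
    F(NEXT_BUDDY,           false)   \
    F(HRTICK,               false)

/* Pass 1: an index per feature. */
#define MAKE_ENUM(name, enabled) FEAT_##name,
enum { TOY_FEATURES(MAKE_ENUM) NR_FEATS };

/* Pass 2: OR the enabled ones into the default mask. */
#define MAKE_MASK(name, enabled) ((enabled) ? 1u << FEAT_##name : 0u) |
static const unsigned int default_feats = TOY_FEATURES(MAKE_MASK) 0u;

#define feat_enabled(name) (!!(default_feats & (1u << FEAT_##name)))

int main(void)
{
    printf("START_DEBIT=%d NEXT_BUDDY=%d (%d features)\n",
           feat_enabled(START_DEBIT), feat_enabled(NEXT_BUDDY), NR_FEATS);
    return 0;
}
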
diff --git a/kernel/sched_idletask.c b/kernel/sched/idle_task.c
index 0a51882534ea..91b4c957f289 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched/idle_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * idle-task scheduling class. 4 * idle-task scheduling class.
3 * 5 *
@@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
71/* 73/*
72 * Simple, special scheduling class for the per-CPU idle tasks: 74 * Simple, special scheduling class for the per-CPU idle tasks:
73 */ 75 */
74static const struct sched_class idle_sched_class = { 76const struct sched_class idle_sched_class = {
75 /* .next is NULL */ 77 /* .next is NULL */
76 /* no enqueue/yield_task for idle tasks */ 78 /* no enqueue/yield_task for idle tasks */
77 79
diff --git a/kernel/sched_rt.c b/kernel/sched/rt.c
index af1177858be3..3640ebbb466b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched/rt.c
@@ -3,7 +3,92 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#include "sched.h"
7
8#include <linux/slab.h>
9
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11
12struct rt_bandwidth def_rt_bandwidth;
13
14static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
15{
16 struct rt_bandwidth *rt_b =
17 container_of(timer, struct rt_bandwidth, rt_period_timer);
18 ktime_t now;
19 int overrun;
20 int idle = 0;
21
22 for (;;) {
23 now = hrtimer_cb_get_time(timer);
24 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
25
26 if (!overrun)
27 break;
28
29 idle = do_sched_rt_period_timer(rt_b, overrun);
30 }
31
32 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
33}
34
35void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
36{
37 rt_b->rt_period = ns_to_ktime(period);
38 rt_b->rt_runtime = runtime;
39
40 raw_spin_lock_init(&rt_b->rt_runtime_lock);
41
42 hrtimer_init(&rt_b->rt_period_timer,
43 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
44 rt_b->rt_period_timer.function = sched_rt_period_timer;
45}
46
47static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
48{
49 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
50 return;
51
52 if (hrtimer_active(&rt_b->rt_period_timer))
53 return;
54
55 raw_spin_lock(&rt_b->rt_runtime_lock);
56 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
57 raw_spin_unlock(&rt_b->rt_runtime_lock);
58}
59
60void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
61{
62 struct rt_prio_array *array;
63 int i;
64
65 array = &rt_rq->active;
66 for (i = 0; i < MAX_RT_PRIO; i++) {
67 INIT_LIST_HEAD(array->queue + i);
68 __clear_bit(i, array->bitmap);
69 }
70 /* delimiter for bitsearch: */
71 __set_bit(MAX_RT_PRIO, array->bitmap);
72
73#if defined CONFIG_SMP
74 rt_rq->highest_prio.curr = MAX_RT_PRIO;
75 rt_rq->highest_prio.next = MAX_RT_PRIO;
76 rt_rq->rt_nr_migratory = 0;
77 rt_rq->overloaded = 0;
78 plist_head_init(&rt_rq->pushable_tasks);
79#endif
80
81 rt_rq->rt_time = 0;
82 rt_rq->rt_throttled = 0;
83 rt_rq->rt_runtime = 0;
84 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
85}
86
6#ifdef CONFIG_RT_GROUP_SCHED 87#ifdef CONFIG_RT_GROUP_SCHED
88static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
89{
90 hrtimer_cancel(&rt_b->rt_period_timer);
91}
7 92
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) 93#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9 94
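
sched_rt_period_timer() above repeatedly forwards the hrtimer by one rt_period and hands each overrun to do_sched_rt_period_timer(), so a late wakeup still replenishes once per elapsed period. The sketch below models that catch-up loop with plain microsecond counters; the refill rule shown is a simplification of the real accounting, and the numbers are made up.

/*
 * Sketch of the replenishment loop in sched_rt_period_timer() above:
 * push the expiry forward one period at a time and hand out runtime
 * once per elapsed period ("overrun").
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t timer_forward(uint64_t *expires, uint64_t now, uint64_t period)
{
    uint64_t overruns = 0;

    while (*expires <= now) {       /* catch up if we woke late */
        *expires += period;
        overruns++;
    }
    return overruns;
}

int main(void)
{
    uint64_t period_us  = 1000000;  /* 1s RT period    */
    uint64_t runtime_us = 950000;   /* 0.95s RT budget */
    uint64_t expires = period_us;
    uint64_t spent = 1400000;       /* rt_time consumed so far */
    uint64_t now = 3200000;         /* woke up 3.2s in, i.e. late */

    uint64_t overruns = timer_forward(&expires, now, period_us);

    /* One refill per overrun, never letting the balance go negative. */
    while (overruns--)
        spent = spent > runtime_us ? spent - runtime_us : 0;

    printf("next expiry at %" PRIu64 "us, remaining rt_time %" PRIu64 "us\n",
           expires, spent);
    return 0;
}
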
@@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
25 return rt_se->rt_rq; 110 return rt_se->rt_rq;
26} 111}
27 112
113void free_rt_sched_group(struct task_group *tg)
114{
115 int i;
116
117 if (tg->rt_se)
118 destroy_rt_bandwidth(&tg->rt_bandwidth);
119
120 for_each_possible_cpu(i) {
121 if (tg->rt_rq)
122 kfree(tg->rt_rq[i]);
123 if (tg->rt_se)
124 kfree(tg->rt_se[i]);
125 }
126
127 kfree(tg->rt_rq);
128 kfree(tg->rt_se);
129}
130
131void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
132 struct sched_rt_entity *rt_se, int cpu,
133 struct sched_rt_entity *parent)
134{
135 struct rq *rq = cpu_rq(cpu);
136
137 rt_rq->highest_prio.curr = MAX_RT_PRIO;
138 rt_rq->rt_nr_boosted = 0;
139 rt_rq->rq = rq;
140 rt_rq->tg = tg;
141
142 tg->rt_rq[cpu] = rt_rq;
143 tg->rt_se[cpu] = rt_se;
144
145 if (!rt_se)
146 return;
147
148 if (!parent)
149 rt_se->rt_rq = &rq->rt;
150 else
151 rt_se->rt_rq = parent->my_q;
152
153 rt_se->my_q = rt_rq;
154 rt_se->parent = parent;
155 INIT_LIST_HEAD(&rt_se->run_list);
156}
157
158int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
159{
160 struct rt_rq *rt_rq;
161 struct sched_rt_entity *rt_se;
162 int i;
163
164 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
165 if (!tg->rt_rq)
166 goto err;
167 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
168 if (!tg->rt_se)
169 goto err;
170
171 init_rt_bandwidth(&tg->rt_bandwidth,
172 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
173
174 for_each_possible_cpu(i) {
175 rt_rq = kzalloc_node(sizeof(struct rt_rq),
176 GFP_KERNEL, cpu_to_node(i));
177 if (!rt_rq)
178 goto err;
179
180 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
181 GFP_KERNEL, cpu_to_node(i));
182 if (!rt_se)
183 goto err_free_rq;
184
185 init_rt_rq(rt_rq, cpu_rq(i));
186 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
187 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
188 }
189
190 return 1;
191
192err_free_rq:
193 kfree(rt_rq);
194err:
195 return 0;
196}
197
28#else /* CONFIG_RT_GROUP_SCHED */ 198#else /* CONFIG_RT_GROUP_SCHED */
29 199
30#define rt_entity_is_task(rt_se) (1) 200#define rt_entity_is_task(rt_se) (1)
@@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
47 return &rq->rt; 217 return &rq->rt;
48} 218}
49 219
220void free_rt_sched_group(struct task_group *tg) { }
221
222int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
223{
224 return 1;
225}
50#endif /* CONFIG_RT_GROUP_SCHED */ 226#endif /* CONFIG_RT_GROUP_SCHED */
51 227
52#ifdef CONFIG_SMP 228#ifdef CONFIG_SMP
@@ -124,21 +300,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
124 update_rt_migration(rt_rq); 300 update_rt_migration(rt_rq);
125} 301}
126 302
303static inline int has_pushable_tasks(struct rq *rq)
304{
305 return !plist_head_empty(&rq->rt.pushable_tasks);
306}
307
127static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 308static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
128{ 309{
129 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 310 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
130 plist_node_init(&p->pushable_tasks, p->prio); 311 plist_node_init(&p->pushable_tasks, p->prio);
131 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); 312 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
313
314 /* Update the highest prio pushable task */
315 if (p->prio < rq->rt.highest_prio.next)
316 rq->rt.highest_prio.next = p->prio;
132} 317}
133 318
134static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 319static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
135{ 320{
136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 321 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
137}
138 322
139static inline int has_pushable_tasks(struct rq *rq) 323 /* Update the new highest prio pushable task */
140{ 324 if (has_pushable_tasks(rq)) {
141 return !plist_head_empty(&rq->rt.pushable_tasks); 325 p = plist_first_entry(&rq->rt.pushable_tasks,
326 struct task_struct, pushable_tasks);
327 rq->rt.highest_prio.next = p->prio;
328 } else
329 rq->rt.highest_prio.next = MAX_RT_PRIO;
142} 330}
143 331
144#else 332#else
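
The hunk above keeps rq->rt.highest_prio.next in step with the pushable-task plist: on enqueue the new priority only needs comparing against the cached value, and on dequeue the new head of the sorted list (or MAX_RT_PRIO when the list is empty) becomes the next-highest. Below is a sketch of that bookkeeping with a tiny sorted array standing in for the plist; dequeueing only the head is a simplification, since the kernel removes arbitrary tasks.

/*
 * Sketch of the highest_prio.next bookkeeping added above. Lower
 * numbers mean higher priority, and MAX_RT_PRIO plays the "none" value.
 */
#include <stdio.h>

#define MAX_RT_PRIO 100

static int pushable[16];            /* priorities, kept sorted ascending */
static int nr_pushable;
static int highest_prio_next = MAX_RT_PRIO;

static void enqueue_pushable(int prio)
{
    int i = nr_pushable++;

    while (i > 0 && pushable[i - 1] > prio) {   /* insert in sorted order */
        pushable[i] = pushable[i - 1];
        i--;
    }
    pushable[i] = prio;

    if (prio < highest_prio_next)               /* new head => new "next" */
        highest_prio_next = prio;
}

static void dequeue_pushable(void)              /* drop the head entry */
{
    int i;

    for (i = 1; i < nr_pushable; i++)
        pushable[i - 1] = pushable[i];
    if (nr_pushable > 0)
        nr_pushable--;

    highest_prio_next = nr_pushable ? pushable[0] : MAX_RT_PRIO;
}

int main(void)
{
    enqueue_pushable(40);
    enqueue_pushable(10);
    printf("next=%d\n", highest_prio_next);     /* 10 */
    dequeue_pushable();
    printf("next=%d\n", highest_prio_next);     /* 40 */
    dequeue_pushable();
    printf("next=%d\n", highest_prio_next);     /* 100 = none */
    return 0;
}
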
@@ -544,10 +732,35 @@ static void enable_runtime(struct rq *rq)
544 raw_spin_unlock_irqrestore(&rq->lock, flags); 732 raw_spin_unlock_irqrestore(&rq->lock, flags);
545} 733}
546 734
735int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
736{
737 int cpu = (int)(long)hcpu;
738
739 switch (action) {
740 case CPU_DOWN_PREPARE:
741 case CPU_DOWN_PREPARE_FROZEN:
742 disable_runtime(cpu_rq(cpu));
743 return NOTIFY_OK;
744
745 case CPU_DOWN_FAILED:
746 case CPU_DOWN_FAILED_FROZEN:
747 case CPU_ONLINE:
748 case CPU_ONLINE_FROZEN:
749 enable_runtime(cpu_rq(cpu));
750 return NOTIFY_OK;
751
752 default:
753 return NOTIFY_DONE;
754 }
755}
756
547static int balance_runtime(struct rt_rq *rt_rq) 757static int balance_runtime(struct rt_rq *rt_rq)
548{ 758{
549 int more = 0; 759 int more = 0;
550 760
761 if (!sched_feat(RT_RUNTIME_SHARE))
762 return more;
763
551 if (rt_rq->rt_time > rt_rq->rt_runtime) { 764 if (rt_rq->rt_time > rt_rq->rt_runtime) {
552 raw_spin_unlock(&rt_rq->rt_runtime_lock); 765 raw_spin_unlock(&rt_rq->rt_runtime_lock);
553 more = do_balance_runtime(rt_rq); 766 more = do_balance_runtime(rt_rq);
@@ -633,7 +846,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
633 if (rt_rq->rt_throttled) 846 if (rt_rq->rt_throttled)
634 return rt_rq_throttled(rt_rq); 847 return rt_rq_throttled(rt_rq);
635 848
636 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) 849 if (runtime >= sched_rt_period(rt_rq))
637 return 0; 850 return 0;
638 851
639 balance_runtime(rt_rq); 852 balance_runtime(rt_rq);
@@ -643,6 +856,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
643 856
644 if (rt_rq->rt_time > runtime) { 857 if (rt_rq->rt_time > runtime) {
645 rt_rq->rt_throttled = 1; 858 rt_rq->rt_throttled = 1;
859 printk_once(KERN_WARNING "sched: RT throttling activated\n");
646 if (rt_rq_throttled(rt_rq)) { 860 if (rt_rq_throttled(rt_rq)) {
647 sched_rt_rq_dequeue(rt_rq); 861 sched_rt_rq_dequeue(rt_rq);
648 return 1; 862 return 1;
@@ -698,47 +912,13 @@ static void update_curr_rt(struct rq *rq)
698 912
699#if defined CONFIG_SMP 913#if defined CONFIG_SMP
700 914
701static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
702
703static inline int next_prio(struct rq *rq)
704{
705 struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
706
707 if (next && rt_prio(next->prio))
708 return next->prio;
709 else
710 return MAX_RT_PRIO;
711}
712
713static void 915static void
714inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 916inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
715{ 917{
716 struct rq *rq = rq_of_rt_rq(rt_rq); 918 struct rq *rq = rq_of_rt_rq(rt_rq);
717 919
718 if (prio < prev_prio) { 920 if (rq->online && prio < prev_prio)
719 921 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
720 /*
721 * If the new task is higher in priority than anything on the
722 * run-queue, we know that the previous high becomes our
723 * next-highest.
724 */
725 rt_rq->highest_prio.next = prev_prio;
726
727 if (rq->online)
728 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
729
730 } else if (prio == rt_rq->highest_prio.curr)
731 /*
732 * If the next task is equal in priority to the highest on
733 * the run-queue, then we implicitly know that the next highest
734 * task cannot be any lower than current
735 */
736 rt_rq->highest_prio.next = prio;
737 else if (prio < rt_rq->highest_prio.next)
738 /*
739 * Otherwise, we need to recompute next-highest
740 */
741 rt_rq->highest_prio.next = next_prio(rq);
742} 922}
743 923
744static void 924static void
@@ -746,9 +926,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
746{ 926{
747 struct rq *rq = rq_of_rt_rq(rt_rq); 927 struct rq *rq = rq_of_rt_rq(rt_rq);
748 928
749 if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
750 rt_rq->highest_prio.next = next_prio(rq);
751
752 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 929 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
753 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 930 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
754} 931}
@@ -961,6 +1138,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
961 1138
962 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 1139 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
963 enqueue_pushable_task(rq, p); 1140 enqueue_pushable_task(rq, p);
1141
1142 inc_nr_running(rq);
964} 1143}
965 1144
966static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1145static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -971,11 +1150,13 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
971 dequeue_rt_entity(rt_se); 1150 dequeue_rt_entity(rt_se);
972 1151
973 dequeue_pushable_task(rq, p); 1152 dequeue_pushable_task(rq, p);
1153
1154 dec_nr_running(rq);
974} 1155}
975 1156
976/* 1157/*
977 * Put task to the end of the run list without the overhead of dequeue 1158 * Put task to the head or the end of the run list without the overhead of
978 * followed by enqueue. 1159 * dequeue followed by enqueue.
979 */ 1160 */
980static void 1161static void
981requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) 1162requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
@@ -1017,10 +1198,15 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1017 struct rq *rq; 1198 struct rq *rq;
1018 int cpu; 1199 int cpu;
1019 1200
1020 if (sd_flag != SD_BALANCE_WAKE)
1021 return smp_processor_id();
1022
1023 cpu = task_cpu(p); 1201 cpu = task_cpu(p);
1202
1203 if (p->rt.nr_cpus_allowed == 1)
1204 goto out;
1205
1206 /* For anything but wake ups, just return the task_cpu */
1207 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1208 goto out;
1209
1024 rq = cpu_rq(cpu); 1210 rq = cpu_rq(cpu);
1025 1211
1026 rcu_read_lock(); 1212 rcu_read_lock();
@@ -1059,6 +1245,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1059 } 1245 }
1060 rcu_read_unlock(); 1246 rcu_read_unlock();
1061 1247
1248out:
1062 return cpu; 1249 return cpu;
1063} 1250}
1064 1251
@@ -1178,7 +1365,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1178static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 1365static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1179{ 1366{
1180 update_curr_rt(rq); 1367 update_curr_rt(rq);
1181 p->se.exec_start = 0;
1182 1368
1183 /* 1369 /*
1184 * The previous task needs to be made eligible for pushing 1370 * The previous task needs to be made eligible for pushing
@@ -1193,12 +1379,10 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1193/* Only try algorithms three times */ 1379/* Only try algorithms three times */
1194#define RT_MAX_TRIES 3 1380#define RT_MAX_TRIES 3
1195 1381
1196static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1197
1198static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1382static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1199{ 1383{
1200 if (!task_running(rq, p) && 1384 if (!task_running(rq, p) &&
1201 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && 1385 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1202 (p->rt.nr_cpus_allowed > 1)) 1386 (p->rt.nr_cpus_allowed > 1))
1203 return 1; 1387 return 1;
1204 return 0; 1388 return 0;
@@ -1343,7 +1527,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1343 */ 1527 */
1344 if (unlikely(task_rq(task) != rq || 1528 if (unlikely(task_rq(task) != rq ||
1345 !cpumask_test_cpu(lowest_rq->cpu, 1529 !cpumask_test_cpu(lowest_rq->cpu,
1346 &task->cpus_allowed) || 1530 tsk_cpus_allowed(task)) ||
1347 task_running(rq, task) || 1531 task_running(rq, task) ||
1348 !task->on_rq)) { 1532 !task->on_rq)) {
1349 1533
@@ -1394,6 +1578,7 @@ static int push_rt_task(struct rq *rq)
1394{ 1578{
1395 struct task_struct *next_task; 1579 struct task_struct *next_task;
1396 struct rq *lowest_rq; 1580 struct rq *lowest_rq;
1581 int ret = 0;
1397 1582
1398 if (!rq->rt.overloaded) 1583 if (!rq->rt.overloaded)
1399 return 0; 1584 return 0;
@@ -1426,7 +1611,7 @@ retry:
1426 if (!lowest_rq) { 1611 if (!lowest_rq) {
1427 struct task_struct *task; 1612 struct task_struct *task;
1428 /* 1613 /*
1429 * find lock_lowest_rq releases rq->lock 1614 * find_lock_lowest_rq releases rq->lock
1430 * so it is possible that next_task has migrated. 1615 * so it is possible that next_task has migrated.
1431 * 1616 *
1432 * We need to make sure that the task is still on the same 1617 * We need to make sure that the task is still on the same
@@ -1436,12 +1621,11 @@ retry:
1436 task = pick_next_pushable_task(rq); 1621 task = pick_next_pushable_task(rq);
1437 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1622 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1438 /* 1623 /*
1439 * If we get here, the task hasn't moved at all, but 1624 * The task hasn't migrated, and is still the next
1440 * it has failed to push. We will not try again, 1625 * eligible task, but we failed to find a run-queue
1441 * since the other cpus will pull from us when they 1626 * to push it to. Do not retry in this case, since
1442 * are ready. 1627 * other cpus will pull from us when ready.
1443 */ 1628 */
1444 dequeue_pushable_task(rq, next_task);
1445 goto out; 1629 goto out;
1446 } 1630 }
1447 1631
@@ -1460,6 +1644,7 @@ retry:
1460 deactivate_task(rq, next_task, 0); 1644 deactivate_task(rq, next_task, 0);
1461 set_task_cpu(next_task, lowest_rq->cpu); 1645 set_task_cpu(next_task, lowest_rq->cpu);
1462 activate_task(lowest_rq, next_task, 0); 1646 activate_task(lowest_rq, next_task, 0);
1647 ret = 1;
1463 1648
1464 resched_task(lowest_rq->curr); 1649 resched_task(lowest_rq->curr);
1465 1650
@@ -1468,7 +1653,7 @@ retry:
1468out: 1653out:
1469 put_task_struct(next_task); 1654 put_task_struct(next_task);
1470 1655
1471 return 1; 1656 return ret;
1472} 1657}
1473 1658
1474static void push_rt_tasks(struct rq *rq) 1659static void push_rt_tasks(struct rq *rq)
@@ -1626,9 +1811,6 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1626 1811
1627 update_rt_migration(&rq->rt); 1812 update_rt_migration(&rq->rt);
1628 } 1813 }
1629
1630 cpumask_copy(&p->cpus_allowed, new_mask);
1631 p->rt.nr_cpus_allowed = weight;
1632} 1814}
1633 1815
1634/* Assumes rq->lock is held */ 1816/* Assumes rq->lock is held */
@@ -1670,13 +1852,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1670 pull_rt_task(rq); 1852 pull_rt_task(rq);
1671} 1853}
1672 1854
1673static inline void init_sched_rt_class(void) 1855void init_sched_rt_class(void)
1674{ 1856{
1675 unsigned int i; 1857 unsigned int i;
1676 1858
1677 for_each_possible_cpu(i) 1859 for_each_possible_cpu(i) {
1678 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 1860 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1679 GFP_KERNEL, cpu_to_node(i)); 1861 GFP_KERNEL, cpu_to_node(i));
1862 }
1680} 1863}
1681#endif /* CONFIG_SMP */ 1864#endif /* CONFIG_SMP */
1682 1865
@@ -1817,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1817 return 0; 2000 return 0;
1818} 2001}
1819 2002
1820static const struct sched_class rt_sched_class = { 2003const struct sched_class rt_sched_class = {
1821 .next = &fair_sched_class, 2004 .next = &fair_sched_class,
1822 .enqueue_task = enqueue_task_rt, 2005 .enqueue_task = enqueue_task_rt,
1823 .dequeue_task = dequeue_task_rt, 2006 .dequeue_task = dequeue_task_rt,
@@ -1852,7 +2035,7 @@ static const struct sched_class rt_sched_class = {
1852#ifdef CONFIG_SCHED_DEBUG 2035#ifdef CONFIG_SCHED_DEBUG
1853extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); 2036extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1854 2037
1855static void print_rt_stats(struct seq_file *m, int cpu) 2038void print_rt_stats(struct seq_file *m, int cpu)
1856{ 2039{
1857 rt_rq_iter_t iter; 2040 rt_rq_iter_t iter;
1858 struct rt_rq *rt_rq; 2041 struct rt_rq *rt_rq;
@@ -1863,4 +2046,3 @@ static void print_rt_stats(struct seq_file *m, int cpu)
1863 rcu_read_unlock(); 2046 rcu_read_unlock();
1864} 2047}
1865#endif /* CONFIG_SCHED_DEBUG */ 2048#endif /* CONFIG_SCHED_DEBUG */
1866
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
new file mode 100644
index 000000000000..98c0c2623db8
--- /dev/null
+++ b/kernel/sched/sched.h
@@ -0,0 +1,1166 @@
1
2#include <linux/sched.h>
3#include <linux/mutex.h>
4#include <linux/spinlock.h>
5#include <linux/stop_machine.h>
6
7#include "cpupri.h"
8
9extern __read_mostly int scheduler_running;
10
11/*
12 * Convert user-nice values [ -20 ... 0 ... 19 ]
13 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
14 * and back.
15 */
16#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
17#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
18#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
19
20/*
21 * 'User priority' is the nice value converted to something we
22 * can work with better when scaling various scheduler parameters,
23 * it's a [ 0 ... 39 ] range.
24 */
25#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
26#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
27#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
28
29/*
30 * Helpers for converting nanosecond timing to jiffy resolution
31 */
32#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
33
34#define NICE_0_LOAD SCHED_LOAD_SCALE
35#define NICE_0_SHIFT SCHED_LOAD_SHIFT
36
37/*
38 * These are the 'tuning knobs' of the scheduler:
39 *
40 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
41 * Timeslices get refilled after they expire.
42 */
43#define DEF_TIMESLICE (100 * HZ / 1000)
44
45/*
  46 * Single value that denotes runtime == period, i.e. unlimited time.
47 */
48#define RUNTIME_INF ((u64)~0ULL)
49
50static inline int rt_policy(int policy)
51{
52 if (policy == SCHED_FIFO || policy == SCHED_RR)
53 return 1;
54 return 0;
55}
56
57static inline int task_has_rt_policy(struct task_struct *p)
58{
59 return rt_policy(p->policy);
60}
61
62/*
63 * This is the priority-queue data structure of the RT scheduling class:
64 */
65struct rt_prio_array {
66 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
67 struct list_head queue[MAX_RT_PRIO];
68};
69
70struct rt_bandwidth {
71 /* nests inside the rq lock: */
72 raw_spinlock_t rt_runtime_lock;
73 ktime_t rt_period;
74 u64 rt_runtime;
75 struct hrtimer rt_period_timer;
76};
77
78extern struct mutex sched_domains_mutex;
79
80#ifdef CONFIG_CGROUP_SCHED
81
82#include <linux/cgroup.h>
83
84struct cfs_rq;
85struct rt_rq;
86
87static LIST_HEAD(task_groups);
88
89struct cfs_bandwidth {
90#ifdef CONFIG_CFS_BANDWIDTH
91 raw_spinlock_t lock;
92 ktime_t period;
93 u64 quota, runtime;
94 s64 hierarchal_quota;
95 u64 runtime_expires;
96
97 int idle, timer_active;
98 struct hrtimer period_timer, slack_timer;
99 struct list_head throttled_cfs_rq;
100
101 /* statistics */
102 int nr_periods, nr_throttled;
103 u64 throttled_time;
104#endif
105};
106
107/* task group related information */
108struct task_group {
109 struct cgroup_subsys_state css;
110
111#ifdef CONFIG_FAIR_GROUP_SCHED
112 /* schedulable entities of this group on each cpu */
113 struct sched_entity **se;
114 /* runqueue "owned" by this group on each cpu */
115 struct cfs_rq **cfs_rq;
116 unsigned long shares;
117
118 atomic_t load_weight;
119#endif
120
121#ifdef CONFIG_RT_GROUP_SCHED
122 struct sched_rt_entity **rt_se;
123 struct rt_rq **rt_rq;
124
125 struct rt_bandwidth rt_bandwidth;
126#endif
127
128 struct rcu_head rcu;
129 struct list_head list;
130
131 struct task_group *parent;
132 struct list_head siblings;
133 struct list_head children;
134
135#ifdef CONFIG_SCHED_AUTOGROUP
136 struct autogroup *autogroup;
137#endif
138
139 struct cfs_bandwidth cfs_bandwidth;
140};
141
142#ifdef CONFIG_FAIR_GROUP_SCHED
143#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
144
145/*
 146 * A weight of 0 or 1 can cause arithmetic problems.
 147 * The weight of a cfs_rq is the sum of the weights of the entities
 148 * queued on it, so the weight of an entity should not be too large,
 149 * and neither should the shares value of a task group.
150 * (The default weight is 1024 - so there's no practical
151 * limitation from this.)
152 */
153#define MIN_SHARES (1UL << 1)
154#define MAX_SHARES (1UL << 18)
155#endif
156
157/* Default task group.
 158 * Every task in the system belongs to this group at bootup.
159 */
160extern struct task_group root_task_group;
161
162typedef int (*tg_visitor)(struct task_group *, void *);
163
164extern int walk_tg_tree_from(struct task_group *from,
165 tg_visitor down, tg_visitor up, void *data);
166
167/*
168 * Iterate the full tree, calling @down when first entering a node and @up when
169 * leaving it for the final time.
170 *
171 * Caller must hold rcu_lock or sufficient equivalent.
172 */
173static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
174{
175 return walk_tg_tree_from(&root_task_group, down, up, data);
176}
177
178extern int tg_nop(struct task_group *tg, void *data);
179
180extern void free_fair_sched_group(struct task_group *tg);
181extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
182extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
183extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
184 struct sched_entity *se, int cpu,
185 struct sched_entity *parent);
186extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
187extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
188
189extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
190extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
191extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
192
193extern void free_rt_sched_group(struct task_group *tg);
194extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
195extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
196 struct sched_rt_entity *rt_se, int cpu,
197 struct sched_rt_entity *parent);
198
199#else /* CONFIG_CGROUP_SCHED */
200
201struct cfs_bandwidth { };
202
203#endif /* CONFIG_CGROUP_SCHED */
204
205/* CFS-related fields in a runqueue */
206struct cfs_rq {
207 struct load_weight load;
208 unsigned long nr_running, h_nr_running;
209
210 u64 exec_clock;
211 u64 min_vruntime;
212#ifndef CONFIG_64BIT
213 u64 min_vruntime_copy;
214#endif
215
216 struct rb_root tasks_timeline;
217 struct rb_node *rb_leftmost;
218
219 struct list_head tasks;
220 struct list_head *balance_iterator;
221
222 /*
223 * 'curr' points to currently running entity on this cfs_rq.
224 * It is set to NULL otherwise (i.e when none are currently running).
225 */
226 struct sched_entity *curr, *next, *last, *skip;
227
228#ifdef CONFIG_SCHED_DEBUG
229 unsigned int nr_spread_over;
230#endif
231
232#ifdef CONFIG_FAIR_GROUP_SCHED
233 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
234
235 /*
236 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
237 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
238 * (like users, containers etc.)
239 *
240 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
241 * list is used during load balance.
242 */
243 int on_list;
244 struct list_head leaf_cfs_rq_list;
245 struct task_group *tg; /* group that "owns" this runqueue */
246
247#ifdef CONFIG_SMP
248 /*
249 * the part of load.weight contributed by tasks
250 */
251 unsigned long task_weight;
252
253 /*
254 * h_load = weight * f(tg)
255 *
256 * Where f(tg) is the recursive weight fraction assigned to
257 * this group.
258 */
259 unsigned long h_load;
260
261 /*
262 * Maintaining per-cpu shares distribution for group scheduling
263 *
264 * load_stamp is the last time we updated the load average
265 * load_last is the last time we updated the load average and saw load
266 * load_unacc_exec_time is currently unaccounted execution time
267 */
268 u64 load_avg;
269 u64 load_period;
270 u64 load_stamp, load_last, load_unacc_exec_time;
271
272 unsigned long load_contribution;
273#endif /* CONFIG_SMP */
274#ifdef CONFIG_CFS_BANDWIDTH
275 int runtime_enabled;
276 u64 runtime_expires;
277 s64 runtime_remaining;
278
279 u64 throttled_timestamp;
280 int throttled, throttle_count;
281 struct list_head throttled_list;
282#endif /* CONFIG_CFS_BANDWIDTH */
283#endif /* CONFIG_FAIR_GROUP_SCHED */
284};
285
286static inline int rt_bandwidth_enabled(void)
287{
288 return sysctl_sched_rt_runtime >= 0;
289}
290
291/* Real-Time classes' related field in a runqueue: */
292struct rt_rq {
293 struct rt_prio_array active;
294 unsigned long rt_nr_running;
295#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
296 struct {
297 int curr; /* highest queued rt task prio */
298#ifdef CONFIG_SMP
299 int next; /* next highest */
300#endif
301 } highest_prio;
302#endif
303#ifdef CONFIG_SMP
304 unsigned long rt_nr_migratory;
305 unsigned long rt_nr_total;
306 int overloaded;
307 struct plist_head pushable_tasks;
308#endif
309 int rt_throttled;
310 u64 rt_time;
311 u64 rt_runtime;
312 /* Nests inside the rq lock: */
313 raw_spinlock_t rt_runtime_lock;
314
315#ifdef CONFIG_RT_GROUP_SCHED
316 unsigned long rt_nr_boosted;
317
318 struct rq *rq;
319 struct list_head leaf_rt_rq_list;
320 struct task_group *tg;
321#endif
322};
323
324#ifdef CONFIG_SMP
325
326/*
327 * We add the notion of a root-domain which will be used to define per-domain
328 * variables. Each exclusive cpuset essentially defines an island domain by
329 * fully partitioning the member cpus from any other cpuset. Whenever a new
330 * exclusive cpuset is created, we also create and attach a new root-domain
331 * object.
332 *
333 */
334struct root_domain {
335 atomic_t refcount;
336 atomic_t rto_count;
337 struct rcu_head rcu;
338 cpumask_var_t span;
339 cpumask_var_t online;
340
341 /*
342 * The "RT overload" flag: it gets set if a CPU has more than
343 * one runnable RT task.
344 */
345 cpumask_var_t rto_mask;
346 struct cpupri cpupri;
347};
348
349extern struct root_domain def_root_domain;
350
351#endif /* CONFIG_SMP */
352
353/*
354 * This is the main, per-CPU runqueue data structure.
355 *
 356 * Locking rule: in code paths that lock multiple runqueues (such as
 357 * the load balancing or the thread migration code), lock acquire
 358 * operations must be ordered by ascending runqueue address.
359 */
360struct rq {
361 /* runqueue lock: */
362 raw_spinlock_t lock;
363
364 /*
365 * nr_running and cpu_load should be in the same cacheline because
366 * remote CPUs use both these fields when doing load calculation.
367 */
368 unsigned long nr_running;
369 #define CPU_LOAD_IDX_MAX 5
370 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
371 unsigned long last_load_update_tick;
372#ifdef CONFIG_NO_HZ
373 u64 nohz_stamp;
374 unsigned long nohz_flags;
375#endif
376 int skip_clock_update;
377
378 /* capture load from *all* tasks on this cpu: */
379 struct load_weight load;
380 unsigned long nr_load_updates;
381 u64 nr_switches;
382
383 struct cfs_rq cfs;
384 struct rt_rq rt;
385
386#ifdef CONFIG_FAIR_GROUP_SCHED
387 /* list of leaf cfs_rq on this cpu: */
388 struct list_head leaf_cfs_rq_list;
389#endif
390#ifdef CONFIG_RT_GROUP_SCHED
391 struct list_head leaf_rt_rq_list;
392#endif
393
394 /*
395 * This is part of a global counter where only the total sum
396 * over all CPUs matters. A task can increase this counter on
397 * one CPU and if it got migrated afterwards it may decrease
398 * it on another CPU. Always updated under the runqueue lock:
399 */
400 unsigned long nr_uninterruptible;
401
402 struct task_struct *curr, *idle, *stop;
403 unsigned long next_balance;
404 struct mm_struct *prev_mm;
405
406 u64 clock;
407 u64 clock_task;
408
409 atomic_t nr_iowait;
410
411#ifdef CONFIG_SMP
412 struct root_domain *rd;
413 struct sched_domain *sd;
414
415 unsigned long cpu_power;
416
417 unsigned char idle_balance;
418 /* For active balancing */
419 int post_schedule;
420 int active_balance;
421 int push_cpu;
422 struct cpu_stop_work active_balance_work;
423 /* cpu of this runqueue: */
424 int cpu;
425 int online;
426
427 u64 rt_avg;
428 u64 age_stamp;
429 u64 idle_stamp;
430 u64 avg_idle;
431#endif
432
433#ifdef CONFIG_IRQ_TIME_ACCOUNTING
434 u64 prev_irq_time;
435#endif
436#ifdef CONFIG_PARAVIRT
437 u64 prev_steal_time;
438#endif
439#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
440 u64 prev_steal_time_rq;
441#endif
442
443 /* calc_load related fields */
444 unsigned long calc_load_update;
445 long calc_load_active;
446
447#ifdef CONFIG_SCHED_HRTICK
448#ifdef CONFIG_SMP
449 int hrtick_csd_pending;
450 struct call_single_data hrtick_csd;
451#endif
452 struct hrtimer hrtick_timer;
453#endif
454
455#ifdef CONFIG_SCHEDSTATS
456 /* latency stats */
457 struct sched_info rq_sched_info;
458 unsigned long long rq_cpu_time;
459 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
460
461 /* sys_sched_yield() stats */
462 unsigned int yld_count;
463
464 /* schedule() stats */
465 unsigned int sched_switch;
466 unsigned int sched_count;
467 unsigned int sched_goidle;
468
469 /* try_to_wake_up() stats */
470 unsigned int ttwu_count;
471 unsigned int ttwu_local;
472#endif
473
474#ifdef CONFIG_SMP
475 struct llist_head wake_list;
476#endif
477};
478
479static inline int cpu_of(struct rq *rq)
480{
481#ifdef CONFIG_SMP
482 return rq->cpu;
483#else
484 return 0;
485#endif
486}
487
488DECLARE_PER_CPU(struct rq, runqueues);
489
490#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
491#define this_rq() (&__get_cpu_var(runqueues))
492#define task_rq(p) cpu_rq(task_cpu(p))
493#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
494#define raw_rq() (&__raw_get_cpu_var(runqueues))
495
496#ifdef CONFIG_SMP
497
498#define rcu_dereference_check_sched_domain(p) \
499 rcu_dereference_check((p), \
500 lockdep_is_held(&sched_domains_mutex))
501
502/*
503 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
504 * See detach_destroy_domains: synchronize_sched for details.
505 *
506 * The domain tree of any CPU may only be accessed from within
507 * preempt-disabled sections.
508 */
509#define for_each_domain(cpu, __sd) \
510 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
511 __sd; __sd = __sd->parent)
512
513#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
514
515/**
516 * highest_flag_domain - Return highest sched_domain containing flag.
517 * @cpu: The cpu whose highest level of sched domain is to
518 * be returned.
519 * @flag: The flag to check for the highest sched_domain
520 * for the given cpu.
521 *
522 * Returns the highest sched_domain of a cpu which contains the given flag.
523 */
524static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
525{
526 struct sched_domain *sd, *hsd = NULL;
527
528 for_each_domain(cpu, sd) {
529 if (!(sd->flags & flag))
530 break;
531 hsd = sd;
532 }
533
534 return hsd;
535}
536
537DECLARE_PER_CPU(struct sched_domain *, sd_llc);
538DECLARE_PER_CPU(int, sd_llc_id);
539
540#endif /* CONFIG_SMP */
541
542#include "stats.h"
543#include "auto_group.h"
544
545#ifdef CONFIG_CGROUP_SCHED
546
547/*
 548 * Return the group to which this task belongs.
549 *
550 * We use task_subsys_state_check() and extend the RCU verification with
551 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
552 * task it moves into the cgroup. Therefore by holding either of those locks,
553 * we pin the task to the current cgroup.
554 */
555static inline struct task_group *task_group(struct task_struct *p)
556{
557 struct task_group *tg;
558 struct cgroup_subsys_state *css;
559
560 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
561 lockdep_is_held(&p->pi_lock) ||
562 lockdep_is_held(&task_rq(p)->lock));
563 tg = container_of(css, struct task_group, css);
564
565 return autogroup_task_group(p, tg);
566}
567
568/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
569static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
570{
571#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
572 struct task_group *tg = task_group(p);
573#endif
574
575#ifdef CONFIG_FAIR_GROUP_SCHED
576 p->se.cfs_rq = tg->cfs_rq[cpu];
577 p->se.parent = tg->se[cpu];
578#endif
579
580#ifdef CONFIG_RT_GROUP_SCHED
581 p->rt.rt_rq = tg->rt_rq[cpu];
582 p->rt.parent = tg->rt_se[cpu];
583#endif
584}
585
586#else /* CONFIG_CGROUP_SCHED */
587
588static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
589static inline struct task_group *task_group(struct task_struct *p)
590{
591 return NULL;
592}
593
594#endif /* CONFIG_CGROUP_SCHED */
595
596static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
597{
598 set_task_rq(p, cpu);
599#ifdef CONFIG_SMP
600 /*
601 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
 602 * successfully executed on another CPU. We must ensure that updates of
603 * per-task data have been completed by this moment.
604 */
605 smp_wmb();
606 task_thread_info(p)->cpu = cpu;
607#endif
608}
609
610/*
611 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
612 */
613#ifdef CONFIG_SCHED_DEBUG
614# include <linux/jump_label.h>
615# define const_debug __read_mostly
616#else
617# define const_debug const
618#endif
619
620extern const_debug unsigned int sysctl_sched_features;
621
622#define SCHED_FEAT(name, enabled) \
623 __SCHED_FEAT_##name ,
624
625enum {
626#include "features.h"
627 __SCHED_FEAT_NR,
628};
629
630#undef SCHED_FEAT
631
632#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
633static __always_inline bool static_branch__true(struct jump_label_key *key)
634{
635 return likely(static_branch(key)); /* Not out of line branch. */
636}
637
638static __always_inline bool static_branch__false(struct jump_label_key *key)
639{
640 return unlikely(static_branch(key)); /* Out of line branch. */
641}
642
643#define SCHED_FEAT(name, enabled) \
644static __always_inline bool static_branch_##name(struct jump_label_key *key) \
645{ \
646 return static_branch__##enabled(key); \
647}
648
649#include "features.h"
650
651#undef SCHED_FEAT
652
653extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR];
654#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
655#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
656#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
657#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
658
659static inline u64 global_rt_period(void)
660{
661 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
662}
663
664static inline u64 global_rt_runtime(void)
665{
666 if (sysctl_sched_rt_runtime < 0)
667 return RUNTIME_INF;
668
669 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
670}
671
672
673
674static inline int task_current(struct rq *rq, struct task_struct *p)
675{
676 return rq->curr == p;
677}
678
679static inline int task_running(struct rq *rq, struct task_struct *p)
680{
681#ifdef CONFIG_SMP
682 return p->on_cpu;
683#else
684 return task_current(rq, p);
685#endif
686}
687
688
689#ifndef prepare_arch_switch
690# define prepare_arch_switch(next) do { } while (0)
691#endif
692#ifndef finish_arch_switch
693# define finish_arch_switch(prev) do { } while (0)
694#endif
695
696#ifndef __ARCH_WANT_UNLOCKED_CTXSW
697static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
698{
699#ifdef CONFIG_SMP
700 /*
701 * We can optimise this out completely for !SMP, because the
702 * SMP rebalancing from interrupt is the only thing that cares
703 * here.
704 */
705 next->on_cpu = 1;
706#endif
707}
708
709static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
710{
711#ifdef CONFIG_SMP
712 /*
713 * After ->on_cpu is cleared, the task can be moved to a different CPU.
714 * We must ensure this doesn't happen until the switch is completely
715 * finished.
716 */
717 smp_wmb();
718 prev->on_cpu = 0;
719#endif
720#ifdef CONFIG_DEBUG_SPINLOCK
721 /* this is a valid case when another task releases the spinlock */
722 rq->lock.owner = current;
723#endif
724 /*
725 * If we are tracking spinlock dependencies then we have to
726 * fix up the runqueue lock - which gets 'carried over' from
727 * prev into current:
728 */
729 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
730
731 raw_spin_unlock_irq(&rq->lock);
732}
733
734#else /* __ARCH_WANT_UNLOCKED_CTXSW */
735static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
736{
737#ifdef CONFIG_SMP
738 /*
739 * We can optimise this out completely for !SMP, because the
740 * SMP rebalancing from interrupt is the only thing that cares
741 * here.
742 */
743 next->on_cpu = 1;
744#endif
745#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
746 raw_spin_unlock_irq(&rq->lock);
747#else
748 raw_spin_unlock(&rq->lock);
749#endif
750}
751
752static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
753{
754#ifdef CONFIG_SMP
755 /*
756 * After ->on_cpu is cleared, the task can be moved to a different CPU.
757 * We must ensure this doesn't happen until the switch is completely
758 * finished.
759 */
760 smp_wmb();
761 prev->on_cpu = 0;
762#endif
763#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
764 local_irq_enable();
765#endif
766}
767#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
768
769
770static inline void update_load_add(struct load_weight *lw, unsigned long inc)
771{
772 lw->weight += inc;
773 lw->inv_weight = 0;
774}
775
776static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
777{
778 lw->weight -= dec;
779 lw->inv_weight = 0;
780}
781
782static inline void update_load_set(struct load_weight *lw, unsigned long w)
783{
784 lw->weight = w;
785 lw->inv_weight = 0;
786}
787
788/*
789 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 790 * of tasks with abnormal "nice" values across CPUs, the contribution that
791 * each task makes to its run queue's load is weighted according to its
792 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
793 * scaled version of the new time slice allocation that they receive on time
794 * slice expiry etc.
795 */
796
797#define WEIGHT_IDLEPRIO 3
798#define WMULT_IDLEPRIO 1431655765
799
800/*
801 * Nice levels are multiplicative, with a gentle 10% change for every
802 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
803 * nice 1, it will get ~10% less CPU time than another CPU-bound task
804 * that remained on nice 0.
805 *
806 * The "10% effect" is relative and cumulative: from _any_ nice level,
807 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
808 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
809 * If a task goes up by ~10% and another task goes down by ~10% then
810 * the relative distance between them is ~25%.)
811 */
812static const int prio_to_weight[40] = {
813 /* -20 */ 88761, 71755, 56483, 46273, 36291,
814 /* -15 */ 29154, 23254, 18705, 14949, 11916,
815 /* -10 */ 9548, 7620, 6100, 4904, 3906,
816 /* -5 */ 3121, 2501, 1991, 1586, 1277,
817 /* 0 */ 1024, 820, 655, 526, 423,
818 /* 5 */ 335, 272, 215, 172, 137,
819 /* 10 */ 110, 87, 70, 56, 45,
820 /* 15 */ 36, 29, 23, 18, 15,
821};
822
823/*
824 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
825 *
826 * In cases where the weight does not change often, we can use the
827 * precalculated inverse to speed up arithmetics by turning divisions
828 * into multiplications:
829 */
830static const u32 prio_to_wmult[40] = {
831 /* -20 */ 48388, 59856, 76040, 92818, 118348,
832 /* -15 */ 147320, 184698, 229616, 287308, 360437,
833 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
834 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
835 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
836 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
837 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
838 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
839};
840
841/* Time spent by the tasks of the cpu accounting group executing in ... */
842enum cpuacct_stat_index {
843 CPUACCT_STAT_USER, /* ... user mode */
844 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
845
846 CPUACCT_STAT_NSTATS,
847};
848
849
850#define sched_class_highest (&stop_sched_class)
851#define for_each_class(class) \
852 for (class = sched_class_highest; class; class = class->next)
853
854extern const struct sched_class stop_sched_class;
855extern const struct sched_class rt_sched_class;
856extern const struct sched_class fair_sched_class;
857extern const struct sched_class idle_sched_class;
858
859
860#ifdef CONFIG_SMP
861
862extern void trigger_load_balance(struct rq *rq, int cpu);
863extern void idle_balance(int this_cpu, struct rq *this_rq);
864
865#else /* CONFIG_SMP */
866
867static inline void idle_balance(int cpu, struct rq *rq)
868{
869}
870
871#endif
872
873extern void sysrq_sched_debug_show(void);
874extern void sched_init_granularity(void);
875extern void update_max_interval(void);
876extern void update_group_power(struct sched_domain *sd, int cpu);
877extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
878extern void init_sched_rt_class(void);
879extern void init_sched_fair_class(void);
880
881extern void resched_task(struct task_struct *p);
882extern void resched_cpu(int cpu);
883
884extern struct rt_bandwidth def_rt_bandwidth;
885extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
886
887extern void update_cpu_load(struct rq *this_rq);
888
889#ifdef CONFIG_CGROUP_CPUACCT
890#include <linux/cgroup.h>
891/* track cpu usage of a group of tasks and its child groups */
892struct cpuacct {
893 struct cgroup_subsys_state css;
894 /* cpuusage holds pointer to a u64-type object on every cpu */
895 u64 __percpu *cpuusage;
896 struct kernel_cpustat __percpu *cpustat;
897};
898
899/* return cpu accounting group corresponding to this container */
900static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
901{
902 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
903 struct cpuacct, css);
904}
905
906/* return cpu accounting group to which this task belongs */
907static inline struct cpuacct *task_ca(struct task_struct *tsk)
908{
909 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
910 struct cpuacct, css);
911}
912
913static inline struct cpuacct *parent_ca(struct cpuacct *ca)
914{
915 if (!ca || !ca->css.cgroup->parent)
916 return NULL;
917 return cgroup_ca(ca->css.cgroup->parent);
918}
919
920extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
921#else
922static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
923#endif
924
925static inline void inc_nr_running(struct rq *rq)
926{
927 rq->nr_running++;
928}
929
930static inline void dec_nr_running(struct rq *rq)
931{
932 rq->nr_running--;
933}
934
935extern void update_rq_clock(struct rq *rq);
936
937extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
938extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
939
940extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
941
942extern const_debug unsigned int sysctl_sched_time_avg;
943extern const_debug unsigned int sysctl_sched_nr_migrate;
944extern const_debug unsigned int sysctl_sched_migration_cost;
945
946static inline u64 sched_avg_period(void)
947{
948 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
949}
950
951void calc_load_account_idle(struct rq *this_rq);
952
953#ifdef CONFIG_SCHED_HRTICK
954
955/*
956 * Use hrtick when:
957 * - enabled by features
958 * - hrtimer is actually high res
959 */
960static inline int hrtick_enabled(struct rq *rq)
961{
962 if (!sched_feat(HRTICK))
963 return 0;
964 if (!cpu_active(cpu_of(rq)))
965 return 0;
966 return hrtimer_is_hres_active(&rq->hrtick_timer);
967}
968
969void hrtick_start(struct rq *rq, u64 delay);
970
971#else
972
973static inline int hrtick_enabled(struct rq *rq)
974{
975 return 0;
976}
977
978#endif /* CONFIG_SCHED_HRTICK */
979
980#ifdef CONFIG_SMP
981extern void sched_avg_update(struct rq *rq);
982static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
983{
984 rq->rt_avg += rt_delta;
985 sched_avg_update(rq);
986}
987#else
988static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
989static inline void sched_avg_update(struct rq *rq) { }
990#endif
991
992extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
993
994#ifdef CONFIG_SMP
995#ifdef CONFIG_PREEMPT
996
997static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
998
999/*
1000 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1001 * way at the expense of forcing extra atomic operations in all
1002 * invocations. This assures that the double_lock is acquired using the
1003 * same underlying policy as the spinlock_t on this architecture, which
1004 * reduces latency compared to the unfair variant below. However, it
1005 * also adds more overhead and therefore may reduce throughput.
1006 */
1007static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1008 __releases(this_rq->lock)
1009 __acquires(busiest->lock)
1010 __acquires(this_rq->lock)
1011{
1012 raw_spin_unlock(&this_rq->lock);
1013 double_rq_lock(this_rq, busiest);
1014
1015 return 1;
1016}
1017
1018#else
1019/*
1020 * Unfair double_lock_balance: Optimizes throughput at the expense of
1021 * latency by eliminating extra atomic operations when the locks are
1022 * already in proper order on entry. This favors lower cpu-ids and will
1023 * grant the double lock to lower cpus over higher ids under contention,
1024 * regardless of entry order into the function.
1025 */
1026static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1027 __releases(this_rq->lock)
1028 __acquires(busiest->lock)
1029 __acquires(this_rq->lock)
1030{
1031 int ret = 0;
1032
1033 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1034 if (busiest < this_rq) {
1035 raw_spin_unlock(&this_rq->lock);
1036 raw_spin_lock(&busiest->lock);
1037 raw_spin_lock_nested(&this_rq->lock,
1038 SINGLE_DEPTH_NESTING);
1039 ret = 1;
1040 } else
1041 raw_spin_lock_nested(&busiest->lock,
1042 SINGLE_DEPTH_NESTING);
1043 }
1044 return ret;
1045}
1046
1047#endif /* CONFIG_PREEMPT */
1048
1049/*
1050 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1051 */
1052static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1053{
1054 if (unlikely(!irqs_disabled())) {
1055		/* printk() doesn't work well under rq->lock */
1056 raw_spin_unlock(&this_rq->lock);
1057 BUG_ON(1);
1058 }
1059
1060 return _double_lock_balance(this_rq, busiest);
1061}
1062
1063static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1064 __releases(busiest->lock)
1065{
1066 raw_spin_unlock(&busiest->lock);
1067 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1068}
1069
1070/*
1071 * double_rq_lock - safely lock two runqueues
1072 *
1073 * Note this does not disable interrupts like task_rq_lock,
1074 * you need to do so manually before calling.
1075 */
1076static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1077 __acquires(rq1->lock)
1078 __acquires(rq2->lock)
1079{
1080 BUG_ON(!irqs_disabled());
1081 if (rq1 == rq2) {
1082 raw_spin_lock(&rq1->lock);
1083 __acquire(rq2->lock); /* Fake it out ;) */
1084 } else {
1085 if (rq1 < rq2) {
1086 raw_spin_lock(&rq1->lock);
1087 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1088 } else {
1089 raw_spin_lock(&rq2->lock);
1090 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1091 }
1092 }
1093}
1094
1095/*
1096 * double_rq_unlock - safely unlock two runqueues
1097 *
1098 * Note this does not restore interrupts like task_rq_unlock,
1099 * you need to do so manually after calling.
1100 */
1101static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1102 __releases(rq1->lock)
1103 __releases(rq2->lock)
1104{
1105 raw_spin_unlock(&rq1->lock);
1106 if (rq1 != rq2)
1107 raw_spin_unlock(&rq2->lock);
1108 else
1109 __release(rq2->lock);
1110}
1111
1112#else /* CONFIG_SMP */
1113
1114/*
1115 * double_rq_lock - safely lock two runqueues
1116 *
1117 * Note this does not disable interrupts like task_rq_lock,
1118 * you need to do so manually before calling.
1119 */
1120static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1121 __acquires(rq1->lock)
1122 __acquires(rq2->lock)
1123{
1124 BUG_ON(!irqs_disabled());
1125 BUG_ON(rq1 != rq2);
1126 raw_spin_lock(&rq1->lock);
1127 __acquire(rq2->lock); /* Fake it out ;) */
1128}
1129
1130/*
1131 * double_rq_unlock - safely unlock two runqueues
1132 *
1133 * Note this does not restore interrupts like task_rq_unlock,
1134 * you need to do so manually after calling.
1135 */
1136static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1137 __releases(rq1->lock)
1138 __releases(rq2->lock)
1139{
1140 BUG_ON(rq1 != rq2);
1141 raw_spin_unlock(&rq1->lock);
1142 __release(rq2->lock);
1143}
1144
1145#endif
1146
1147extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1148extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1149extern void print_cfs_stats(struct seq_file *m, int cpu);
1150extern void print_rt_stats(struct seq_file *m, int cpu);
1151
1152extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1153extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1154extern void unthrottle_offline_cfs_rqs(struct rq *rq);
1155
1156extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1157
1158#ifdef CONFIG_NO_HZ
1159enum rq_nohz_flag_bits {
1160 NOHZ_TICK_STOPPED,
1161 NOHZ_BALANCE_KICK,
1162 NOHZ_IDLE,
1163};
1164
1165#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1166#endif
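
The prio_to_weight[] table in the header above encodes the "10% effect" described in the nice-level comment: adjacent nice levels differ by roughly a factor of 1.25. A minimal standalone userspace sketch (illustrative only, not part of the diff; the table values are simply copied from the header) that prints those ratios:

#include <stdio.h>

static const int prio_to_weight[40] = {
	/* -20 */ 88761, 71755, 56483, 46273, 36291,
	/* -15 */ 29154, 23254, 18705, 14949, 11916,
	/* -10 */  9548,  7620,  6100,  4904,  3906,
	/*  -5 */  3121,  2501,  1991,  1586,  1277,
	/*   0 */  1024,   820,   655,   526,   423,
	/*   5 */   335,   272,   215,   172,   137,
	/*  10 */   110,    87,    70,    56,    45,
	/*  15 */    36,    29,    23,    18,    15,
};

int main(void)
{
	int nice;

	for (nice = -20; nice < 19; nice++) {
		int idx = nice + 20;	/* same offset NICE_TO_PRIO() uses */

		printf("nice %3d -> weight %6d, ratio to nice %3d: %.2f\n",
		       nice, prio_to_weight[idx], nice + 1,
		       (double)prio_to_weight[idx] / prio_to_weight[idx + 1]);
	}
	return 0;
}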
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
new file mode 100644
index 000000000000..2a581ba8e190
--- /dev/null
+++ b/kernel/sched/stats.c
@@ -0,0 +1,111 @@
1
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/seq_file.h>
5#include <linux/proc_fs.h>
6
7#include "sched.h"
8
9/*
10 * bump this up when changing the output format or the meaning of an existing
11 * format, so that tools can adapt (or abort)
12 */
13#define SCHEDSTAT_VERSION 15
14
15static int show_schedstat(struct seq_file *seq, void *v)
16{
17 int cpu;
18 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
19 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
20
21 if (mask_str == NULL)
22 return -ENOMEM;
23
24 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
25 seq_printf(seq, "timestamp %lu\n", jiffies);
26 for_each_online_cpu(cpu) {
27 struct rq *rq = cpu_rq(cpu);
28#ifdef CONFIG_SMP
29 struct sched_domain *sd;
30 int dcount = 0;
31#endif
32
33 /* runqueue-specific stats */
34 seq_printf(seq,
35 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
36 cpu, rq->yld_count,
37 rq->sched_switch, rq->sched_count, rq->sched_goidle,
38 rq->ttwu_count, rq->ttwu_local,
39 rq->rq_cpu_time,
40 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
41
42 seq_printf(seq, "\n");
43
44#ifdef CONFIG_SMP
45 /* domain-specific stats */
46 rcu_read_lock();
47 for_each_domain(cpu, sd) {
48 enum cpu_idle_type itype;
49
50 cpumask_scnprintf(mask_str, mask_len,
51 sched_domain_span(sd));
52 seq_printf(seq, "domain%d %s", dcount++, mask_str);
53 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
54 itype++) {
55 seq_printf(seq, " %u %u %u %u %u %u %u %u",
56 sd->lb_count[itype],
57 sd->lb_balanced[itype],
58 sd->lb_failed[itype],
59 sd->lb_imbalance[itype],
60 sd->lb_gained[itype],
61 sd->lb_hot_gained[itype],
62 sd->lb_nobusyq[itype],
63 sd->lb_nobusyg[itype]);
64 }
65 seq_printf(seq,
66 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
67 sd->alb_count, sd->alb_failed, sd->alb_pushed,
68 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
69 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
70 sd->ttwu_wake_remote, sd->ttwu_move_affine,
71 sd->ttwu_move_balance);
72 }
73 rcu_read_unlock();
74#endif
75 }
76 kfree(mask_str);
77 return 0;
78}
79
80static int schedstat_open(struct inode *inode, struct file *file)
81{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86
87 if (!buf)
88 return -ENOMEM;
89 res = single_open(file, show_schedstat, NULL);
90 if (!res) {
91 m = file->private_data;
92 m->buf = buf;
93 m->size = size;
94 } else
95 kfree(buf);
96 return res;
97}
98
99static const struct file_operations proc_schedstat_operations = {
100 .open = schedstat_open,
101 .read = seq_read,
102 .llseek = seq_lseek,
103 .release = single_release,
104};
105
106static int __init proc_schedstat_init(void)
107{
108 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
109 return 0;
110}
111module_init(proc_schedstat_init);
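
A short userspace sketch (not part of the diff) of the kind of tool the SCHEDSTAT_VERSION comment has in mind: it checks the version line first and aborts on an unknown format instead of guessing at field meanings.

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[4096];
	int version = 0;
	FILE *f = fopen("/proc/schedstat", "r");

	if (!f) {
		perror("/proc/schedstat");
		return 1;
	}
	/* First line is "version %d"; bail out on anything we don't know. */
	if (!fgets(line, sizeof(line), f) ||
	    sscanf(line, "version %d", &version) != 1 || version != 15) {
		fprintf(stderr, "unknown schedstat format\n");
		fclose(f);
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "cpu", 3))
			fputs(line, stdout);	/* per-runqueue counters */
	}
	fclose(f);
	return 0;
}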
diff --git a/kernel/sched_stats.h b/kernel/sched/stats.h
index 331e01bcd026..2ef90a51ec5e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched/stats.h
@@ -1,108 +1,5 @@
1 1
2#ifdef CONFIG_SCHEDSTATS 2#ifdef CONFIG_SCHEDSTATS
3/*
4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort)
6 */
7#define SCHEDSTAT_VERSION 15
8
9static int show_schedstat(struct seq_file *seq, void *v)
10{
11 int cpu;
12 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
13 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
14
15 if (mask_str == NULL)
16 return -ENOMEM;
17
18 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
19 seq_printf(seq, "timestamp %lu\n", jiffies);
20 for_each_online_cpu(cpu) {
21 struct rq *rq = cpu_rq(cpu);
22#ifdef CONFIG_SMP
23 struct sched_domain *sd;
24 int dcount = 0;
25#endif
26
27 /* runqueue-specific stats */
28 seq_printf(seq,
29 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
30 cpu, rq->yld_count,
31 rq->sched_switch, rq->sched_count, rq->sched_goidle,
32 rq->ttwu_count, rq->ttwu_local,
33 rq->rq_cpu_time,
34 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
35
36 seq_printf(seq, "\n");
37
38#ifdef CONFIG_SMP
39 /* domain-specific stats */
40 rcu_read_lock();
41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype;
43
44 cpumask_scnprintf(mask_str, mask_len,
45 sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) {
49 seq_printf(seq, " %u %u %u %u %u %u %u %u",
50 sd->lb_count[itype],
51 sd->lb_balanced[itype],
52 sd->lb_failed[itype],
53 sd->lb_imbalance[itype],
54 sd->lb_gained[itype],
55 sd->lb_hot_gained[itype],
56 sd->lb_nobusyq[itype],
57 sd->lb_nobusyg[itype]);
58 }
59 seq_printf(seq,
60 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
61 sd->alb_count, sd->alb_failed, sd->alb_pushed,
62 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
63 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance);
66 }
67 rcu_read_unlock();
68#endif
69 }
70 kfree(mask_str);
71 return 0;
72}
73
74static int schedstat_open(struct inode *inode, struct file *file)
75{
76 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
77 char *buf = kmalloc(size, GFP_KERNEL);
78 struct seq_file *m;
79 int res;
80
81 if (!buf)
82 return -ENOMEM;
83 res = single_open(file, show_schedstat, NULL);
84 if (!res) {
85 m = file->private_data;
86 m->buf = buf;
87 m->size = size;
88 } else
89 kfree(buf);
90 return res;
91}
92
93static const struct file_operations proc_schedstat_operations = {
94 .open = schedstat_open,
95 .read = seq_read,
96 .llseek = seq_lseek,
97 .release = single_release,
98};
99
100static int __init proc_schedstat_init(void)
101{
102 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
103 return 0;
104}
105module_init(proc_schedstat_init);
106 3
107/* 4/*
108 * Expects runqueue lock to be held for atomicity of update 5 * Expects runqueue lock to be held for atomicity of update
@@ -282,10 +179,9 @@ static inline void account_group_user_time(struct task_struct *tsk,
282 if (!cputimer->running) 179 if (!cputimer->running)
283 return; 180 return;
284 181
285 spin_lock(&cputimer->lock); 182 raw_spin_lock(&cputimer->lock);
286 cputimer->cputime.utime = 183 cputimer->cputime.utime += cputime;
287 cputime_add(cputimer->cputime.utime, cputime); 184 raw_spin_unlock(&cputimer->lock);
288 spin_unlock(&cputimer->lock);
289} 185}
290 186
291/** 187/**
@@ -306,10 +202,9 @@ static inline void account_group_system_time(struct task_struct *tsk,
306 if (!cputimer->running) 202 if (!cputimer->running)
307 return; 203 return;
308 204
309 spin_lock(&cputimer->lock); 205 raw_spin_lock(&cputimer->lock);
310 cputimer->cputime.stime = 206 cputimer->cputime.stime += cputime;
311 cputime_add(cputimer->cputime.stime, cputime); 207 raw_spin_unlock(&cputimer->lock);
312 spin_unlock(&cputimer->lock);
313} 208}
314 209
315/** 210/**
@@ -330,7 +225,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
330 if (!cputimer->running) 225 if (!cputimer->running)
331 return; 226 return;
332 227
333 spin_lock(&cputimer->lock); 228 raw_spin_lock(&cputimer->lock);
334 cputimer->cputime.sum_exec_runtime += ns; 229 cputimer->cputime.sum_exec_runtime += ns;
335 spin_unlock(&cputimer->lock); 230 raw_spin_unlock(&cputimer->lock);
336} 231}
diff --git a/kernel/sched_stoptask.c b/kernel/sched/stop_task.c
index 6f437632afab..7b386e86fd23 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched/stop_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * stop-task scheduling class. 4 * stop-task scheduling class.
3 * 5 *
@@ -34,11 +36,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
34static void 36static void
35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 37enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
36{ 38{
39 inc_nr_running(rq);
37} 40}
38 41
39static void 42static void
40dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 43dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
41{ 44{
45 dec_nr_running(rq);
42} 46}
43 47
44static void yield_task_stop(struct rq *rq) 48static void yield_task_stop(struct rq *rq)
@@ -78,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
78/* 82/*
79 * Simple, special scheduling class for the per-CPU stop tasks: 83 * Simple, special scheduling class for the per-CPU stop tasks:
80 */ 84 */
81static const struct sched_class stop_sched_class = { 85const struct sched_class stop_sched_class = {
82 .next = &rt_sched_class, 86 .next = &rt_sched_class,
83 87
84 .enqueue_task = enqueue_task_stop, 88 .enqueue_task = enqueue_task_stop,
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 94a62c0d4ade..60636a4e25c3 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -27,7 +27,7 @@
27 27
28#include <linux/compiler.h> 28#include <linux/compiler.h>
29#include <linux/kernel.h> 29#include <linux/kernel.h>
30#include <linux/module.h> 30#include <linux/export.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/semaphore.h> 32#include <linux/semaphore.h>
33#include <linux/spinlock.h> 33#include <linux/spinlock.h>
@@ -54,12 +54,12 @@ void down(struct semaphore *sem)
54{ 54{
55 unsigned long flags; 55 unsigned long flags;
56 56
57 spin_lock_irqsave(&sem->lock, flags); 57 raw_spin_lock_irqsave(&sem->lock, flags);
58 if (likely(sem->count > 0)) 58 if (likely(sem->count > 0))
59 sem->count--; 59 sem->count--;
60 else 60 else
61 __down(sem); 61 __down(sem);
62 spin_unlock_irqrestore(&sem->lock, flags); 62 raw_spin_unlock_irqrestore(&sem->lock, flags);
63} 63}
64EXPORT_SYMBOL(down); 64EXPORT_SYMBOL(down);
65 65
@@ -77,12 +77,12 @@ int down_interruptible(struct semaphore *sem)
77 unsigned long flags; 77 unsigned long flags;
78 int result = 0; 78 int result = 0;
79 79
80 spin_lock_irqsave(&sem->lock, flags); 80 raw_spin_lock_irqsave(&sem->lock, flags);
81 if (likely(sem->count > 0)) 81 if (likely(sem->count > 0))
82 sem->count--; 82 sem->count--;
83 else 83 else
84 result = __down_interruptible(sem); 84 result = __down_interruptible(sem);
85 spin_unlock_irqrestore(&sem->lock, flags); 85 raw_spin_unlock_irqrestore(&sem->lock, flags);
86 86
87 return result; 87 return result;
88} 88}
@@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem)
103 unsigned long flags; 103 unsigned long flags;
104 int result = 0; 104 int result = 0;
105 105
106 spin_lock_irqsave(&sem->lock, flags); 106 raw_spin_lock_irqsave(&sem->lock, flags);
107 if (likely(sem->count > 0)) 107 if (likely(sem->count > 0))
108 sem->count--; 108 sem->count--;
109 else 109 else
110 result = __down_killable(sem); 110 result = __down_killable(sem);
111 spin_unlock_irqrestore(&sem->lock, flags); 111 raw_spin_unlock_irqrestore(&sem->lock, flags);
112 112
113 return result; 113 return result;
114} 114}
@@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem)
132 unsigned long flags; 132 unsigned long flags;
133 int count; 133 int count;
134 134
135 spin_lock_irqsave(&sem->lock, flags); 135 raw_spin_lock_irqsave(&sem->lock, flags);
136 count = sem->count - 1; 136 count = sem->count - 1;
137 if (likely(count >= 0)) 137 if (likely(count >= 0))
138 sem->count = count; 138 sem->count = count;
139 spin_unlock_irqrestore(&sem->lock, flags); 139 raw_spin_unlock_irqrestore(&sem->lock, flags);
140 140
141 return (count < 0); 141 return (count < 0);
142} 142}
@@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, long jiffies)
157 unsigned long flags; 157 unsigned long flags;
158 int result = 0; 158 int result = 0;
159 159
160 spin_lock_irqsave(&sem->lock, flags); 160 raw_spin_lock_irqsave(&sem->lock, flags);
161 if (likely(sem->count > 0)) 161 if (likely(sem->count > 0))
162 sem->count--; 162 sem->count--;
163 else 163 else
164 result = __down_timeout(sem, jiffies); 164 result = __down_timeout(sem, jiffies);
165 spin_unlock_irqrestore(&sem->lock, flags); 165 raw_spin_unlock_irqrestore(&sem->lock, flags);
166 166
167 return result; 167 return result;
168} 168}
@@ -179,12 +179,12 @@ void up(struct semaphore *sem)
179{ 179{
180 unsigned long flags; 180 unsigned long flags;
181 181
182 spin_lock_irqsave(&sem->lock, flags); 182 raw_spin_lock_irqsave(&sem->lock, flags);
183 if (likely(list_empty(&sem->wait_list))) 183 if (likely(list_empty(&sem->wait_list)))
184 sem->count++; 184 sem->count++;
185 else 185 else
186 __up(sem); 186 __up(sem);
187 spin_unlock_irqrestore(&sem->lock, flags); 187 raw_spin_unlock_irqrestore(&sem->lock, flags);
188} 188}
189EXPORT_SYMBOL(up); 189EXPORT_SYMBOL(up);
190 190
@@ -217,9 +217,9 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
217 if (timeout <= 0) 217 if (timeout <= 0)
218 goto timed_out; 218 goto timed_out;
219 __set_task_state(task, state); 219 __set_task_state(task, state);
220 spin_unlock_irq(&sem->lock); 220 raw_spin_unlock_irq(&sem->lock);
221 timeout = schedule_timeout(timeout); 221 timeout = schedule_timeout(timeout);
222 spin_lock_irq(&sem->lock); 222 raw_spin_lock_irq(&sem->lock);
223 if (waiter.up) 223 if (waiter.up)
224 return 0; 224 return 0;
225 } 225 }
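
For context on the semaphore.c hunks above: they only swap the internal lock for raw_spin_lock_irqsave()/raw_spin_unlock_irqrestore(); the public semaphore API is unchanged. A minimal kernel-side usage sketch (illustrative only, assuming a kernel build environment; demo_sem and demo_do_work are made-up names):

#include <linux/semaphore.h>
#include <linux/errno.h>

static DEFINE_SEMAPHORE(demo_sem);	/* counting semaphore, count = 1 */

static int demo_do_work(void)
{
	/* May sleep; returns -EINTR if interrupted by a signal. */
	if (down_interruptible(&demo_sem))
		return -EINTR;

	/* ... serialized work here ... */

	up(&demo_sem);
	return 0;
}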
diff --git a/kernel/signal.c b/kernel/signal.c
index 291c9700be75..c73c4284160e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -11,7 +11,7 @@
11 */ 11 */
12 12
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
@@ -28,6 +28,7 @@
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
30#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
31#include <linux/user_namespace.h>
31#define CREATE_TRACE_POINTS 32#define CREATE_TRACE_POINTS
32#include <trace/events/signal.h> 33#include <trace/events/signal.h>
33 34
@@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
1019 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); 1020 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
1020} 1021}
1021 1022
1023/*
1024 * map the uid in struct cred into user namespace *ns
1025 */
1026static inline uid_t map_cred_ns(const struct cred *cred,
1027 struct user_namespace *ns)
1028{
1029 return user_ns_map_uid(ns, cred, cred->uid);
1030}
1031
1032#ifdef CONFIG_USER_NS
1033static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1034{
1035 if (current_user_ns() == task_cred_xxx(t, user_ns))
1036 return;
1037
1038 if (SI_FROMKERNEL(info))
1039 return;
1040
1041 info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
1042 current_cred(), info->si_uid);
1043}
1044#else
1045static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1046{
1047 return;
1048}
1049#endif
1050
1022static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, 1051static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1023 int group, int from_ancestor_ns) 1052 int group, int from_ancestor_ns)
1024{ 1053{
@@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1088 q->info.si_pid = 0; 1117 q->info.si_pid = 0;
1089 break; 1118 break;
1090 } 1119 }
1120
1121 userns_fixup_signal_uid(&q->info, t);
1122
1091 } else if (!is_si_special(info)) { 1123 } else if (!is_si_special(info)) {
1092 if (sig >= SIGRTMIN && info->si_code != SI_USER) { 1124 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
1093 /* 1125 /*
@@ -1344,13 +1376,24 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1344 return error; 1376 return error;
1345} 1377}
1346 1378
1379static int kill_as_cred_perm(const struct cred *cred,
1380 struct task_struct *target)
1381{
1382 const struct cred *pcred = __task_cred(target);
1383 if (cred->user_ns != pcred->user_ns)
1384 return 0;
1385 if (cred->euid != pcred->suid && cred->euid != pcred->uid &&
1386 cred->uid != pcred->suid && cred->uid != pcred->uid)
1387 return 0;
1388 return 1;
1389}
1390
1347/* like kill_pid_info(), but doesn't use uid/euid of "current" */ 1391/* like kill_pid_info(), but doesn't use uid/euid of "current" */
1348int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, 1392int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
1349 uid_t uid, uid_t euid, u32 secid) 1393 const struct cred *cred, u32 secid)
1350{ 1394{
1351 int ret = -EINVAL; 1395 int ret = -EINVAL;
1352 struct task_struct *p; 1396 struct task_struct *p;
1353 const struct cred *pcred;
1354 unsigned long flags; 1397 unsigned long flags;
1355 1398
1356 if (!valid_signal(sig)) 1399 if (!valid_signal(sig))
@@ -1362,10 +1405,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1362 ret = -ESRCH; 1405 ret = -ESRCH;
1363 goto out_unlock; 1406 goto out_unlock;
1364 } 1407 }
1365 pcred = __task_cred(p); 1408 if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) {
1366 if (si_fromuser(info) &&
1367 euid != pcred->suid && euid != pcred->uid &&
1368 uid != pcred->suid && uid != pcred->uid) {
1369 ret = -EPERM; 1409 ret = -EPERM;
1370 goto out_unlock; 1410 goto out_unlock;
1371 } 1411 }
@@ -1384,7 +1424,7 @@ out_unlock:
1384 rcu_read_unlock(); 1424 rcu_read_unlock();
1385 return ret; 1425 return ret;
1386} 1426}
1387EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); 1427EXPORT_SYMBOL_GPL(kill_pid_info_as_cred);
1388 1428
1389/* 1429/*
1390 * kill_something_info() interprets pid in interesting ways just like kill(2). 1430 * kill_something_info() interprets pid in interesting ways just like kill(2).
@@ -1618,13 +1658,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1618 */ 1658 */
1619 rcu_read_lock(); 1659 rcu_read_lock();
1620 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1660 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1621 info.si_uid = __task_cred(tsk)->uid; 1661 info.si_uid = map_cred_ns(__task_cred(tsk),
1662 task_cred_xxx(tsk->parent, user_ns));
1622 rcu_read_unlock(); 1663 rcu_read_unlock();
1623 1664
1624 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, 1665 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
1625 tsk->signal->utime)); 1666 info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime);
1626 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1627 tsk->signal->stime));
1628 1667
1629 info.si_status = tsk->exit_code & 0x7f; 1668 info.si_status = tsk->exit_code & 0x7f;
1630 if (tsk->exit_code & 0x80) 1669 if (tsk->exit_code & 0x80)
@@ -1703,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1703 */ 1742 */
1704 rcu_read_lock(); 1743 rcu_read_lock();
1705 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1744 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
1706 info.si_uid = __task_cred(tsk)->uid; 1745 info.si_uid = map_cred_ns(__task_cred(tsk),
1746 task_cred_xxx(parent, user_ns));
1707 rcu_read_unlock(); 1747 rcu_read_unlock();
1708 1748
1709 info.si_utime = cputime_to_clock_t(tsk->utime); 1749 info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -1986,8 +2026,6 @@ static bool do_signal_stop(int signr)
1986 */ 2026 */
1987 if (!(sig->flags & SIGNAL_STOP_STOPPED)) 2027 if (!(sig->flags & SIGNAL_STOP_STOPPED))
1988 sig->group_exit_code = signr; 2028 sig->group_exit_code = signr;
1989 else
1990 WARN_ON_ONCE(!current->ptrace);
1991 2029
1992 sig->group_stop_count = 0; 2030 sig->group_stop_count = 0;
1993 2031
@@ -2121,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info,
2121 info->si_signo = signr; 2159 info->si_signo = signr;
2122 info->si_errno = 0; 2160 info->si_errno = 0;
2123 info->si_code = SI_USER; 2161 info->si_code = SI_USER;
2162 rcu_read_lock();
2124 info->si_pid = task_pid_vnr(current->parent); 2163 info->si_pid = task_pid_vnr(current->parent);
2125 info->si_uid = task_uid(current->parent); 2164 info->si_uid = map_cred_ns(__task_cred(current->parent),
2165 current_user_ns());
2166 rcu_read_unlock();
2126 } 2167 }
2127 2168
2128 /* If the (new) signal is now blocked, requeue it. */ 2169 /* If the (new) signal is now blocked, requeue it. */
@@ -2314,6 +2355,27 @@ relock:
2314 return signr; 2355 return signr;
2315} 2356}
2316 2357
2358/**
2359 * block_sigmask - add @ka's signal mask to current->blocked
2360 * @ka: action for @signr
2361 * @signr: signal that has been successfully delivered
2362 *
2363 * This function should be called when a signal has successfully been
2364 * delivered. It adds the mask of signals for @ka to current->blocked
2365 * so that they are blocked during the execution of the signal
2366 * handler. In addition, @signr will be blocked unless %SA_NODEFER is
2367 * set in @ka->sa.sa_flags.
2368 */
2369void block_sigmask(struct k_sigaction *ka, int signr)
2370{
2371 sigset_t blocked;
2372
2373 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
2374 if (!(ka->sa.sa_flags & SA_NODEFER))
2375 sigaddset(&blocked, signr);
2376 set_current_blocked(&blocked);
2377}
2378
2317/* 2379/*
2318 * It could be that complete_signal() picked us to notify about the 2380 * It could be that complete_signal() picked us to notify about the
2319 * group-wide signal. Other threads should be notified now to take 2381 * group-wide signal. Other threads should be notified now to take
@@ -2351,8 +2413,15 @@ void exit_signals(struct task_struct *tsk)
2351 int group_stop = 0; 2413 int group_stop = 0;
2352 sigset_t unblocked; 2414 sigset_t unblocked;
2353 2415
2416 /*
2417 * @tsk is about to have PF_EXITING set - lock out users which
2418 * expect stable threadgroup.
2419 */
2420 threadgroup_change_begin(tsk);
2421
2354 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { 2422 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
2355 tsk->flags |= PF_EXITING; 2423 tsk->flags |= PF_EXITING;
2424 threadgroup_change_end(tsk);
2356 return; 2425 return;
2357 } 2426 }
2358 2427
@@ -2362,6 +2431,9 @@ void exit_signals(struct task_struct *tsk)
2362 * see wants_signal(), do_signal_stop(). 2431 * see wants_signal(), do_signal_stop().
2363 */ 2432 */
2364 tsk->flags |= PF_EXITING; 2433 tsk->flags |= PF_EXITING;
2434
2435 threadgroup_change_end(tsk);
2436
2365 if (!signal_pending(tsk)) 2437 if (!signal_pending(tsk))
2366 goto out; 2438 goto out;
2367 2439
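
The new block_sigmask() helper added above factors out logic that each architecture's signal-delivery path otherwise open-codes. A hypothetical caller might look roughly like the sketch below (handle_signal() and setup_rt_frame() are arch-specific names used only for illustration):

static void handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info,
			  struct pt_regs *regs)
{
	/* Arch-specific: build the signal frame on the user stack. */
	if (setup_rt_frame(sig, ka, info, regs) < 0)
		return;

	/*
	 * Frame is in place: block ka->sa.sa_mask, plus sig itself
	 * unless SA_NODEFER was requested.
	 */
	block_sigmask(ka, sig);
}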
diff --git a/kernel/smp.c b/kernel/smp.c
index fb67dfa8394e..db197d60489b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -6,7 +6,7 @@
6#include <linux/rcupdate.h> 6#include <linux/rcupdate.h>
7#include <linux/rculist.h> 7#include <linux/rculist.h>
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/gfp.h> 12#include <linux/gfp.h>
diff --git a/kernel/softirq.c b/kernel/softirq.c
index fca82c32042b..4eb3a0fa351e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -10,7 +10,7 @@
10 * Remote softirq infrastructure is by Jens Axboe. 10 * Remote softirq infrastructure is by Jens Axboe.
11 */ 11 */
12 12
13#include <linux/module.h> 13#include <linux/export.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/interrupt.h> 15#include <linux/interrupt.h>
16#include <linux/init.h> 16#include <linux/init.h>
@@ -347,12 +347,12 @@ void irq_exit(void)
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
348 invoke_softirq(); 348 invoke_softirq();
349 349
350 rcu_irq_exit();
351#ifdef CONFIG_NO_HZ 350#ifdef CONFIG_NO_HZ
352 /* Make sure that timer wheel updates are propagated */ 351 /* Make sure that timer wheel updates are propagated */
353 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 352 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
354 tick_nohz_stop_sched_tick(0); 353 tick_nohz_irq_exit();
355#endif 354#endif
355 rcu_irq_exit();
356 preempt_enable_no_resched(); 356 preempt_enable_no_resched();
357} 357}
358 358
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index be6517fb9c14..84c7d96918bf 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -19,7 +19,7 @@
19#include <linux/spinlock.h> 19#include <linux/spinlock.h>
20#include <linux/interrupt.h> 20#include <linux/interrupt.h>
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/export.h>
23 23
24/* 24/*
25 * If lockdep is enabled then we use the non-preemption spin-ops 25 * If lockdep is enabled then we use the non-preemption spin-ops
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 73ce23feaea9..0febf61e1aa3 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -24,7 +24,7 @@
24 * 24 *
25 */ 25 */
26 26
27#include <linux/module.h> 27#include <linux/export.h>
28#include <linux/mutex.h> 28#include <linux/mutex.h>
29#include <linux/percpu.h> 29#include <linux/percpu.h>
30#include <linux/preempt.h> 30#include <linux/preempt.h>
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index d20c6983aad9..00fe55cc5a82 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -7,7 +7,7 @@
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/module.h> 10#include <linux/export.h>
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/stacktrace.h> 12#include <linux/stacktrace.h>
13 13
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index ba5070ce5765..2f194e965715 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -12,7 +12,7 @@
12#include <linux/cpu.h> 12#include <linux/cpu.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
@@ -41,6 +41,7 @@ struct cpu_stopper {
41}; 41};
42 42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44static bool stop_machine_initialized = false;
44 45
45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
46{ 47{
@@ -386,6 +387,8 @@ static int __init cpu_stop_init(void)
386 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); 387 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
387 register_cpu_notifier(&cpu_stop_cpu_notifier); 388 register_cpu_notifier(&cpu_stop_cpu_notifier);
388 389
390 stop_machine_initialized = true;
391
389 return 0; 392 return 0;
390} 393}
391early_initcall(cpu_stop_init); 394early_initcall(cpu_stop_init);
@@ -485,6 +488,25 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
485 .num_threads = num_online_cpus(), 488 .num_threads = num_online_cpus(),
486 .active_cpus = cpus }; 489 .active_cpus = cpus };
487 490
491 if (!stop_machine_initialized) {
492 /*
 493 * Handle the case where stop_machine() is called
 494 * early in boot, before the stop_machine
 495 * infrastructure has been initialized.
496 */
497 unsigned long flags;
498 int ret;
499
500 WARN_ON_ONCE(smdata.num_threads != 1);
501
502 local_irq_save(flags);
503 hard_irq_disable();
504 ret = (*fn)(data);
505 local_irq_restore(flags);
506
507 return ret;
508 }
509
488 /* Set the initial state and stop all online cpus. */ 510 /* Set the initial state and stop all online cpus. */
489 set_state(&smdata, STOPMACHINE_PREPARE); 511 set_state(&smdata, STOPMACHINE_PREPARE);
490 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); 512 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
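
The stop_machine_initialized flag lets __stop_machine() degrade gracefully when it is invoked before cpu_stop_init() has run: the callback simply executes on the calling CPU with hard IRQs disabled instead of going through the per-CPU stopper threads. Callers are unaffected either way; a hedged sketch of an in-kernel user (my_update and apply_update are illustrative names, not part of this patch):

#include <linux/stop_machine.h>

/* Runs while every other online CPU spins in the stopper with IRQs off,
 * or directly on the boot CPU if called before cpu_stop_init(). */
static int my_update(void *data)
{
	*(int *)data = 42;
	return 0;
}

static int apply_update(void)
{
	int val = 0;

	/* NULL cpumask: run my_update() on one CPU, hold all the others. */
	return stop_machine(my_update, &val, NULL);
}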
diff --git a/kernel/sys.c b/kernel/sys.c
index 1dbbe695a5ef..40701538fbd1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/module.h> 7#include <linux/export.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
@@ -12,6 +12,7 @@
12#include <linux/prctl.h> 12#include <linux/prctl.h>
13#include <linux/highuid.h> 13#include <linux/highuid.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/kmod.h>
15#include <linux/perf_event.h> 16#include <linux/perf_event.h>
16#include <linux/resource.h> 17#include <linux/resource.h>
17#include <linux/kernel.h> 18#include <linux/kernel.h>
@@ -1286,6 +1287,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1286 memset(u->nodename + len, 0, sizeof(u->nodename) - len); 1287 memset(u->nodename + len, 0, sizeof(u->nodename) - len);
1287 errno = 0; 1288 errno = 0;
1288 } 1289 }
1290 uts_proc_notify(UTS_PROC_HOSTNAME);
1289 up_write(&uts_sem); 1291 up_write(&uts_sem);
1290 return errno; 1292 return errno;
1291} 1293}
@@ -1336,6 +1338,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1336 memset(u->domainname + len, 0, sizeof(u->domainname) - len); 1338 memset(u->domainname + len, 0, sizeof(u->domainname) - len);
1337 errno = 0; 1339 errno = 0;
1338 } 1340 }
1341 uts_proc_notify(UTS_PROC_DOMAINNAME);
1339 up_write(&uts_sem); 1342 up_write(&uts_sem);
1340 return errno; 1343 return errno;
1341} 1344}
@@ -1602,7 +1605,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1602 unsigned long maxrss = 0; 1605 unsigned long maxrss = 0;
1603 1606
1604 memset((char *) r, 0, sizeof *r); 1607 memset((char *) r, 0, sizeof *r);
1605 utime = stime = cputime_zero; 1608 utime = stime = 0;
1606 1609
1607 if (who == RUSAGE_THREAD) { 1610 if (who == RUSAGE_THREAD) {
1608 task_times(current, &utime, &stime); 1611 task_times(current, &utime, &stime);
@@ -1632,8 +1635,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1632 1635
1633 case RUSAGE_SELF: 1636 case RUSAGE_SELF:
1634 thread_group_times(p, &tgutime, &tgstime); 1637 thread_group_times(p, &tgutime, &tgstime);
1635 utime = cputime_add(utime, tgutime); 1638 utime += tgutime;
1636 stime = cputime_add(stime, tgstime); 1639 stime += tgstime;
1637 r->ru_nvcsw += p->signal->nvcsw; 1640 r->ru_nvcsw += p->signal->nvcsw;
1638 r->ru_nivcsw += p->signal->nivcsw; 1641 r->ru_nivcsw += p->signal->nivcsw;
1639 r->ru_minflt += p->signal->min_flt; 1642 r->ru_minflt += p->signal->min_flt;
@@ -1689,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask)
1689 return mask; 1692 return mask;
1690} 1693}
1691 1694
1695#ifdef CONFIG_CHECKPOINT_RESTORE
1696static int prctl_set_mm(int opt, unsigned long addr,
1697 unsigned long arg4, unsigned long arg5)
1698{
1699 unsigned long rlim = rlimit(RLIMIT_DATA);
1700 unsigned long vm_req_flags;
1701 unsigned long vm_bad_flags;
1702 struct vm_area_struct *vma;
1703 int error = 0;
1704 struct mm_struct *mm = current->mm;
1705
1706 if (arg4 | arg5)
1707 return -EINVAL;
1708
1709 if (!capable(CAP_SYS_ADMIN))
1710 return -EPERM;
1711
1712 if (addr >= TASK_SIZE)
1713 return -EINVAL;
1714
1715 down_read(&mm->mmap_sem);
1716 vma = find_vma(mm, addr);
1717
1718 if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
1719 /* It must be existing VMA */
1720 if (!vma || vma->vm_start > addr)
1721 goto out;
1722 }
1723
1724 error = -EINVAL;
1725 switch (opt) {
1726 case PR_SET_MM_START_CODE:
1727 case PR_SET_MM_END_CODE:
1728 vm_req_flags = VM_READ | VM_EXEC;
1729 vm_bad_flags = VM_WRITE | VM_MAYSHARE;
1730
1731 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1732 (vma->vm_flags & vm_bad_flags))
1733 goto out;
1734
1735 if (opt == PR_SET_MM_START_CODE)
1736 mm->start_code = addr;
1737 else
1738 mm->end_code = addr;
1739 break;
1740
1741 case PR_SET_MM_START_DATA:
1742 case PR_SET_MM_END_DATA:
1743 vm_req_flags = VM_READ | VM_WRITE;
1744 vm_bad_flags = VM_EXEC | VM_MAYSHARE;
1745
1746 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1747 (vma->vm_flags & vm_bad_flags))
1748 goto out;
1749
1750 if (opt == PR_SET_MM_START_DATA)
1751 mm->start_data = addr;
1752 else
1753 mm->end_data = addr;
1754 break;
1755
1756 case PR_SET_MM_START_STACK:
1757
1758#ifdef CONFIG_STACK_GROWSUP
1759 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
1760#else
1761 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
1762#endif
1763 if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
1764 goto out;
1765
1766 mm->start_stack = addr;
1767 break;
1768
1769 case PR_SET_MM_START_BRK:
1770 if (addr <= mm->end_data)
1771 goto out;
1772
1773 if (rlim < RLIM_INFINITY &&
1774 (mm->brk - addr) +
1775 (mm->end_data - mm->start_data) > rlim)
1776 goto out;
1777
1778 mm->start_brk = addr;
1779 break;
1780
1781 case PR_SET_MM_BRK:
1782 if (addr <= mm->end_data)
1783 goto out;
1784
1785 if (rlim < RLIM_INFINITY &&
1786 (addr - mm->start_brk) +
1787 (mm->end_data - mm->start_data) > rlim)
1788 goto out;
1789
1790 mm->brk = addr;
1791 break;
1792
1793 default:
1794 error = -EINVAL;
1795 goto out;
1796 }
1797
1798 error = 0;
1799
1800out:
1801 up_read(&mm->mmap_sem);
1802
1803 return error;
1804}
1805#else /* CONFIG_CHECKPOINT_RESTORE */
1806static int prctl_set_mm(int opt, unsigned long addr,
1807 unsigned long arg4, unsigned long arg5)
1808{
1809 return -EINVAL;
1810}
1811#endif
1812
1692SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 1813SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1693 unsigned long, arg4, unsigned long, arg5) 1814 unsigned long, arg4, unsigned long, arg5)
1694{ 1815{
@@ -1759,6 +1880,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1759 sizeof(me->comm) - 1) < 0) 1880 sizeof(me->comm) - 1) < 0)
1760 return -EFAULT; 1881 return -EFAULT;
1761 set_task_comm(me, comm); 1882 set_task_comm(me, comm);
1883 proc_comm_connector(me);
1762 return 0; 1884 return 0;
1763 case PR_GET_NAME: 1885 case PR_GET_NAME:
1764 get_task_comm(comm, me); 1886 get_task_comm(comm, me);
@@ -1837,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1837 else 1959 else
1838 error = PR_MCE_KILL_DEFAULT; 1960 error = PR_MCE_KILL_DEFAULT;
1839 break; 1961 break;
1962 case PR_SET_MM:
1963 error = prctl_set_mm(arg2, arg3, arg4, arg5);
1964 break;
1840 default: 1965 default:
1841 error = -EINVAL; 1966 error = -EINVAL;
1842 break; 1967 break;
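
prctl_set_mm() gives checkpoint/restore tools a way to rewrite the mm_struct layout fields (code/data/stack/brk bounds) of the current task, gated on CONFIG_CHECKPOINT_RESTORE and CAP_SYS_ADMIN. A hedged userspace sketch; the numeric fallbacks mirror the PR_SET_MM* values merged into <linux/prctl.h> and should be treated as assumptions if your headers predate them:

#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>

#ifndef PR_SET_MM
#define PR_SET_MM		35
#define PR_SET_MM_START_BRK	6
#define PR_SET_MM_BRK		7
#endif

int main(void)
{
	/* Illustrative only: point start_brk/brk at the current break.
	 * Requires CAP_SYS_ADMIN and a CONFIG_CHECKPOINT_RESTORE kernel. */
	unsigned long brk = (unsigned long)sbrk(0);

	if (prctl(PR_SET_MM, PR_SET_MM_START_BRK, brk, 0, 0))
		perror("PR_SET_MM_START_BRK");
	if (prctl(PR_SET_MM, PR_SET_MM_BRK, brk, 0, 0))
		perror("PR_SET_MM_BRK");
	return 0;
}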
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a9a5de07c4f1..47bfa16430d7 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -145,6 +145,10 @@ cond_syscall(sys_io_submit);
145cond_syscall(sys_io_cancel); 145cond_syscall(sys_io_cancel);
146cond_syscall(sys_io_getevents); 146cond_syscall(sys_io_getevents);
147cond_syscall(sys_syslog); 147cond_syscall(sys_syslog);
148cond_syscall(sys_process_vm_readv);
149cond_syscall(sys_process_vm_writev);
150cond_syscall(compat_sys_process_vm_readv);
151cond_syscall(compat_sys_process_vm_writev);
148 152
149/* arch-specific weak syscall entries */ 153/* arch-specific weak syscall entries */
150cond_syscall(sys_pciconfig_read); 154cond_syscall(sys_pciconfig_read);
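
The new cond_syscall() entries provide -ENOSYS stubs for process_vm_readv()/process_vm_writev() on configurations that do not build them (CONFIG_CROSS_MEMORY_ATTACH). A minimal reader, assuming a glibc new enough (2.15+) to ship the wrapper in <sys/uio.h>; the pid and address come from the command line:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/uio.h>

int main(int argc, char **argv)
{
	char buf[64];
	struct iovec local = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct iovec remote;
	ssize_t n;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <pid> <hex-addr>\n", argv[0]);
		return 1;
	}
	remote.iov_base = (void *)strtoul(argv[2], NULL, 16);
	remote.iov_len = sizeof(buf);

	/* Copies straight out of the target's address space; fails with
	 * ENOSYS where the kernel lacks CONFIG_CROSS_MEMORY_ATTACH. */
	n = process_vm_readv((pid_t)atoi(argv[1]), &local, 1, &remote, 1, 0);
	if (n < 0)
		perror("process_vm_readv");
	else
		printf("read %zd bytes\n", n);
	return 0;
}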
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b531e50..f487f257e05e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -57,6 +57,7 @@
57#include <linux/pipe_fs_i.h> 57#include <linux/pipe_fs_i.h>
58#include <linux/oom.h> 58#include <linux/oom.h>
59#include <linux/kmod.h> 59#include <linux/kmod.h>
60#include <linux/capability.h>
60 61
61#include <asm/uaccess.h> 62#include <asm/uaccess.h>
62#include <asm/processor.h> 63#include <asm/processor.h>
@@ -134,6 +135,7 @@ static int minolduid;
134static int min_percpu_pagelist_fract = 8; 135static int min_percpu_pagelist_fract = 8;
135 136
136static int ngroups_max = NGROUPS_MAX; 137static int ngroups_max = NGROUPS_MAX;
138static const int cap_last_cap = CAP_LAST_CAP;
137 139
138#ifdef CONFIG_INOTIFY_USER 140#ifdef CONFIG_INOTIFY_USER
139#include <linux/inotify.h> 141#include <linux/inotify.h>
@@ -151,14 +153,6 @@ extern int pwrsw_enabled;
151extern int unaligned_enabled; 153extern int unaligned_enabled;
152#endif 154#endif
153 155
154#ifdef CONFIG_S390
155#ifdef CONFIG_MATHEMU
156extern int sysctl_ieee_emulation_warnings;
157#endif
158extern int sysctl_userprocess_debug;
159extern int spin_retry;
160#endif
161
162#ifdef CONFIG_IA64 156#ifdef CONFIG_IA64
163extern int no_unaligned_warning; 157extern int no_unaligned_warning;
164extern int unaligned_dump_stack; 158extern int unaligned_dump_stack;
@@ -379,6 +373,16 @@ static struct ctl_table kern_table[] = {
379 .extra2 = &one, 373 .extra2 = &one,
380 }, 374 },
381#endif 375#endif
376#ifdef CONFIG_CFS_BANDWIDTH
377 {
378 .procname = "sched_cfs_bandwidth_slice_us",
379 .data = &sysctl_sched_cfs_bandwidth_slice,
380 .maxlen = sizeof(unsigned int),
381 .mode = 0644,
382 .proc_handler = proc_dointvec_minmax,
383 .extra1 = &one,
384 },
385#endif
382#ifdef CONFIG_PROVE_LOCKING 386#ifdef CONFIG_PROVE_LOCKING
383 { 387 {
384 .procname = "prove_locking", 388 .procname = "prove_locking",
@@ -730,6 +734,13 @@ static struct ctl_table kern_table[] = {
730 .mode = 0444, 734 .mode = 0444,
731 .proc_handler = proc_dointvec, 735 .proc_handler = proc_dointvec,
732 }, 736 },
737 {
738 .procname = "cap_last_cap",
739 .data = (void *)&cap_last_cap,
740 .maxlen = sizeof(int),
741 .mode = 0444,
742 .proc_handler = proc_dointvec,
743 },
733#if defined(CONFIG_LOCKUP_DETECTOR) 744#if defined(CONFIG_LOCKUP_DETECTOR)
734 { 745 {
735 .procname = "watchdog", 746 .procname = "watchdog",
@@ -792,6 +803,15 @@ static struct ctl_table kern_table[] = {
792 .mode = 0644, 803 .mode = 0644,
793 .proc_handler = proc_dointvec, 804 .proc_handler = proc_dointvec,
794 }, 805 },
806#ifdef CONFIG_DEBUG_STACKOVERFLOW
807 {
808 .procname = "panic_on_stackoverflow",
809 .data = &sysctl_panic_on_stackoverflow,
810 .maxlen = sizeof(int),
811 .mode = 0644,
812 .proc_handler = proc_dointvec,
813 },
814#endif
795 { 815 {
796 .procname = "bootloader_type", 816 .procname = "bootloader_type",
797 .data = &bootloader_type, 817 .data = &bootloader_type,
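
The read-only kernel.cap_last_cap entry exports CAP_LAST_CAP so userspace can size capability sets at run time instead of trusting compile-time headers; sched_cfs_bandwidth_slice_us and panic_on_stackoverflow are ordinary writable integers. A minimal sketch reading the procfs path implied by the kern_table entry:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/cap_last_cap", "r");
	int last_cap;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%d", &last_cap) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);
	printf("kernel supports capabilities 0..%d\n", last_cap);
	return 0;
}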
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index e8bffbe2ba4b..a650694883a1 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -214,7 +214,7 @@ static const struct bin_table bin_net_ipv4_route_table[] = {
214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, 214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
215 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, 215 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
216 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, 216 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
217 { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, 217 /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */
218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, 218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, 219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
220 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, 220 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
@@ -1354,7 +1354,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1354 1354
1355 fput(file); 1355 fput(file);
1356out_putname: 1356out_putname:
1357 putname(pathname); 1357 __putname(pathname);
1358out: 1358out:
1359 return result; 1359 return result;
1360} 1360}
diff --git a/kernel/time.c b/kernel/time.c
index 8e8dc6d705c9..73e416db0a1e 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -27,7 +27,7 @@
27 * with nanosecond accuracy 27 * with nanosecond accuracy
28 */ 28 */
29 29
30#include <linux/module.h> 30#include <linux/export.h>
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h> 32#include <linux/capability.h>
33#include <linux/clocksource.h> 33#include <linux/clocksource.h>
@@ -575,7 +575,7 @@ EXPORT_SYMBOL(jiffies_to_timeval);
575/* 575/*
576 * Convert jiffies/jiffies_64 to clock_t and back. 576 * Convert jiffies/jiffies_64 to clock_t and back.
577 */ 577 */
578clock_t jiffies_to_clock_t(long x) 578clock_t jiffies_to_clock_t(unsigned long x)
579{ 579{
580#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 580#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
581# if HZ < USER_HZ 581# if HZ < USER_HZ
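
Widening jiffies_to_clock_t() to take unsigned long keeps large tick counts (e.g. process start times exported via /proc on 32-bit) from being interpreted as negative. The clock_t values it produces are in USER_HZ units, which userspace converts back with sysconf(_SC_CLK_TCK); a small sketch of that consumer-side conversion (illustrative, not tied to this hunk):

#include <stdio.h>
#include <sys/times.h>
#include <unistd.h>

int main(void)
{
	struct tms t;
	clock_t ticks = times(&t);          /* clock_t ticks since an arbitrary epoch */
	long hz = sysconf(_SC_CLK_TCK);     /* USER_HZ, the unit clock_t is counted in */

	printf("%.2f s elapsed (%ld ticks/s)\n", (double)ticks / hz, hz);
	return 0;
}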
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index f06a8a365648..2cf9cc7aa103 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -25,5 +25,7 @@ config HIGH_RES_TIMERS
25config GENERIC_CLOCKEVENTS_BUILD 25config GENERIC_CLOCKEVENTS_BUILD
26 bool 26 bool
27 default y 27 default y
28 depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR 28 depends on GENERIC_CLOCKEVENTS
29 29
30config GENERIC_CLOCKEVENTS_MIN_ADJUST
31 bool
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index ea5e1a928d5b..8a46f5d64504 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -53,27 +53,6 @@ static struct rtc_device *rtcdev;
53static DEFINE_SPINLOCK(rtcdev_lock); 53static DEFINE_SPINLOCK(rtcdev_lock);
54 54
55/** 55/**
56 * has_wakealarm - check rtc device has wakealarm ability
57 * @dev: current device
58 * @name_ptr: name to be returned
59 *
60 * This helper function checks to see if the rtc device can wake
61 * from suspend.
62 */
63static int has_wakealarm(struct device *dev, void *name_ptr)
64{
65 struct rtc_device *candidate = to_rtc_device(dev);
66
67 if (!candidate->ops->set_alarm)
68 return 0;
69 if (!device_may_wakeup(candidate->dev.parent))
70 return 0;
71
72 *(const char **)name_ptr = dev_name(dev);
73 return 1;
74}
75
76/**
77 * alarmtimer_get_rtcdev - Return selected rtcdevice 56 * alarmtimer_get_rtcdev - Return selected rtcdevice
78 * 57 *
79 * This function returns the rtc device to use for wakealarms. 58 * This function returns the rtc device to use for wakealarms.
@@ -82,37 +61,64 @@ static int has_wakealarm(struct device *dev, void *name_ptr)
82 */ 61 */
83static struct rtc_device *alarmtimer_get_rtcdev(void) 62static struct rtc_device *alarmtimer_get_rtcdev(void)
84{ 63{
85 struct device *dev;
86 char *str;
87 unsigned long flags; 64 unsigned long flags;
88 struct rtc_device *ret; 65 struct rtc_device *ret;
89 66
90 spin_lock_irqsave(&rtcdev_lock, flags); 67 spin_lock_irqsave(&rtcdev_lock, flags);
91 if (!rtcdev) {
92 /* Find an rtc device and init the rtc_timer */
93 dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
94 /* If we have a device then str is valid. See has_wakealarm() */
95 if (dev) {
96 rtcdev = rtc_class_open(str);
97 /*
98 * Drop the reference we got in class_find_device,
99 * rtc_open takes its own.
100 */
101 put_device(dev);
102 rtc_timer_init(&rtctimer, NULL, NULL);
103 }
104 }
105 ret = rtcdev; 68 ret = rtcdev;
106 spin_unlock_irqrestore(&rtcdev_lock, flags); 69 spin_unlock_irqrestore(&rtcdev_lock, flags);
107 70
108 return ret; 71 return ret;
109} 72}
73
74
75static int alarmtimer_rtc_add_device(struct device *dev,
76 struct class_interface *class_intf)
77{
78 unsigned long flags;
79 struct rtc_device *rtc = to_rtc_device(dev);
80
81 if (rtcdev)
82 return -EBUSY;
83
84 if (!rtc->ops->set_alarm)
85 return -1;
86 if (!device_may_wakeup(rtc->dev.parent))
87 return -1;
88
89 spin_lock_irqsave(&rtcdev_lock, flags);
90 if (!rtcdev) {
91 rtcdev = rtc;
92 /* hold a reference so it doesn't go away */
93 get_device(dev);
94 }
95 spin_unlock_irqrestore(&rtcdev_lock, flags);
96 return 0;
97}
98
99static struct class_interface alarmtimer_rtc_interface = {
100 .add_dev = &alarmtimer_rtc_add_device,
101};
102
103static int alarmtimer_rtc_interface_setup(void)
104{
105 alarmtimer_rtc_interface.class = rtc_class;
106 return class_interface_register(&alarmtimer_rtc_interface);
107}
108static void alarmtimer_rtc_interface_remove(void)
109{
110 class_interface_unregister(&alarmtimer_rtc_interface);
111}
110#else 112#else
111#define alarmtimer_get_rtcdev() (0) 113static inline struct rtc_device *alarmtimer_get_rtcdev(void)
112#define rtcdev (0) 114{
115 return NULL;
116}
117#define rtcdev (NULL)
118static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
119static inline void alarmtimer_rtc_interface_remove(void) { }
113#endif 120#endif
114 121
115
116/** 122/**
117 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue 123 * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
118 * @base: pointer to the base where the timer is being run 124 * @base: pointer to the base where the timer is being run
@@ -126,6 +132,8 @@ static struct rtc_device *alarmtimer_get_rtcdev(void)
126static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) 132static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
127{ 133{
128 timerqueue_add(&base->timerqueue, &alarm->node); 134 timerqueue_add(&base->timerqueue, &alarm->node);
135 alarm->state |= ALARMTIMER_STATE_ENQUEUED;
136
129 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { 137 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
130 hrtimer_try_to_cancel(&base->timer); 138 hrtimer_try_to_cancel(&base->timer);
131 hrtimer_start(&base->timer, alarm->node.expires, 139 hrtimer_start(&base->timer, alarm->node.expires,
@@ -147,7 +155,12 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
147{ 155{
148 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); 156 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
149 157
158 if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
159 return;
160
150 timerqueue_del(&base->timerqueue, &alarm->node); 161 timerqueue_del(&base->timerqueue, &alarm->node);
162 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
163
151 if (next == &alarm->node) { 164 if (next == &alarm->node) {
152 hrtimer_try_to_cancel(&base->timer); 165 hrtimer_try_to_cancel(&base->timer);
153 next = timerqueue_getnext(&base->timerqueue); 166 next = timerqueue_getnext(&base->timerqueue);
@@ -174,6 +187,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
174 unsigned long flags; 187 unsigned long flags;
175 ktime_t now; 188 ktime_t now;
176 int ret = HRTIMER_NORESTART; 189 int ret = HRTIMER_NORESTART;
190 int restart = ALARMTIMER_NORESTART;
177 191
178 spin_lock_irqsave(&base->lock, flags); 192 spin_lock_irqsave(&base->lock, flags);
179 now = base->gettime(); 193 now = base->gettime();
@@ -181,23 +195,25 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
181 struct alarm *alarm; 195 struct alarm *alarm;
182 ktime_t expired = next->expires; 196 ktime_t expired = next->expires;
183 197
184 if (expired.tv64 >= now.tv64) 198 if (expired.tv64 > now.tv64)
185 break; 199 break;
186 200
187 alarm = container_of(next, struct alarm, node); 201 alarm = container_of(next, struct alarm, node);
188 202
189 timerqueue_del(&base->timerqueue, &alarm->node); 203 timerqueue_del(&base->timerqueue, &alarm->node);
190 alarm->enabled = 0; 204 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
191 /* Re-add periodic timers */ 205
192 if (alarm->period.tv64) { 206 alarm->state |= ALARMTIMER_STATE_CALLBACK;
193 alarm->node.expires = ktime_add(expired, alarm->period);
194 timerqueue_add(&base->timerqueue, &alarm->node);
195 alarm->enabled = 1;
196 }
197 spin_unlock_irqrestore(&base->lock, flags); 207 spin_unlock_irqrestore(&base->lock, flags);
198 if (alarm->function) 208 if (alarm->function)
199 alarm->function(alarm); 209 restart = alarm->function(alarm, now);
200 spin_lock_irqsave(&base->lock, flags); 210 spin_lock_irqsave(&base->lock, flags);
211 alarm->state &= ~ALARMTIMER_STATE_CALLBACK;
212
213 if (restart != ALARMTIMER_NORESTART) {
214 timerqueue_add(&base->timerqueue, &alarm->node);
215 alarm->state |= ALARMTIMER_STATE_ENQUEUED;
216 }
201 } 217 }
202 218
203 if (next) { 219 if (next) {
@@ -234,7 +250,7 @@ static int alarmtimer_suspend(struct device *dev)
234 freezer_delta = ktime_set(0, 0); 250 freezer_delta = ktime_set(0, 0);
235 spin_unlock_irqrestore(&freezer_delta_lock, flags); 251 spin_unlock_irqrestore(&freezer_delta_lock, flags);
236 252
237 rtc = rtcdev; 253 rtc = alarmtimer_get_rtcdev();
238 /* If we have no rtcdev, just return */ 254 /* If we have no rtcdev, just return */
239 if (!rtc) 255 if (!rtc)
240 return 0; 256 return 0;
@@ -299,53 +315,111 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
299 * @function: callback that is run when the alarm fires 315 * @function: callback that is run when the alarm fires
300 */ 316 */
301void alarm_init(struct alarm *alarm, enum alarmtimer_type type, 317void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
302 void (*function)(struct alarm *)) 318 enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
303{ 319{
304 timerqueue_init(&alarm->node); 320 timerqueue_init(&alarm->node);
305 alarm->period = ktime_set(0, 0);
306 alarm->function = function; 321 alarm->function = function;
307 alarm->type = type; 322 alarm->type = type;
308 alarm->enabled = 0; 323 alarm->state = ALARMTIMER_STATE_INACTIVE;
309} 324}
310 325
311/** 326/**
312 * alarm_start - Sets an alarm to fire 327 * alarm_start - Sets an alarm to fire
313 * @alarm: ptr to alarm to set 328 * @alarm: ptr to alarm to set
314 * @start: time to run the alarm 329 * @start: time to run the alarm
315 * @period: period at which the alarm will recur
316 */ 330 */
317void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period) 331void alarm_start(struct alarm *alarm, ktime_t start)
318{ 332{
319 struct alarm_base *base = &alarm_bases[alarm->type]; 333 struct alarm_base *base = &alarm_bases[alarm->type];
320 unsigned long flags; 334 unsigned long flags;
321 335
322 spin_lock_irqsave(&base->lock, flags); 336 spin_lock_irqsave(&base->lock, flags);
323 if (alarm->enabled) 337 if (alarmtimer_active(alarm))
324 alarmtimer_remove(base, alarm); 338 alarmtimer_remove(base, alarm);
325 alarm->node.expires = start; 339 alarm->node.expires = start;
326 alarm->period = period;
327 alarmtimer_enqueue(base, alarm); 340 alarmtimer_enqueue(base, alarm);
328 alarm->enabled = 1;
329 spin_unlock_irqrestore(&base->lock, flags); 341 spin_unlock_irqrestore(&base->lock, flags);
330} 342}
331 343
332/** 344/**
333 * alarm_cancel - Tries to cancel an alarm timer 345 * alarm_try_to_cancel - Tries to cancel an alarm timer
334 * @alarm: ptr to alarm to be canceled 346 * @alarm: ptr to alarm to be canceled
347 *
348 * Returns 1 if the timer was canceled, 0 if it was not running,
349 * and -1 if the callback was running
335 */ 350 */
336void alarm_cancel(struct alarm *alarm) 351int alarm_try_to_cancel(struct alarm *alarm)
337{ 352{
338 struct alarm_base *base = &alarm_bases[alarm->type]; 353 struct alarm_base *base = &alarm_bases[alarm->type];
339 unsigned long flags; 354 unsigned long flags;
340 355 int ret = -1;
341 spin_lock_irqsave(&base->lock, flags); 356 spin_lock_irqsave(&base->lock, flags);
342 if (alarm->enabled) 357
358 if (alarmtimer_callback_running(alarm))
359 goto out;
360
361 if (alarmtimer_is_queued(alarm)) {
343 alarmtimer_remove(base, alarm); 362 alarmtimer_remove(base, alarm);
344 alarm->enabled = 0; 363 ret = 1;
364 } else
365 ret = 0;
366out:
345 spin_unlock_irqrestore(&base->lock, flags); 367 spin_unlock_irqrestore(&base->lock, flags);
368 return ret;
369}
370
371
372/**
373 * alarm_cancel - Spins trying to cancel an alarm timer until it is done
374 * @alarm: ptr to alarm to be canceled
375 *
376 * Returns 1 if the timer was canceled, 0 if it was not active.
377 */
378int alarm_cancel(struct alarm *alarm)
379{
380 for (;;) {
381 int ret = alarm_try_to_cancel(alarm);
382 if (ret >= 0)
383 return ret;
384 cpu_relax();
385 }
386}
387
388
389u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
390{
391 u64 overrun = 1;
392 ktime_t delta;
393
394 delta = ktime_sub(now, alarm->node.expires);
395
396 if (delta.tv64 < 0)
397 return 0;
398
399 if (unlikely(delta.tv64 >= interval.tv64)) {
400 s64 incr = ktime_to_ns(interval);
401
402 overrun = ktime_divns(delta, incr);
403
404 alarm->node.expires = ktime_add_ns(alarm->node.expires,
405 incr*overrun);
406
407 if (alarm->node.expires.tv64 > now.tv64)
408 return overrun;
409 /*
410 * This (and the ktime_add() below) is the
411 * correction for exact:
412 */
413 overrun++;
414 }
415
416 alarm->node.expires = ktime_add(alarm->node.expires, interval);
417 return overrun;
346} 418}
347 419
348 420
421
422
349/** 423/**
350 * clock2alarm - helper that converts from clockid to alarmtypes 424 * clock2alarm - helper that converts from clockid to alarmtypes
351 * @clockid: clockid. 425 * @clockid: clockid.
@@ -365,12 +439,21 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
365 * 439 *
366 * Posix timer callback for expired alarm timers. 440 * Posix timer callback for expired alarm timers.
367 */ 441 */
368static void alarm_handle_timer(struct alarm *alarm) 442static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
443 ktime_t now)
369{ 444{
370 struct k_itimer *ptr = container_of(alarm, struct k_itimer, 445 struct k_itimer *ptr = container_of(alarm, struct k_itimer,
371 it.alarmtimer); 446 it.alarm.alarmtimer);
372 if (posix_timer_event(ptr, 0) != 0) 447 if (posix_timer_event(ptr, 0) != 0)
373 ptr->it_overrun++; 448 ptr->it_overrun++;
449
450 /* Re-add periodic timers */
451 if (ptr->it.alarm.interval.tv64) {
452 ptr->it_overrun += alarm_forward(alarm, now,
453 ptr->it.alarm.interval);
454 return ALARMTIMER_RESTART;
455 }
456 return ALARMTIMER_NORESTART;
374} 457}
375 458
376/** 459/**
@@ -427,7 +510,7 @@ static int alarm_timer_create(struct k_itimer *new_timer)
427 510
428 type = clock2alarm(new_timer->it_clock); 511 type = clock2alarm(new_timer->it_clock);
429 base = &alarm_bases[type]; 512 base = &alarm_bases[type];
430 alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer); 513 alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer);
431 return 0; 514 return 0;
432} 515}
433 516
@@ -444,9 +527,9 @@ static void alarm_timer_get(struct k_itimer *timr,
444 memset(cur_setting, 0, sizeof(struct itimerspec)); 527 memset(cur_setting, 0, sizeof(struct itimerspec));
445 528
446 cur_setting->it_interval = 529 cur_setting->it_interval =
447 ktime_to_timespec(timr->it.alarmtimer.period); 530 ktime_to_timespec(timr->it.alarm.interval);
448 cur_setting->it_value = 531 cur_setting->it_value =
449 ktime_to_timespec(timr->it.alarmtimer.node.expires); 532 ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires);
450 return; 533 return;
451} 534}
452 535
@@ -461,7 +544,9 @@ static int alarm_timer_del(struct k_itimer *timr)
461 if (!rtcdev) 544 if (!rtcdev)
462 return -ENOTSUPP; 545 return -ENOTSUPP;
463 546
464 alarm_cancel(&timr->it.alarmtimer); 547 if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
548 return TIMER_RETRY;
549
465 return 0; 550 return 0;
466} 551}
467 552
@@ -481,25 +566,17 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
481 if (!rtcdev) 566 if (!rtcdev)
482 return -ENOTSUPP; 567 return -ENOTSUPP;
483 568
484 /*
485 * XXX HACK! Currently we can DOS a system if the interval
486 * period on alarmtimers is too small. Cap the interval here
487 * to 100us and solve this properly in a future patch! -jstultz
488 */
489 if ((new_setting->it_interval.tv_sec == 0) &&
490 (new_setting->it_interval.tv_nsec < 100000))
491 new_setting->it_interval.tv_nsec = 100000;
492
493 if (old_setting) 569 if (old_setting)
494 alarm_timer_get(timr, old_setting); 570 alarm_timer_get(timr, old_setting);
495 571
496 /* If the timer was already set, cancel it */ 572 /* If the timer was already set, cancel it */
497 alarm_cancel(&timr->it.alarmtimer); 573 if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
574 return TIMER_RETRY;
498 575
499 /* start the timer */ 576 /* start the timer */
500 alarm_start(&timr->it.alarmtimer, 577 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
501 timespec_to_ktime(new_setting->it_value), 578 alarm_start(&timr->it.alarm.alarmtimer,
502 timespec_to_ktime(new_setting->it_interval)); 579 timespec_to_ktime(new_setting->it_value));
503 return 0; 580 return 0;
504} 581}
505 582
@@ -509,13 +586,15 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
509 * 586 *
510 * Wakes up the task that set the alarmtimer 587 * Wakes up the task that set the alarmtimer
511 */ 588 */
512static void alarmtimer_nsleep_wakeup(struct alarm *alarm) 589static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
590 ktime_t now)
513{ 591{
514 struct task_struct *task = (struct task_struct *)alarm->data; 592 struct task_struct *task = (struct task_struct *)alarm->data;
515 593
516 alarm->data = NULL; 594 alarm->data = NULL;
517 if (task) 595 if (task)
518 wake_up_process(task); 596 wake_up_process(task);
597 return ALARMTIMER_NORESTART;
519} 598}
520 599
521/** 600/**
@@ -530,7 +609,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
530 alarm->data = (void *)current; 609 alarm->data = (void *)current;
531 do { 610 do {
532 set_current_state(TASK_INTERRUPTIBLE); 611 set_current_state(TASK_INTERRUPTIBLE);
533 alarm_start(alarm, absexp, ktime_set(0, 0)); 612 alarm_start(alarm, absexp);
534 if (likely(alarm->data)) 613 if (likely(alarm->data))
535 schedule(); 614 schedule();
536 615
@@ -691,6 +770,7 @@ static struct platform_driver alarmtimer_driver = {
691 */ 770 */
692static int __init alarmtimer_init(void) 771static int __init alarmtimer_init(void)
693{ 772{
773 struct platform_device *pdev;
694 int error = 0; 774 int error = 0;
695 int i; 775 int i;
696 struct k_clock alarm_clock = { 776 struct k_clock alarm_clock = {
@@ -719,10 +799,26 @@ static int __init alarmtimer_init(void)
719 HRTIMER_MODE_ABS); 799 HRTIMER_MODE_ABS);
720 alarm_bases[i].timer.function = alarmtimer_fired; 800 alarm_bases[i].timer.function = alarmtimer_fired;
721 } 801 }
802
803 error = alarmtimer_rtc_interface_setup();
804 if (error)
805 return error;
806
722 error = platform_driver_register(&alarmtimer_driver); 807 error = platform_driver_register(&alarmtimer_driver);
723 platform_device_register_simple("alarmtimer", -1, NULL, 0); 808 if (error)
809 goto out_if;
724 810
811 pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0);
812 if (IS_ERR(pdev)) {
813 error = PTR_ERR(pdev);
814 goto out_drv;
815 }
816 return 0;
817
818out_drv:
819 platform_driver_unregister(&alarmtimer_driver);
820out_if:
821 alarmtimer_rtc_interface_remove();
725 return error; 822 return error;
726} 823}
727device_initcall(alarmtimer_init); 824device_initcall(alarmtimer_init);
728
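
The alarm API now mirrors hrtimers: callbacks receive the current time and return ALARMTIMER_RESTART or ALARMTIMER_NORESTART, periodic behaviour moves into the callback via alarm_forward(), and cancellation gains a non-blocking alarm_try_to_cancel(). A hedged sketch of an in-kernel consumer; my_alarm, my_period and my_handler are illustrative names, not part of this patch:

#include <linux/alarmtimer.h>
#include <linux/ktime.h>

static struct alarm my_alarm;
static ktime_t my_period;

static enum alarmtimer_restart my_handler(struct alarm *a, ktime_t now)
{
	/* Periodic re-arm: push the expiry forward by whole periods and
	 * let the core requeue the alarm. */
	alarm_forward(a, now, my_period);
	return ALARMTIMER_RESTART;
}

static void my_setup(void)
{
	my_period = ktime_set(1, 0);	/* 1 second */
	alarm_init(&my_alarm, ALARM_REALTIME, my_handler);
	alarm_start(&my_alarm, ktime_add(ktime_get_real(), my_period));
}

static void my_teardown(void)
{
	alarm_cancel(&my_alarm);	/* spins until any running callback returns */
}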
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index e4c699dfa4e8..9cd928f7a7c6 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h>
21 20
22#include "tick-internal.h" 21#include "tick-internal.h"
23 22
@@ -94,42 +93,143 @@ void clockevents_shutdown(struct clock_event_device *dev)
94 dev->next_event.tv64 = KTIME_MAX; 93 dev->next_event.tv64 = KTIME_MAX;
95} 94}
96 95
96#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
97
98/* Limit min_delta to a jiffie */
99#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
100
101/**
102 * clockevents_increase_min_delta - raise minimum delta of a clock event device
103 * @dev: device to increase the minimum delta
104 *
105 * Returns 0 on success, -ETIME when the minimum delta reached the limit.
106 */
107static int clockevents_increase_min_delta(struct clock_event_device *dev)
108{
109 /* Nothing to do if we already reached the limit */
110 if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
111 printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n");
112 dev->next_event.tv64 = KTIME_MAX;
113 return -ETIME;
114 }
115
116 if (dev->min_delta_ns < 5000)
117 dev->min_delta_ns = 5000;
118 else
119 dev->min_delta_ns += dev->min_delta_ns >> 1;
120
121 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
122 dev->min_delta_ns = MIN_DELTA_LIMIT;
123
124 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
125 dev->name ? dev->name : "?",
126 (unsigned long long) dev->min_delta_ns);
127 return 0;
128}
129
130/**
131 * clockevents_program_min_delta - Set clock event device to the minimum delay.
132 * @dev: device to program
133 *
134 * Returns 0 on success, -ETIME when the retry loop failed.
135 */
136static int clockevents_program_min_delta(struct clock_event_device *dev)
137{
138 unsigned long long clc;
139 int64_t delta;
140 int i;
141
142 for (i = 0;;) {
143 delta = dev->min_delta_ns;
144 dev->next_event = ktime_add_ns(ktime_get(), delta);
145
146 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
147 return 0;
148
149 dev->retries++;
150 clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
151 if (dev->set_next_event((unsigned long) clc, dev) == 0)
152 return 0;
153
154 if (++i > 2) {
155 /*
156 * We tried 3 times to program the device with the
157 * given min_delta_ns. Try to increase the minimum
158 * delta, if that fails as well get out of here.
159 */
160 if (clockevents_increase_min_delta(dev))
161 return -ETIME;
162 i = 0;
163 }
164 }
165}
166
167#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
168
169/**
170 * clockevents_program_min_delta - Set clock event device to the minimum delay.
171 * @dev: device to program
172 *
173 * Returns 0 on success, -ETIME when the retry loop failed.
174 */
175static int clockevents_program_min_delta(struct clock_event_device *dev)
176{
177 unsigned long long clc;
178 int64_t delta;
179
180 delta = dev->min_delta_ns;
181 dev->next_event = ktime_add_ns(ktime_get(), delta);
182
183 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
184 return 0;
185
186 dev->retries++;
187 clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
188 return dev->set_next_event((unsigned long) clc, dev);
189}
190
191#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
192
97/** 193/**
98 * clockevents_program_event - Reprogram the clock event device. 194 * clockevents_program_event - Reprogram the clock event device.
195 * @dev: device to program
99 * @expires: absolute expiry time (monotonic clock) 196 * @expires: absolute expiry time (monotonic clock)
197 * @force: program minimum delay if expires can not be set
100 * 198 *
101 * Returns 0 on success, -ETIME when the event is in the past. 199 * Returns 0 on success, -ETIME when the event is in the past.
102 */ 200 */
103int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, 201int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
104 ktime_t now) 202 bool force)
105{ 203{
106 unsigned long long clc; 204 unsigned long long clc;
107 int64_t delta; 205 int64_t delta;
206 int rc;
108 207
109 if (unlikely(expires.tv64 < 0)) { 208 if (unlikely(expires.tv64 < 0)) {
110 WARN_ON_ONCE(1); 209 WARN_ON_ONCE(1);
111 return -ETIME; 210 return -ETIME;
112 } 211 }
113 212
114 delta = ktime_to_ns(ktime_sub(expires, now));
115
116 if (delta <= 0)
117 return -ETIME;
118
119 dev->next_event = expires; 213 dev->next_event = expires;
120 214
121 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) 215 if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
122 return 0; 216 return 0;
123 217
124 if (delta > dev->max_delta_ns) 218 /* Shortcut for clockevent devices that can deal with ktime. */
125 delta = dev->max_delta_ns; 219 if (dev->features & CLOCK_EVT_FEAT_KTIME)
126 if (delta < dev->min_delta_ns) 220 return dev->set_next_ktime(expires, dev);
127 delta = dev->min_delta_ns; 221
222 delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
223 if (delta <= 0)
224 return force ? clockevents_program_min_delta(dev) : -ETIME;
128 225
129 clc = delta * dev->mult; 226 delta = min(delta, (int64_t) dev->max_delta_ns);
130 clc >>= dev->shift; 227 delta = max(delta, (int64_t) dev->min_delta_ns);
131 228
132 return dev->set_next_event((unsigned long) clc, dev); 229 clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
230 rc = dev->set_next_event((unsigned long) clc, dev);
231
232 return (rc && force) ? clockevents_program_min_delta(dev) : rc;
133} 233}
134 234
135/** 235/**
@@ -258,7 +358,7 @@ int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
258 if (dev->mode != CLOCK_EVT_MODE_ONESHOT) 358 if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
259 return 0; 359 return 0;
260 360
261 return clockevents_program_event(dev, dev->next_event, ktime_get()); 361 return clockevents_program_event(dev, dev->next_event, false);
262} 362}
263 363
264/* 364/*
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e0980f0d9a0a..a45ca167ab24 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -23,8 +23,8 @@
23 * o Allow clocksource drivers to be unregistered 23 * o Allow clocksource drivers to be unregistered
24 */ 24 */
25 25
26#include <linux/device.h>
26#include <linux/clocksource.h> 27#include <linux/clocksource.h>
27#include <linux/sysdev.h>
28#include <linux/init.h> 28#include <linux/init.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
@@ -186,6 +186,7 @@ static struct timer_list watchdog_timer;
186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); 186static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
187static DEFINE_SPINLOCK(watchdog_lock); 187static DEFINE_SPINLOCK(watchdog_lock);
188static int watchdog_running; 188static int watchdog_running;
189static atomic_t watchdog_reset_pending;
189 190
190static int clocksource_watchdog_kthread(void *data); 191static int clocksource_watchdog_kthread(void *data);
191static void __clocksource_change_rating(struct clocksource *cs, int rating); 192static void __clocksource_change_rating(struct clocksource *cs, int rating);
@@ -247,12 +248,14 @@ static void clocksource_watchdog(unsigned long data)
247 struct clocksource *cs; 248 struct clocksource *cs;
248 cycle_t csnow, wdnow; 249 cycle_t csnow, wdnow;
249 int64_t wd_nsec, cs_nsec; 250 int64_t wd_nsec, cs_nsec;
250 int next_cpu; 251 int next_cpu, reset_pending;
251 252
252 spin_lock(&watchdog_lock); 253 spin_lock(&watchdog_lock);
253 if (!watchdog_running) 254 if (!watchdog_running)
254 goto out; 255 goto out;
255 256
257 reset_pending = atomic_read(&watchdog_reset_pending);
258
256 list_for_each_entry(cs, &watchdog_list, wd_list) { 259 list_for_each_entry(cs, &watchdog_list, wd_list) {
257 260
258 /* Clocksource already marked unstable? */ 261 /* Clocksource already marked unstable? */
@@ -268,7 +271,8 @@ static void clocksource_watchdog(unsigned long data)
268 local_irq_enable(); 271 local_irq_enable();
269 272
270 /* Clocksource initialized ? */ 273 /* Clocksource initialized ? */
271 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 274 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
275 atomic_read(&watchdog_reset_pending)) {
272 cs->flags |= CLOCK_SOURCE_WATCHDOG; 276 cs->flags |= CLOCK_SOURCE_WATCHDOG;
273 cs->wd_last = wdnow; 277 cs->wd_last = wdnow;
274 cs->cs_last = csnow; 278 cs->cs_last = csnow;
@@ -283,8 +287,11 @@ static void clocksource_watchdog(unsigned long data)
283 cs->cs_last = csnow; 287 cs->cs_last = csnow;
284 cs->wd_last = wdnow; 288 cs->wd_last = wdnow;
285 289
290 if (atomic_read(&watchdog_reset_pending))
291 continue;
292
286 /* Check the deviation from the watchdog clocksource. */ 293 /* Check the deviation from the watchdog clocksource. */
287 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { 294 if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
288 clocksource_unstable(cs, cs_nsec - wd_nsec); 295 clocksource_unstable(cs, cs_nsec - wd_nsec);
289 continue; 296 continue;
290 } 297 }
@@ -303,6 +310,13 @@ static void clocksource_watchdog(unsigned long data)
303 } 310 }
304 311
305 /* 312 /*
313 * We only clear the watchdog_reset_pending, when we did a
314 * full cycle through all clocksources.
315 */
316 if (reset_pending)
317 atomic_dec(&watchdog_reset_pending);
318
319 /*
306 * Cycle through CPUs to check if the CPUs stay synchronized 320 * Cycle through CPUs to check if the CPUs stay synchronized
307 * to each other. 321 * to each other.
308 */ 322 */
@@ -344,23 +358,7 @@ static inline void clocksource_reset_watchdog(void)
344 358
345static void clocksource_resume_watchdog(void) 359static void clocksource_resume_watchdog(void)
346{ 360{
347 unsigned long flags; 361 atomic_inc(&watchdog_reset_pending);
348
349 /*
350 * We use trylock here to avoid a potential dead lock when
351 * kgdb calls this code after the kernel has been stopped with
352 * watchdog_lock held. When watchdog_lock is held we just
353 * return and accept, that the watchdog might trigger and mark
354 * the monitored clock source (usually TSC) unstable.
355 *
356 * This does not affect the other caller clocksource_resume()
357 * because at this point the kernel is UP, interrupts are
358 * disabled and nothing can hold watchdog_lock.
359 */
360 if (!spin_trylock_irqsave(&watchdog_lock, flags))
361 return;
362 clocksource_reset_watchdog();
363 spin_unlock_irqrestore(&watchdog_lock, flags);
364} 362}
365 363
366static void clocksource_enqueue_watchdog(struct clocksource *cs) 364static void clocksource_enqueue_watchdog(struct clocksource *cs)
@@ -494,6 +492,22 @@ void clocksource_touch_watchdog(void)
494} 492}
495 493
496/** 494/**
495 * clocksource_max_adjustment- Returns max adjustment amount
496 * @cs: Pointer to clocksource
497 *
498 */
499static u32 clocksource_max_adjustment(struct clocksource *cs)
500{
501 u64 ret;
502 /*
503 * We won't try to correct for more then 11% adjustments (110,000 ppm),
504 */
505 ret = (u64)cs->mult * 11;
506 do_div(ret,100);
507 return (u32)ret;
508}
509
510/**
497 * clocksource_max_deferment - Returns max time the clocksource can be deferred 511 * clocksource_max_deferment - Returns max time the clocksource can be deferred
498 * @cs: Pointer to clocksource 512 * @cs: Pointer to clocksource
499 * 513 *
@@ -505,25 +519,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
505 /* 519 /*
506 * Calculate the maximum number of cycles that we can pass to the 520 * Calculate the maximum number of cycles that we can pass to the
507 * cyc2ns function without overflowing a 64-bit signed result. The 521 * cyc2ns function without overflowing a 64-bit signed result. The
508 * maximum number of cycles is equal to ULLONG_MAX/cs->mult which 522 * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj)
509 * is equivalent to the below. 523 * which is equivalent to the below.
510 * max_cycles < (2^63)/cs->mult 524 * max_cycles < (2^63)/(cs->mult + cs->maxadj)
511 * max_cycles < 2^(log2((2^63)/cs->mult)) 525 * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj)))
512 * max_cycles < 2^(log2(2^63) - log2(cs->mult)) 526 * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj))
513 * max_cycles < 2^(63 - log2(cs->mult)) 527 * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj))
514 * max_cycles < 1 << (63 - log2(cs->mult)) 528 * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj))
515 * Please note that we add 1 to the result of the log2 to account for 529 * Please note that we add 1 to the result of the log2 to account for
516 * any rounding errors, ensure the above inequality is satisfied and 530 * any rounding errors, ensure the above inequality is satisfied and
517 * no overflow will occur. 531 * no overflow will occur.
518 */ 532 */
519 max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); 533 max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1));
520 534
521 /* 535 /*
522 * The actual maximum number of cycles we can defer the clocksource is 536 * The actual maximum number of cycles we can defer the clocksource is
523 * determined by the minimum of max_cycles and cs->mask. 537 * determined by the minimum of max_cycles and cs->mask.
538 * Note: Here we subtract the maxadj to make sure we don't sleep for
539 * too long if there's a large negative adjustment.
524 */ 540 */
525 max_cycles = min_t(u64, max_cycles, (u64) cs->mask); 541 max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
526 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); 542 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj,
543 cs->shift);
527 544
528 /* 545 /*
529 * To ensure that the clocksource does not wrap whilst we are idle, 546 * To ensure that the clocksource does not wrap whilst we are idle,
@@ -531,7 +548,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
531 * note a margin of 12.5% is used because this can be computed with 548 * note a margin of 12.5% is used because this can be computed with
532 * a shift, versus say 10% which would require division. 549 * a shift, versus say 10% which would require division.
533 */ 550 */
534 return max_nsecs - (max_nsecs >> 5); 551 return max_nsecs - (max_nsecs >> 3);
535} 552}
536 553
537#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET 554#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
@@ -630,7 +647,7 @@ static void clocksource_enqueue(struct clocksource *cs)
630 647
631/** 648/**
632 * __clocksource_updatefreq_scale - Used update clocksource with new freq 649 * __clocksource_updatefreq_scale - Used update clocksource with new freq
633 * @t: clocksource to be registered 650 * @cs: clocksource to be registered
634 * @scale: Scale factor multiplied against freq to get clocksource hz 651 * @scale: Scale factor multiplied against freq to get clocksource hz
635 * @freq: clocksource frequency (cycles per second) divided by scale 652 * @freq: clocksource frequency (cycles per second) divided by scale
636 * 653 *
@@ -642,7 +659,6 @@ static void clocksource_enqueue(struct clocksource *cs)
642void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) 659void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
643{ 660{
644 u64 sec; 661 u64 sec;
645
646 /* 662 /*
647 * Calc the maximum number of seconds which we can run before 663 * Calc the maximum number of seconds which we can run before
648 * wrapping around. For clocksources which have a mask > 32bit 664 * wrapping around. For clocksources which have a mask > 32bit
@@ -653,7 +669,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
653 * ~ 0.06ppm granularity for NTP. We apply the same 12.5% 669 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
654 * margin as we do in clocksource_max_deferment() 670 * margin as we do in clocksource_max_deferment()
655 */ 671 */
656 sec = (cs->mask - (cs->mask >> 5)); 672 sec = (cs->mask - (cs->mask >> 3));
657 do_div(sec, freq); 673 do_div(sec, freq);
658 do_div(sec, scale); 674 do_div(sec, scale);
659 if (!sec) 675 if (!sec)
@@ -663,13 +679,27 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
663 679
664 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 680 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
665 NSEC_PER_SEC / scale, sec * scale); 681 NSEC_PER_SEC / scale, sec * scale);
682
683 /*
 684 * For clocksources that have large mults, avoid overflow:
 685 * since mult may be adjusted by NTP, add an extra safety margin.
686 *
687 */
688 cs->maxadj = clocksource_max_adjustment(cs);
689 while ((cs->mult + cs->maxadj < cs->mult)
690 || (cs->mult - cs->maxadj > cs->mult)) {
691 cs->mult >>= 1;
692 cs->shift--;
693 cs->maxadj = clocksource_max_adjustment(cs);
694 }
695
666 cs->max_idle_ns = clocksource_max_deferment(cs); 696 cs->max_idle_ns = clocksource_max_deferment(cs);
667} 697}
668EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); 698EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
669 699
670/** 700/**
671 * __clocksource_register_scale - Used to install new clocksources 701 * __clocksource_register_scale - Used to install new clocksources
672 * @t: clocksource to be registered 702 * @cs: clocksource to be registered
673 * @scale: Scale factor multiplied against freq to get clocksource hz 703 * @scale: Scale factor multiplied against freq to get clocksource hz
674 * @freq: clocksource frequency (cycles per second) divided by scale 704 * @freq: clocksource frequency (cycles per second) divided by scale
675 * 705 *
@@ -697,12 +727,18 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale);
697 727
698/** 728/**
699 * clocksource_register - Used to install new clocksources 729 * clocksource_register - Used to install new clocksources
700 * @t: clocksource to be registered 730 * @cs: clocksource to be registered
701 * 731 *
702 * Returns -EBUSY if registration fails, zero otherwise. 732 * Returns -EBUSY if registration fails, zero otherwise.
703 */ 733 */
704int clocksource_register(struct clocksource *cs) 734int clocksource_register(struct clocksource *cs)
705{ 735{
736 /* calculate max adjustment for given mult/shift */
737 cs->maxadj = clocksource_max_adjustment(cs);
738 WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
739 "Clocksource %s might overflow on 11%% adjustment\n",
740 cs->name);
741
706 /* calculate max idle time permitted for this clocksource */ 742 /* calculate max idle time permitted for this clocksource */
707 cs->max_idle_ns = clocksource_max_deferment(cs); 743 cs->max_idle_ns = clocksource_max_deferment(cs);
708 744
@@ -725,6 +761,8 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)
725 761
726/** 762/**
727 * clocksource_change_rating - Change the rating of a registered clocksource 763 * clocksource_change_rating - Change the rating of a registered clocksource
764 * @cs: clocksource to be changed
765 * @rating: new rating
728 */ 766 */
729void clocksource_change_rating(struct clocksource *cs, int rating) 767void clocksource_change_rating(struct clocksource *cs, int rating)
730{ 768{
@@ -736,6 +774,7 @@ EXPORT_SYMBOL(clocksource_change_rating);
736 774
737/** 775/**
738 * clocksource_unregister - remove a registered clocksource 776 * clocksource_unregister - remove a registered clocksource
777 * @cs: clocksource to be unregistered
739 */ 778 */
740void clocksource_unregister(struct clocksource *cs) 779void clocksource_unregister(struct clocksource *cs)
741{ 780{
@@ -751,13 +790,14 @@ EXPORT_SYMBOL(clocksource_unregister);
751/** 790/**
752 * sysfs_show_current_clocksources - sysfs interface for current clocksource 791 * sysfs_show_current_clocksources - sysfs interface for current clocksource
753 * @dev: unused 792 * @dev: unused
793 * @attr: unused
754 * @buf: char buffer to be filled with clocksource list 794 * @buf: char buffer to be filled with clocksource list
755 * 795 *
756 * Provides sysfs interface for listing current clocksource. 796 * Provides sysfs interface for listing current clocksource.
757 */ 797 */
758static ssize_t 798static ssize_t
759sysfs_show_current_clocksources(struct sys_device *dev, 799sysfs_show_current_clocksources(struct device *dev,
760 struct sysdev_attribute *attr, char *buf) 800 struct device_attribute *attr, char *buf)
761{ 801{
762 ssize_t count = 0; 802 ssize_t count = 0;
763 803
@@ -771,14 +811,15 @@ sysfs_show_current_clocksources(struct sys_device *dev,
771/** 811/**
772 * sysfs_override_clocksource - interface for manually overriding clocksource 812 * sysfs_override_clocksource - interface for manually overriding clocksource
773 * @dev: unused 813 * @dev: unused
814 * @attr: unused
774 * @buf: name of override clocksource 815 * @buf: name of override clocksource
775 * @count: length of buffer 816 * @count: length of buffer
776 * 817 *
777 * Takes input from sysfs interface for manually overriding the default 818 * Takes input from sysfs interface for manually overriding the default
778 * clocksource selection. 819 * clocksource selection.
779 */ 820 */
780static ssize_t sysfs_override_clocksource(struct sys_device *dev, 821static ssize_t sysfs_override_clocksource(struct device *dev,
781 struct sysdev_attribute *attr, 822 struct device_attribute *attr,
782 const char *buf, size_t count) 823 const char *buf, size_t count)
783{ 824{
784 size_t ret = count; 825 size_t ret = count;
@@ -806,13 +847,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
806/** 847/**
807 * sysfs_show_available_clocksources - sysfs interface for listing clocksource 848 * sysfs_show_available_clocksources - sysfs interface for listing clocksource
808 * @dev: unused 849 * @dev: unused
850 * @attr: unused
809 * @buf: char buffer to be filled with clocksource list 851 * @buf: char buffer to be filled with clocksource list
810 * 852 *
811 * Provides sysfs interface for listing registered clocksources 853 * Provides sysfs interface for listing registered clocksources
812 */ 854 */
813static ssize_t 855static ssize_t
814sysfs_show_available_clocksources(struct sys_device *dev, 856sysfs_show_available_clocksources(struct device *dev,
815 struct sysdev_attribute *attr, 857 struct device_attribute *attr,
816 char *buf) 858 char *buf)
817{ 859{
818 struct clocksource *src; 860 struct clocksource *src;
@@ -841,35 +883,36 @@ sysfs_show_available_clocksources(struct sys_device *dev,
841/* 883/*
842 * Sysfs setup bits: 884 * Sysfs setup bits:
843 */ 885 */
844static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, 886static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
845 sysfs_override_clocksource); 887 sysfs_override_clocksource);
846 888
847static SYSDEV_ATTR(available_clocksource, 0444, 889static DEVICE_ATTR(available_clocksource, 0444,
848 sysfs_show_available_clocksources, NULL); 890 sysfs_show_available_clocksources, NULL);
849 891
850static struct sysdev_class clocksource_sysclass = { 892static struct bus_type clocksource_subsys = {
851 .name = "clocksource", 893 .name = "clocksource",
894 .dev_name = "clocksource",
852}; 895};
853 896
854static struct sys_device device_clocksource = { 897static struct device device_clocksource = {
855 .id = 0, 898 .id = 0,
856 .cls = &clocksource_sysclass, 899 .bus = &clocksource_subsys,
857}; 900};
858 901
859static int __init init_clocksource_sysfs(void) 902static int __init init_clocksource_sysfs(void)
860{ 903{
861 int error = sysdev_class_register(&clocksource_sysclass); 904 int error = subsys_system_register(&clocksource_subsys, NULL);
862 905
863 if (!error) 906 if (!error)
864 error = sysdev_register(&device_clocksource); 907 error = device_register(&device_clocksource);
865 if (!error) 908 if (!error)
866 error = sysdev_create_file( 909 error = device_create_file(
867 &device_clocksource, 910 &device_clocksource,
868 &attr_current_clocksource); 911 &dev_attr_current_clocksource);
869 if (!error) 912 if (!error)
870 error = sysdev_create_file( 913 error = device_create_file(
871 &device_clocksource, 914 &device_clocksource,
872 &attr_available_clocksource); 915 &dev_attr_available_clocksource);
873 return error; 916 return error;
874} 917}
875 918
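
The clocksource block above follows the same recipe as the rest of the driver-core conversion in this series: SYSDEV_ATTR becomes DEVICE_ATTR, the old sysdev_class becomes a bus_type registered with subsys_system_register(), and the attribute files are created with device_create_file(). A minimal sketch of that recipe for a hypothetical "foo" subsystem (all names here are illustrative, not from the patch):

/* Hypothetical "foo" subsystem registered through the regular driver core. */
#include <linux/device.h>
#include <linux/init.h>
#include <linux/kernel.h>

static ssize_t foo_value_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "hello\n");		/* placeholder payload */
}
static DEVICE_ATTR(value, 0444, foo_value_show, NULL);

static struct bus_type foo_subsys = {
	.name = "foo",
	.dev_name = "foo",
};

static struct device device_foo = {
	.id = 0,
	.bus = &foo_subsys,
};

static int __init foo_sysfs_init(void)
{
	int error = subsys_system_register(&foo_subsys, NULL);

	if (!error)
		error = device_register(&device_foo);
	if (!error)
		error = device_create_file(&device_foo, &dev_attr_value);
	return error;
}
device_initcall(foo_sysfs_init);

With a registration like this the attribute should show up under /sys/devices/system/foo/foo0/value, which is why the existing /sys/devices/system/clocksource/clocksource0/* paths keep working after the conversion above.
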
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index c340ca658f37..ce033c7aa2e8 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -18,6 +18,7 @@
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20#include <linux/device.h> 20#include <linux/device.h>
21#include <linux/export.h>
21#include <linux/file.h> 22#include <linux/file.h>
22#include <linux/posix-clock.h> 23#include <linux/posix-clock.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index c7218d132738..fd4a7b1625a2 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -71,7 +71,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
71 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 71 (dev->features & CLOCK_EVT_FEAT_C3STOP))
72 return 0; 72 return 0;
73 73
74 clockevents_exchange_device(NULL, dev); 74 clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
75 tick_broadcast_device.evtdev = dev; 75 tick_broadcast_device.evtdev = dev;
76 if (!cpumask_empty(tick_get_broadcast_mask())) 76 if (!cpumask_empty(tick_get_broadcast_mask()))
77 tick_broadcast_start_periodic(dev); 77 tick_broadcast_start_periodic(dev);
@@ -194,7 +194,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
194 for (next = dev->next_event; ;) { 194 for (next = dev->next_event; ;) {
195 next = ktime_add(next, tick_period); 195 next = ktime_add(next, tick_period);
196 196
197 if (!clockevents_program_event(dev, next, ktime_get())) 197 if (!clockevents_program_event(dev, next, false))
198 return; 198 return;
199 tick_do_periodic_broadcast(); 199 tick_do_periodic_broadcast();
200 } 200 }
@@ -373,7 +373,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
373{ 373{
374 struct clock_event_device *bc = tick_broadcast_device.evtdev; 374 struct clock_event_device *bc = tick_broadcast_device.evtdev;
375 375
376 return tick_dev_program_event(bc, expires, force); 376 return clockevents_program_event(bc, expires, force);
377} 377}
378 378
379int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 379int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 119528de8235..da6c9ecad4e4 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -94,7 +94,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
94 */ 94 */
95 next = ktime_add(dev->next_event, tick_period); 95 next = ktime_add(dev->next_event, tick_period);
96 for (;;) { 96 for (;;) {
97 if (!clockevents_program_event(dev, next, ktime_get())) 97 if (!clockevents_program_event(dev, next, false))
98 return; 98 return;
99 /* 99 /*
100 * Have to be careful here. If we're in oneshot mode, 100 * Have to be careful here. If we're in oneshot mode,
@@ -137,7 +137,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
138 138
139 for (;;) { 139 for (;;) {
140 if (!clockevents_program_event(dev, next, ktime_get())) 140 if (!clockevents_program_event(dev, next, false))
141 return; 141 return;
142 next = ktime_add(next, tick_period); 142 next = ktime_add(next, tick_period);
143 } 143 }
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 1009b06d6f89..4e265b901fed 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -26,8 +26,6 @@ extern void clockevents_shutdown(struct clock_event_device *dev);
26extern void tick_setup_oneshot(struct clock_event_device *newdev, 26extern void tick_setup_oneshot(struct clock_event_device *newdev,
27 void (*handler)(struct clock_event_device *), 27 void (*handler)(struct clock_event_device *),
28 ktime_t nextevt); 28 ktime_t nextevt);
29extern int tick_dev_program_event(struct clock_event_device *dev,
30 ktime_t expires, int force);
31extern int tick_program_event(ktime_t expires, int force); 29extern int tick_program_event(ktime_t expires, int force);
32extern void tick_oneshot_notify(void); 30extern void tick_oneshot_notify(void);
33extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); 31extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2d04411a5f05..824109060a33 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -21,74 +21,6 @@
21 21
22#include "tick-internal.h" 22#include "tick-internal.h"
23 23
24/* Limit min_delta to a jiffie */
25#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
26
27static int tick_increase_min_delta(struct clock_event_device *dev)
28{
29 /* Nothing to do if we already reached the limit */
30 if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
31 return -ETIME;
32
33 if (dev->min_delta_ns < 5000)
34 dev->min_delta_ns = 5000;
35 else
36 dev->min_delta_ns += dev->min_delta_ns >> 1;
37
38 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
39 dev->min_delta_ns = MIN_DELTA_LIMIT;
40
41 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
42 dev->name ? dev->name : "?",
43 (unsigned long long) dev->min_delta_ns);
44 return 0;
45}
46
47/**
48 * tick_program_event internal worker function
49 */
50int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
51 int force)
52{
53 ktime_t now = ktime_get();
54 int i;
55
56 for (i = 0;;) {
57 int ret = clockevents_program_event(dev, expires, now);
58
59 if (!ret || !force)
60 return ret;
61
62 dev->retries++;
63 /*
64 * We tried 3 times to program the device with the given
65 * min_delta_ns. If that's not working then we increase it
66 * and emit a warning.
67 */
68 if (++i > 2) {
69 /* Increase the min. delta and try again */
70 if (tick_increase_min_delta(dev)) {
71 /*
72 * Get out of the loop if min_delta_ns
73 * hit the limit already. That's
74 * better than staying here forever.
75 *
76 * We clear next_event so we have a
77 * chance that the box survives.
78 */
79 printk(KERN_WARNING
80 "CE: Reprogramming failure. Giving up\n");
81 dev->next_event.tv64 = KTIME_MAX;
82 return -ETIME;
83 }
84 i = 0;
85 }
86
87 now = ktime_get();
88 expires = ktime_add_ns(now, dev->min_delta_ns);
89 }
90}
91
92/** 24/**
93 * tick_program_event 25 * tick_program_event
94 */ 26 */
@@ -96,7 +28,7 @@ int tick_program_event(ktime_t expires, int force)
96{ 28{
97 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 29 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
98 30
99 return tick_dev_program_event(dev, expires, force); 31 return clockevents_program_event(dev, expires, force);
100} 32}
101 33
102/** 34/**
@@ -104,11 +36,10 @@ int tick_program_event(ktime_t expires, int force)
104 */ 36 */
105void tick_resume_oneshot(void) 37void tick_resume_oneshot(void)
106{ 38{
107 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 39 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
108 struct clock_event_device *dev = td->evtdev;
109 40
110 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 41 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
111 tick_program_event(ktime_get(), 1); 42 clockevents_program_event(dev, ktime_get(), true);
112} 43}
113 44
114/** 45/**
@@ -120,7 +51,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
120{ 51{
121 newdev->event_handler = handler; 52 newdev->event_handler = handler;
122 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); 53 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
123 tick_dev_program_event(newdev, next_event, 1); 54 clockevents_program_event(newdev, next_event, true);
124} 55}
125 56
126/** 57/**
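
The block removed above is the reprogramming retry loop: try clockevents_program_event() up to three times, then grow the device's min_delta_ns (jump to 5 us, then +50% per step) until it hits the one-jiffy limit, giving up only when even that fails. That logic now lives in the clockevents core, which is why the tick code can call clockevents_program_event() directly. A stand-alone sketch of how the escalation converges, with an assumed HZ of 250 and an illustrative 1 us starting value:

/* User-space simulation of the min_delta_ns escalation removed above. */
#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL
#define HZ		250				/* assumed for the example */
#define MIN_DELTA_LIMIT	(NSEC_PER_SEC / HZ)		/* limit min_delta to a jiffy */

int main(void)
{
	unsigned long long min_delta_ns = 1000;		/* device claims 1 us */

	while (min_delta_ns < MIN_DELTA_LIMIT) {
		if (min_delta_ns < 5000)
			min_delta_ns = 5000;
		else
			min_delta_ns += min_delta_ns >> 1;	/* grow by 50% */
		if (min_delta_ns > MIN_DELTA_LIMIT)
			min_delta_ns = MIN_DELTA_LIMIT;
		printf("min_delta_ns increased to %llu nsec\n", min_delta_ns);
	}
	return 0;
}
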
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d5097c44b407..7656642e4b8e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -139,7 +139,6 @@ static void tick_nohz_update_jiffies(ktime_t now)
139 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 139 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
140 unsigned long flags; 140 unsigned long flags;
141 141
142 cpumask_clear_cpu(cpu, nohz_cpu_mask);
143 ts->idle_waketime = now; 142 ts->idle_waketime = now;
144 143
145 local_irq_save(flags); 144 local_irq_save(flags);
@@ -159,9 +158,10 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
159 158
160 if (ts->idle_active) { 159 if (ts->idle_active) {
161 delta = ktime_sub(now, ts->idle_entrytime); 160 delta = ktime_sub(now, ts->idle_entrytime);
162 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
163 if (nr_iowait_cpu(cpu) > 0) 161 if (nr_iowait_cpu(cpu) > 0)
164 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); 162 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
163 else
164 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
165 ts->idle_entrytime = now; 165 ts->idle_entrytime = now;
166 } 166 }
167 167
@@ -197,11 +197,11 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
197/** 197/**
198 * get_cpu_idle_time_us - get the total idle time of a cpu 198 * get_cpu_idle_time_us - get the total idle time of a cpu
199 * @cpu: CPU number to query 199 * @cpu: CPU number to query
200 * @last_update_time: variable to store update time in 200 * @last_update_time: variable to store update time in. Do not update
201 * counters if NULL.
201 * 202 *
202 * Return the cumulative idle time (since boot) for a given 203 * Return the cumulative idle time (since boot) for a given
203 * CPU, in microseconds. The idle time returned includes 204 * CPU, in microseconds.
204 * the iowait time (unlike what "top" and co report).
205 * 205 *
206 * This time is measured via accounting rather than sampling, 206 * This time is measured via accounting rather than sampling,
207 * and is as accurate as ktime_get() is. 207 * and is as accurate as ktime_get() is.
@@ -211,20 +211,35 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
211u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 211u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
212{ 212{
213 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 213 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
214 ktime_t now, idle;
214 215
215 if (!tick_nohz_enabled) 216 if (!tick_nohz_enabled)
216 return -1; 217 return -1;
217 218
218 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); 219 now = ktime_get();
220 if (last_update_time) {
221 update_ts_time_stats(cpu, ts, now, last_update_time);
222 idle = ts->idle_sleeptime;
223 } else {
224 if (ts->idle_active && !nr_iowait_cpu(cpu)) {
225 ktime_t delta = ktime_sub(now, ts->idle_entrytime);
226
227 idle = ktime_add(ts->idle_sleeptime, delta);
228 } else {
229 idle = ts->idle_sleeptime;
230 }
231 }
232
233 return ktime_to_us(idle);
219 234
220 return ktime_to_us(ts->idle_sleeptime);
221} 235}
222EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 236EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
223 237
224/* 238/**
225 * get_cpu_iowait_time_us - get the total iowait time of a cpu 239 * get_cpu_iowait_time_us - get the total iowait time of a cpu
226 * @cpu: CPU number to query 240 * @cpu: CPU number to query
227 * @last_update_time: variable to store update time in 241 * @last_update_time: variable to store update time in. Do not update
242 * counters if NULL.
228 * 243 *
229 * Return the cumulative iowait time (since boot) for a given 244 * Return the cumulative iowait time (since boot) for a given
230 * CPU, in microseconds. 245 * CPU, in microseconds.
@@ -237,52 +252,40 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
237u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) 252u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
238{ 253{
239 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 254 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
255 ktime_t now, iowait;
240 256
241 if (!tick_nohz_enabled) 257 if (!tick_nohz_enabled)
242 return -1; 258 return -1;
243 259
244 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); 260 now = ktime_get();
261 if (last_update_time) {
262 update_ts_time_stats(cpu, ts, now, last_update_time);
263 iowait = ts->iowait_sleeptime;
264 } else {
265 if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
266 ktime_t delta = ktime_sub(now, ts->idle_entrytime);
245 267
246 return ktime_to_us(ts->iowait_sleeptime); 268 iowait = ktime_add(ts->iowait_sleeptime, delta);
269 } else {
270 iowait = ts->iowait_sleeptime;
271 }
272 }
273
274 return ktime_to_us(iowait);
247} 275}
248EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 276EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
249 277
250/** 278static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
251 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
252 *
253 * When the next event is more than a tick into the future, stop the idle tick
254 * Called either from the idle loop or from irq_exit() when an idle period was
255 * just interrupted by an interrupt which did not cause a reschedule.
256 */
257void tick_nohz_stop_sched_tick(int inidle)
258{ 279{
259 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 280 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
260 struct tick_sched *ts;
261 ktime_t last_update, expires, now; 281 ktime_t last_update, expires, now;
262 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 282 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
263 u64 time_delta; 283 u64 time_delta;
264 int cpu; 284 int cpu;
265 285
266 local_irq_save(flags);
267
268 cpu = smp_processor_id(); 286 cpu = smp_processor_id();
269 ts = &per_cpu(tick_cpu_sched, cpu); 287 ts = &per_cpu(tick_cpu_sched, cpu);
270 288
271 /*
272 * Call to tick_nohz_start_idle stops the last_update_time from being
273 * updated. Thus, it must not be called in the event we are called from
274 * irq_exit() with the prior state different than idle.
275 */
276 if (!inidle && !ts->inidle)
277 goto end;
278
279 /*
280 * Set ts->inidle unconditionally. Even if the system did not
281 * switch to NOHZ mode the cpu frequency governers rely on the
282 * update of the idle time accounting in tick_nohz_start_idle().
283 */
284 ts->inidle = 1;
285
286 now = tick_nohz_start_idle(cpu, ts); 289 now = tick_nohz_start_idle(cpu, ts);
287 290
288 /* 291 /*
@@ -298,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle)
298 } 301 }
299 302
300 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 303 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
301 goto end; 304 return;
302 305
303 if (need_resched()) 306 if (need_resched())
304 goto end; 307 return;
305 308
306 if (unlikely(local_softirq_pending() && cpu_online(cpu))) { 309 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
307 static int ratelimit; 310 static int ratelimit;
@@ -311,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle)
311 (unsigned int) local_softirq_pending()); 314 (unsigned int) local_softirq_pending());
312 ratelimit++; 315 ratelimit++;
313 } 316 }
314 goto end; 317 return;
315 } 318 }
316 319
317 ts->idle_calls++; 320 ts->idle_calls++;
@@ -389,9 +392,6 @@ void tick_nohz_stop_sched_tick(int inidle)
389 else 392 else
390 expires.tv64 = KTIME_MAX; 393 expires.tv64 = KTIME_MAX;
391 394
392 if (delta_jiffies > 1)
393 cpumask_set_cpu(cpu, nohz_cpu_mask);
394
395 /* Skip reprogram of event if its not changed */ 395 /* Skip reprogram of event if its not changed */
396 if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) 396 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
397 goto out; 397 goto out;
@@ -409,7 +409,6 @@ void tick_nohz_stop_sched_tick(int inidle)
409 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 409 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
410 ts->tick_stopped = 1; 410 ts->tick_stopped = 1;
411 ts->idle_jiffies = last_jiffies; 411 ts->idle_jiffies = last_jiffies;
412 rcu_enter_nohz();
413 } 412 }
414 413
415 ts->idle_sleeps++; 414 ts->idle_sleeps++;
@@ -441,15 +440,70 @@ void tick_nohz_stop_sched_tick(int inidle)
441 * softirq. 440 * softirq.
442 */ 441 */
443 tick_do_update_jiffies64(ktime_get()); 442 tick_do_update_jiffies64(ktime_get());
444 cpumask_clear_cpu(cpu, nohz_cpu_mask);
445 } 443 }
446 raise_softirq_irqoff(TIMER_SOFTIRQ); 444 raise_softirq_irqoff(TIMER_SOFTIRQ);
447out: 445out:
448 ts->next_jiffies = next_jiffies; 446 ts->next_jiffies = next_jiffies;
449 ts->last_jiffies = last_jiffies; 447 ts->last_jiffies = last_jiffies;
450 ts->sleep_length = ktime_sub(dev->next_event, now); 448 ts->sleep_length = ktime_sub(dev->next_event, now);
451end: 449}
452 local_irq_restore(flags); 450
451/**
452 * tick_nohz_idle_enter - stop the idle tick from the idle task
453 *
454 * When the next event is more than a tick into the future, stop the idle tick.
455 * Called when we start the idle loop.
456 *
457 * The arch is responsible for calling:
458 *
459 * - rcu_idle_enter() after its last use of RCU before the CPU is put
460 * to sleep.
461 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
462 */
463void tick_nohz_idle_enter(void)
464{
465 struct tick_sched *ts;
466
467 WARN_ON_ONCE(irqs_disabled());
468
469 /*
470 * Update the idle state in the scheduler domain hierarchy
471 * when tick_nohz_stop_sched_tick() is called from the idle loop.
472 * State will be updated to busy during the first busy tick after
473 * exiting idle.
474 */
475 set_cpu_sd_state_idle();
476
477 local_irq_disable();
478
479 ts = &__get_cpu_var(tick_cpu_sched);
480 /*
481 * Set ts->inidle unconditionally. Even if the system did not
482 * switch to NOHZ mode the cpu frequency governors rely on the
483 * update of the idle time accounting in tick_nohz_start_idle().
484 */
485 ts->inidle = 1;
486 tick_nohz_stop_sched_tick(ts);
487
488 local_irq_enable();
489}
490
491/**
492 * tick_nohz_irq_exit - update next tick event from interrupt exit
493 *
494 * When an interrupt fires while we are idle and it doesn't cause
495 * a reschedule, it may still add, modify or delete a timer, enqueue
496 * an RCU callback, etc...
497 * So we need to re-calculate and reprogram the next tick event.
498 */
499void tick_nohz_irq_exit(void)
500{
501 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
502
503 if (!ts->inidle)
504 return;
505
506 tick_nohz_stop_sched_tick(ts);
453} 507}
454 508
455/** 509/**
@@ -491,11 +545,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
491} 545}
492 546
493/** 547/**
494 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task 548 * tick_nohz_idle_exit - restart the idle tick from the idle task
495 * 549 *
496 * Restart the idle tick when the CPU is woken up from idle 550 * Restart the idle tick when the CPU is woken up from idle
551 * This also exits the RCU extended quiescent state. The CPU
552 * can use RCU again after this function is called.
497 */ 553 */
498void tick_nohz_restart_sched_tick(void) 554void tick_nohz_idle_exit(void)
499{ 555{
500 int cpu = smp_processor_id(); 556 int cpu = smp_processor_id();
501 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 557 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -505,6 +561,7 @@ void tick_nohz_restart_sched_tick(void)
505 ktime_t now; 561 ktime_t now;
506 562
507 local_irq_disable(); 563 local_irq_disable();
564
508 if (ts->idle_active || (ts->inidle && ts->tick_stopped)) 565 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
509 now = ktime_get(); 566 now = ktime_get();
510 567
@@ -519,12 +576,9 @@ void tick_nohz_restart_sched_tick(void)
519 576
520 ts->inidle = 0; 577 ts->inidle = 0;
521 578
522 rcu_exit_nohz();
523
524 /* Update jiffies first */ 579 /* Update jiffies first */
525 select_nohz_load_balancer(0); 580 select_nohz_load_balancer(0);
526 tick_do_update_jiffies64(now); 581 tick_do_update_jiffies64(now);
527 cpumask_clear_cpu(cpu, nohz_cpu_mask);
528 582
529#ifndef CONFIG_VIRT_CPU_ACCOUNTING 583#ifndef CONFIG_VIRT_CPU_ACCOUNTING
530 /* 584 /*
@@ -640,8 +694,6 @@ static void tick_nohz_switch_to_nohz(void)
640 next = ktime_add(next, tick_period); 694 next = ktime_add(next, tick_period);
641 } 695 }
642 local_irq_enable(); 696 local_irq_enable();
643
644 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
645} 697}
646 698
647/* 699/*
@@ -793,10 +845,8 @@ void tick_setup_sched_timer(void)
793 } 845 }
794 846
795#ifdef CONFIG_NO_HZ 847#ifdef CONFIG_NO_HZ
796 if (tick_nohz_enabled) { 848 if (tick_nohz_enabled)
797 ts->nohz_mode = NOHZ_MODE_HIGHRES; 849 ts->nohz_mode = NOHZ_MODE_HIGHRES;
798 printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
799 }
800#endif 850#endif
801} 851}
802#endif /* HIGH_RES_TIMERS */ 852#endif /* HIGH_RES_TIMERS */
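
Taken together, the tick-sched changes above split the old tick_nohz_stop_sched_tick(inidle) / tick_nohz_restart_sched_tick() pair into tick_nohz_idle_enter(), tick_nohz_irq_exit() and tick_nohz_idle_exit(), and drop the rcu_enter_nohz()/rcu_exit_nohz() calls from this file: per the new kernel-doc, the architecture is now responsible for entering and leaving the RCU extended quiescent state around its low-power wait. A hedged sketch of what an arch idle loop is expected to look like with the new API (arch_cpu_sleep() is a placeholder for the architecture's wait instruction, and preemption handling is simplified):

/* Sketch only; real arch idle loops also handle polling, tracing, etc. */
#include <linux/tick.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static void arch_cpu_sleep(void);	/* placeholder low-power wait */

void cpu_idle(void)
{
	while (1) {
		tick_nohz_idle_enter();
		while (!need_resched()) {
			rcu_idle_enter();	/* last RCU use before sleeping */
			arch_cpu_sleep();
			rcu_idle_exit();	/* RCU usable again from here */
		}
		tick_nohz_idle_exit();
		schedule();
	}
}
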
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2b021b0e8507..0c6358186401 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void)
131 /* calculate the delta since the last update_wall_time: */ 131 /* calculate the delta since the last update_wall_time: */
132 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 132 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
133 133
134 /* return delta converted to nanoseconds using ntp adjusted mult. */ 134 /* return delta converted to nanoseconds. */
135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
136} 136}
137 137
@@ -249,6 +249,8 @@ ktime_t ktime_get(void)
249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec; 249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; 250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
251 nsecs += timekeeping_get_ns(); 251 nsecs += timekeeping_get_ns();
252 /* If arch requires, add in gettimeoffset() */
253 nsecs += arch_gettimeoffset();
252 254
253 } while (read_seqretry(&xtime_lock, seq)); 255 } while (read_seqretry(&xtime_lock, seq));
254 /* 256 /*
@@ -280,6 +282,8 @@ void ktime_get_ts(struct timespec *ts)
280 *ts = xtime; 282 *ts = xtime;
281 tomono = wall_to_monotonic; 283 tomono = wall_to_monotonic;
282 nsecs = timekeeping_get_ns(); 284 nsecs = timekeeping_get_ns();
285 /* If arch requires, add in gettimeoffset() */
286 nsecs += arch_gettimeoffset();
283 287
284 } while (read_seqretry(&xtime_lock, seq)); 288 } while (read_seqretry(&xtime_lock, seq));
285 289
@@ -802,14 +806,44 @@ static void timekeeping_adjust(s64 offset)
802 s64 error, interval = timekeeper.cycle_interval; 806 s64 error, interval = timekeeper.cycle_interval;
803 int adj; 807 int adj;
804 808
809 /*
810 * The point of this is to check if the error is greater than half
811 * an interval.
812 *
813 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
814 *
815 * Note we subtract one in the shift, so that error is really error*2.
816 * This "saves" dividing (shifting) interval twice, but keeps the
817 * (error > interval) comparison as still measuring if error is
818 * larger than half an interval.
819 *
820 * Note: It does not "save" on aggravation when reading the code.
821 */
805 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); 822 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
806 if (error > interval) { 823 if (error > interval) {
824 /*
825 * We now divide error by 4 (via shift), which checks if
826 * the error is greater than twice the interval.
827 * If it is greater, we need a bigadjust; if it is smaller,
828 * we can adjust by 1.
829 */
807 error >>= 2; 830 error >>= 2;
831 /*
832 * XXX - In update_wall_time, we round up to the next
833 * nanosecond, and store the amount rounded up into
834 * the error. This causes the likely below to be unlikely.
835 *
836 * The proper fix is to avoid rounding up by using
837 * the high precision timekeeper.xtime_nsec instead of
838 * xtime.tv_nsec everywhere. Fixing this will take some
839 * time.
840 */
808 if (likely(error <= interval)) 841 if (likely(error <= interval))
809 adj = 1; 842 adj = 1;
810 else 843 else
811 adj = timekeeping_bigadjust(error, &interval, &offset); 844 adj = timekeeping_bigadjust(error, &interval, &offset);
812 } else if (error < -interval) { 845 } else if (error < -interval) {
846 /* See comment above, this is just switched for the negative */
813 error >>= 2; 847 error >>= 2;
814 if (likely(error >= -interval)) { 848 if (likely(error >= -interval)) {
815 adj = -1; 849 adj = -1;
@@ -817,9 +851,65 @@ static void timekeeping_adjust(s64 offset)
817 offset = -offset; 851 offset = -offset;
818 } else 852 } else
819 adj = timekeeping_bigadjust(error, &interval, &offset); 853 adj = timekeeping_bigadjust(error, &interval, &offset);
820 } else 854 } else /* No adjustment needed */
821 return; 855 return;
822 856
857 WARN_ONCE(timekeeper.clock->maxadj &&
858 (timekeeper.mult + adj > timekeeper.clock->mult +
859 timekeeper.clock->maxadj),
860 "Adjusting %s more than 11%% (%ld vs %ld)\n",
861 timekeeper.clock->name, (long)timekeeper.mult + adj,
862 (long)timekeeper.clock->mult +
863 timekeeper.clock->maxadj);
864 /*
865 * So the following can be confusing.
866 *
867 * To keep things simple, let's assume adj == 1 for now.
868 *
869 * When adj != 1, remember that the interval and offset values
870 * have been appropriately scaled so the math is the same.
871 *
872 * The basic idea here is that we're increasing the multiplier
873 * by one; this causes the xtime_interval to be incremented by
874 * one cycle_interval. This is because:
875 * xtime_interval = cycle_interval * mult
876 * So if mult is being incremented by one:
877 * xtime_interval = cycle_interval * (mult + 1)
878 * It's the same as:
879 * xtime_interval = (cycle_interval * mult) + cycle_interval
880 * Which can be shortened to:
881 * xtime_interval += cycle_interval
882 *
883 * So offset stores the non-accumulated cycles. Thus the current
884 * time (in shifted nanoseconds) is:
885 * now = (offset * adj) + xtime_nsec
886 * Now, even though we're adjusting the clock frequency, we have
887 * to keep time consistent. In other words, we can't jump back
888 * in time, and we also want to avoid jumping forward in time.
889 *
890 * So given the same offset value, we need the time to be the same
891 * both before and after the freq adjustment.
892 * now = (offset * adj_1) + xtime_nsec_1
893 * now = (offset * adj_2) + xtime_nsec_2
894 * So:
895 * (offset * adj_1) + xtime_nsec_1 =
896 * (offset * adj_2) + xtime_nsec_2
897 * And we know:
898 * adj_2 = adj_1 + 1
899 * So:
900 * (offset * adj_1) + xtime_nsec_1 =
901 * (offset * (adj_1+1)) + xtime_nsec_2
902 * (offset * adj_1) + xtime_nsec_1 =
903 * (offset * adj_1) + offset + xtime_nsec_2
904 * Canceling the sides:
905 * xtime_nsec_1 = offset + xtime_nsec_2
906 * Which gives us:
907 * xtime_nsec_2 = xtime_nsec_1 - offset
908 * Which simplifies to:
909 * xtime_nsec -= offset
910 *
911 * XXX - TODO: Doc ntp_error calculation.
912 */
823 timekeeper.mult += adj; 913 timekeeper.mult += adj;
824 timekeeper.xtime_interval += interval; 914 timekeeper.xtime_interval += interval;
825 timekeeper.xtime_nsec -= offset; 915 timekeeper.xtime_nsec -= offset;
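
The long comment added to timekeeping_adjust() reduces to a single identity: when mult grows by adj, subtracting the (already adj-scaled) offset from xtime_nsec leaves offset * mult + xtime_nsec, and therefore the reported time, unchanged across the frequency adjustment. A stand-alone arithmetic check of that identity, with arbitrary example values rather than anything taken from the patch:

/* Verify: offset * mult + xtime_nsec is invariant under the adjustment. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	long long offset = 12345;	/* non-accumulated cycles (scaled) */
	long long mult = 1000;
	long long xtime_nsec = 987654321;
	long long adj = 1;

	long long before = offset * mult + xtime_nsec;

	mult += adj;			/* timekeeper.mult += adj */
	xtime_nsec -= offset * adj;	/* in the kernel this is just
					 * xtime_nsec -= offset, because offset
					 * already includes the adj scaling */

	long long after = offset * mult + xtime_nsec;

	assert(before == after);
	printf("time is continuous across the adjustment: %lld == %lld\n",
	       before, after);
	return 0;
}
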
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index a5d0a3a85dd8..0b537f27b559 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -81,7 +81,7 @@ struct entry {
81/* 81/*
82 * Spinlock protecting the tables - not taken during lookup: 82 * Spinlock protecting the tables - not taken during lookup:
83 */ 83 */
84static DEFINE_SPINLOCK(table_lock); 84static DEFINE_RAW_SPINLOCK(table_lock);
85 85
86/* 86/*
87 * Per-CPU lookup locks for fast hash lookup: 87 * Per-CPU lookup locks for fast hash lookup:
@@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
188 prev = NULL; 188 prev = NULL;
189 curr = *head; 189 curr = *head;
190 190
191 spin_lock(&table_lock); 191 raw_spin_lock(&table_lock);
192 /* 192 /*
193 * Make sure we have not raced with another CPU: 193 * Make sure we have not raced with another CPU:
194 */ 194 */
@@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
215 *head = curr; 215 *head = curr;
216 } 216 }
217 out_unlock: 217 out_unlock:
218 spin_unlock(&table_lock); 218 raw_spin_unlock(&table_lock);
219 219
220 return curr; 220 return curr;
221} 221}
diff --git a/kernel/timer.c b/kernel/timer.c
index 8cff36119e4d..a297ffcf888e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -20,7 +20,7 @@
20 */ 20 */
21 21
22#include <linux/kernel_stat.h> 22#include <linux/kernel_stat.h>
23#include <linux/module.h> 23#include <linux/export.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/percpu.h> 25#include <linux/percpu.h>
26#include <linux/init.h> 26#include <linux/init.h>
@@ -427,6 +427,12 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
427 } 427 }
428} 428}
429 429
430/* Stub timer callback for improperly used timers. */
431static void stub_timer(unsigned long data)
432{
433 WARN_ON(1);
434}
435
430/* 436/*
431 * fixup_activate is called when: 437 * fixup_activate is called when:
432 * - an active object is activated 438 * - an active object is activated
@@ -450,7 +456,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
450 debug_object_activate(timer, &timer_debug_descr); 456 debug_object_activate(timer, &timer_debug_descr);
451 return 0; 457 return 0;
452 } else { 458 } else {
453 WARN_ON_ONCE(1); 459 setup_timer(timer, stub_timer, 0);
460 return 1;
454 } 461 }
455 return 0; 462 return 0;
456 463
@@ -480,12 +487,40 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
480 } 487 }
481} 488}
482 489
490/*
491 * fixup_assert_init is called when:
492 * - an untracked/uninit-ed object is found
493 */
494static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
495{
496 struct timer_list *timer = addr;
497
498 switch (state) {
499 case ODEBUG_STATE_NOTAVAILABLE:
500 if (timer->entry.prev == TIMER_ENTRY_STATIC) {
501 /*
502 * This is not really a fixup. The timer was
503 * statically initialized. We just make sure that it
504 * is tracked in the object tracker.
505 */
506 debug_object_init(timer, &timer_debug_descr);
507 return 0;
508 } else {
509 setup_timer(timer, stub_timer, 0);
510 return 1;
511 }
512 default:
513 return 0;
514 }
515}
516
483static struct debug_obj_descr timer_debug_descr = { 517static struct debug_obj_descr timer_debug_descr = {
484 .name = "timer_list", 518 .name = "timer_list",
485 .debug_hint = timer_debug_hint, 519 .debug_hint = timer_debug_hint,
486 .fixup_init = timer_fixup_init, 520 .fixup_init = timer_fixup_init,
487 .fixup_activate = timer_fixup_activate, 521 .fixup_activate = timer_fixup_activate,
488 .fixup_free = timer_fixup_free, 522 .fixup_free = timer_fixup_free,
523 .fixup_assert_init = timer_fixup_assert_init,
489}; 524};
490 525
491static inline void debug_timer_init(struct timer_list *timer) 526static inline void debug_timer_init(struct timer_list *timer)
@@ -508,6 +543,11 @@ static inline void debug_timer_free(struct timer_list *timer)
508 debug_object_free(timer, &timer_debug_descr); 543 debug_object_free(timer, &timer_debug_descr);
509} 544}
510 545
546static inline void debug_timer_assert_init(struct timer_list *timer)
547{
548 debug_object_assert_init(timer, &timer_debug_descr);
549}
550
511static void __init_timer(struct timer_list *timer, 551static void __init_timer(struct timer_list *timer,
512 const char *name, 552 const char *name,
513 struct lock_class_key *key); 553 struct lock_class_key *key);
@@ -531,6 +571,7 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
531static inline void debug_timer_init(struct timer_list *timer) { } 571static inline void debug_timer_init(struct timer_list *timer) { }
532static inline void debug_timer_activate(struct timer_list *timer) { } 572static inline void debug_timer_activate(struct timer_list *timer) { }
533static inline void debug_timer_deactivate(struct timer_list *timer) { } 573static inline void debug_timer_deactivate(struct timer_list *timer) { }
574static inline void debug_timer_assert_init(struct timer_list *timer) { }
534#endif 575#endif
535 576
536static inline void debug_init(struct timer_list *timer) 577static inline void debug_init(struct timer_list *timer)
@@ -552,6 +593,11 @@ static inline void debug_deactivate(struct timer_list *timer)
552 trace_timer_cancel(timer); 593 trace_timer_cancel(timer);
553} 594}
554 595
596static inline void debug_assert_init(struct timer_list *timer)
597{
598 debug_timer_assert_init(timer);
599}
600
555static void __init_timer(struct timer_list *timer, 601static void __init_timer(struct timer_list *timer,
556 const char *name, 602 const char *name,
557 struct lock_class_key *key) 603 struct lock_class_key *key)
@@ -902,6 +948,8 @@ int del_timer(struct timer_list *timer)
902 unsigned long flags; 948 unsigned long flags;
903 int ret = 0; 949 int ret = 0;
904 950
951 debug_assert_init(timer);
952
905 timer_stats_timer_clear_start_info(timer); 953 timer_stats_timer_clear_start_info(timer);
906 if (timer_pending(timer)) { 954 if (timer_pending(timer)) {
907 base = lock_timer_base(timer, &flags); 955 base = lock_timer_base(timer, &flags);
@@ -932,6 +980,8 @@ int try_to_del_timer_sync(struct timer_list *timer)
932 unsigned long flags; 980 unsigned long flags;
933 int ret = -1; 981 int ret = -1;
934 982
983 debug_assert_init(timer);
984
935 base = lock_timer_base(timer, &flags); 985 base = lock_timer_base(timer, &flags);
936 986
937 if (base->running_timer == timer) 987 if (base->running_timer == timer)
@@ -1368,7 +1418,7 @@ SYSCALL_DEFINE0(getppid)
1368 int pid; 1418 int pid;
1369 1419
1370 rcu_read_lock(); 1420 rcu_read_lock();
1371 pid = task_tgid_vnr(current->real_parent); 1421 pid = task_tgid_vnr(rcu_dereference(current->real_parent));
1372 rcu_read_unlock(); 1422 rcu_read_unlock();
1373 1423
1374 return pid; 1424 return pid;
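
Besides the getppid() RCU fix at the end, the timer.c hunks above teach the debugobjects code a fixup_assert_init hook and call debug_assert_init() from del_timer() and try_to_del_timer_sync(), so a never-initialized timer is reported (and defanged by pointing it at stub_timer()) instead of being silently accepted. An illustrative, deliberately buggy caller of the kind the new hook is meant to catch under CONFIG_DEBUG_OBJECTS_TIMERS (the function name is hypothetical):

#include <linux/timer.h>
#include <linux/slab.h>

static void example_shutdown(void)
{
	struct timer_list *t = kzalloc(sizeof(*t), GFP_KERNEL);

	if (!t)
		return;
	/*
	 * BUG: init_timer()/setup_timer() was never called on 't'.
	 * With this patch, debug_assert_init() in del_timer() flags the
	 * untracked object and installs stub_timer() as its callback.
	 */
	del_timer(t);
	kfree(t);
}
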
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 761c510a06c5..5f39a07fe5ea 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,6 +15,8 @@ ifdef CONFIG_TRACING_BRANCHES
15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
16endif 16endif
17 17
18CFLAGS_trace_events_filter.o := -I$(src)
19
18# 20#
19# Make the trace clocks available generally: it's infrastructure 21# Make the trace clocks available generally: it's infrastructure
20# relied on by ptrace for example: 22# relied on by ptrace for example:
@@ -53,6 +55,9 @@ endif
53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
55obj-$(CONFIG_TRACEPOINTS) += power-traces.o 57obj-$(CONFIG_TRACEPOINTS) += power-traces.o
58ifeq ($(CONFIG_PM_RUNTIME),y)
59obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
60endif
56ifeq ($(CONFIG_TRACING),y) 61ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 62obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif 63endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7c910a5593a6..cdea7b56b0c9 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,6 +23,7 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/export.h>
26#include <linux/time.h> 27#include <linux/time.h>
27#include <linux/uaccess.h> 28#include <linux/uaccess.h>
28 29
@@ -401,7 +402,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry)
401 402
402static struct dentry *blk_create_buf_file_callback(const char *filename, 403static struct dentry *blk_create_buf_file_callback(const char *filename,
403 struct dentry *parent, 404 struct dentry *parent,
404 int mode, 405 umode_t mode,
405 struct rchan_buf *buf, 406 struct rchan_buf *buf,
406 int *is_global) 407 int *is_global)
407{ 408{
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c3e4575e7829..b1e8943fed1d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,6 +22,7 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/module.h>
25#include <linux/ftrace.h> 26#include <linux/ftrace.h>
26#include <linux/sysctl.h> 27#include <linux/sysctl.h>
27#include <linux/slab.h> 28#include <linux/slab.h>
@@ -151,7 +152,6 @@ void clear_ftrace_function(void)
151 ftrace_pid_function = ftrace_stub; 152 ftrace_pid_function = ftrace_stub;
152} 153}
153 154
154#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
155#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 155#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
156/* 156/*
157 * For those archs that do not test ftrace_trace_stop in their 157 * For those archs that do not test ftrace_trace_stop in their
@@ -1211,7 +1211,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1211 if (!src->count) { 1211 if (!src->count) {
1212 free_ftrace_hash_rcu(*dst); 1212 free_ftrace_hash_rcu(*dst);
1213 rcu_assign_pointer(*dst, EMPTY_HASH); 1213 rcu_assign_pointer(*dst, EMPTY_HASH);
1214 return 0; 1214 /* still need to update the function records */
1215 ret = 0;
1216 goto out;
1215 } 1217 }
1216 1218
1217 /* 1219 /*
@@ -3863,6 +3865,14 @@ void ftrace_kill(void)
3863} 3865}
3864 3866
3865/** 3867/**
3868 * ftrace_is_dead - Test if ftrace is dead or not.
3869 */
3870int ftrace_is_dead(void)
3871{
3872 return ftrace_disabled;
3873}
3874
3875/**
3866 * register_ftrace_function - register a function for profiling 3876 * register_ftrace_function - register a function for profiling
3867 * @ops - ops structure that holds the function for profiling. 3877 * @ops - ops structure that holds the function for profiling.
3868 * 3878 *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 731201bf4acc..f5b7b5c1195b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -478,7 +478,7 @@ struct ring_buffer_per_cpu {
478 int cpu; 478 int cpu;
479 atomic_t record_disabled; 479 atomic_t record_disabled;
480 struct ring_buffer *buffer; 480 struct ring_buffer *buffer;
481 spinlock_t reader_lock; /* serialize readers */ 481 raw_spinlock_t reader_lock; /* serialize readers */
482 arch_spinlock_t lock; 482 arch_spinlock_t lock;
483 struct lock_class_key lock_key; 483 struct lock_class_key lock_key;
484 struct list_head *pages; 484 struct list_head *pages;
@@ -488,12 +488,14 @@ struct ring_buffer_per_cpu {
488 struct buffer_page *reader_page; 488 struct buffer_page *reader_page;
489 unsigned long lost_events; 489 unsigned long lost_events;
490 unsigned long last_overrun; 490 unsigned long last_overrun;
491 local_t entries_bytes;
491 local_t commit_overrun; 492 local_t commit_overrun;
492 local_t overrun; 493 local_t overrun;
493 local_t entries; 494 local_t entries;
494 local_t committing; 495 local_t committing;
495 local_t commits; 496 local_t commits;
496 unsigned long read; 497 unsigned long read;
498 unsigned long read_bytes;
497 u64 write_stamp; 499 u64 write_stamp;
498 u64 read_stamp; 500 u64 read_stamp;
499}; 501};
@@ -1062,7 +1064,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1062 1064
1063 cpu_buffer->cpu = cpu; 1065 cpu_buffer->cpu = cpu;
1064 cpu_buffer->buffer = buffer; 1066 cpu_buffer->buffer = buffer;
1065 spin_lock_init(&cpu_buffer->reader_lock); 1067 raw_spin_lock_init(&cpu_buffer->reader_lock);
1066 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1068 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
1067 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 1069 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1068 1070
@@ -1259,7 +1261,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1259 struct list_head *p; 1261 struct list_head *p;
1260 unsigned i; 1262 unsigned i;
1261 1263
1262 spin_lock_irq(&cpu_buffer->reader_lock); 1264 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1263 rb_head_page_deactivate(cpu_buffer); 1265 rb_head_page_deactivate(cpu_buffer);
1264 1266
1265 for (i = 0; i < nr_pages; i++) { 1267 for (i = 0; i < nr_pages; i++) {
@@ -1277,7 +1279,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1277 rb_check_pages(cpu_buffer); 1279 rb_check_pages(cpu_buffer);
1278 1280
1279out: 1281out:
1280 spin_unlock_irq(&cpu_buffer->reader_lock); 1282 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1281} 1283}
1282 1284
1283static void 1285static void
@@ -1288,7 +1290,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1288 struct list_head *p; 1290 struct list_head *p;
1289 unsigned i; 1291 unsigned i;
1290 1292
1291 spin_lock_irq(&cpu_buffer->reader_lock); 1293 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1292 rb_head_page_deactivate(cpu_buffer); 1294 rb_head_page_deactivate(cpu_buffer);
1293 1295
1294 for (i = 0; i < nr_pages; i++) { 1296 for (i = 0; i < nr_pages; i++) {
@@ -1303,7 +1305,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1303 rb_check_pages(cpu_buffer); 1305 rb_check_pages(cpu_buffer);
1304 1306
1305out: 1307out:
1306 spin_unlock_irq(&cpu_buffer->reader_lock); 1308 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1307} 1309}
1308 1310
1309/** 1311/**
@@ -1708,6 +1710,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1708 * the counters. 1710 * the counters.
1709 */ 1711 */
1710 local_add(entries, &cpu_buffer->overrun); 1712 local_add(entries, &cpu_buffer->overrun);
1713 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1711 1714
1712 /* 1715 /*
1713 * The entries will be zeroed out when we move the 1716 * The entries will be zeroed out when we move the
@@ -1863,6 +1866,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1863 event = __rb_page_index(tail_page, tail); 1866 event = __rb_page_index(tail_page, tail);
1864 kmemcheck_annotate_bitfield(event, bitfield); 1867 kmemcheck_annotate_bitfield(event, bitfield);
1865 1868
1869 /* account for padding bytes */
1870 local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
1871
1866 /* 1872 /*
1867 * Save the original length to the meta data. 1873 * Save the original length to the meta data.
1868 * This will be used by the reader to add lost event 1874 * This will be used by the reader to add lost event
@@ -2054,6 +2060,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
2054 if (!tail) 2060 if (!tail)
2055 tail_page->page->time_stamp = ts; 2061 tail_page->page->time_stamp = ts;
2056 2062
2063 /* account for these added bytes */
2064 local_add(length, &cpu_buffer->entries_bytes);
2065
2057 return event; 2066 return event;
2058} 2067}
2059 2068
@@ -2076,6 +2085,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2076 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 2085 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
2077 unsigned long write_mask = 2086 unsigned long write_mask =
2078 local_read(&bpage->write) & ~RB_WRITE_MASK; 2087 local_read(&bpage->write) & ~RB_WRITE_MASK;
2088 unsigned long event_length = rb_event_length(event);
2079 /* 2089 /*
2080 * This is on the tail page. It is possible that 2090 * This is on the tail page. It is possible that
2081 * a write could come in and move the tail page 2091 * a write could come in and move the tail page
@@ -2085,8 +2095,11 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2085 old_index += write_mask; 2095 old_index += write_mask;
2086 new_index += write_mask; 2096 new_index += write_mask;
2087 index = local_cmpxchg(&bpage->write, old_index, new_index); 2097 index = local_cmpxchg(&bpage->write, old_index, new_index);
2088 if (index == old_index) 2098 if (index == old_index) {
2099 /* update counters */
2100 local_sub(event_length, &cpu_buffer->entries_bytes);
2089 return 1; 2101 return 1;
2102 }
2090 } 2103 }
2091 2104
2092 /* could not discard */ 2105 /* could not discard */
@@ -2661,6 +2674,58 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2661} 2674}
2662 2675
2663/** 2676/**
2677 * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
2678 * @buffer: The ring buffer
2679 * @cpu: The per CPU buffer to read from.
2680 */
2681unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2682{
2683 unsigned long flags;
2684 struct ring_buffer_per_cpu *cpu_buffer;
2685 struct buffer_page *bpage;
2686 unsigned long ret;
2687
2688 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2689 return 0;
2690
2691 cpu_buffer = buffer->buffers[cpu];
2692 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2693 /*
2694 * if the tail is on reader_page, oldest time stamp is on the reader
2695 * page
2696 */
2697 if (cpu_buffer->tail_page == cpu_buffer->reader_page)
2698 bpage = cpu_buffer->reader_page;
2699 else
2700 bpage = rb_set_head_page(cpu_buffer);
2701 ret = bpage->page->time_stamp;
2702 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2703
2704 return ret;
2705}
2706EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
2707
2708/**
2709 * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
2710 * @buffer: The ring buffer
2711 * @cpu: The per CPU buffer to read from.
2712 */
2713unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu)
2714{
2715 struct ring_buffer_per_cpu *cpu_buffer;
2716 unsigned long ret;
2717
2718 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2719 return 0;
2720
2721 cpu_buffer = buffer->buffers[cpu];
2722 ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes;
2723
2724 return ret;
2725}
2726EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu);
2727
2728/**
2664 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2729 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2665 * @buffer: The ring buffer 2730 * @buffer: The ring buffer
2666 * @cpu: The per CPU buffer to get the entries from. 2731 * @cpu: The per CPU buffer to get the entries from.
@@ -2804,9 +2869,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
2804 2869
2805 cpu_buffer = iter->cpu_buffer; 2870 cpu_buffer = iter->cpu_buffer;
2806 2871
2807 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2872 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2808 rb_iter_reset(iter); 2873 rb_iter_reset(iter);
2809 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2874 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2810} 2875}
2811EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); 2876EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
2812 2877
@@ -3265,12 +3330,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3265 again: 3330 again:
3266 local_irq_save(flags); 3331 local_irq_save(flags);
3267 if (dolock) 3332 if (dolock)
3268 spin_lock(&cpu_buffer->reader_lock); 3333 raw_spin_lock(&cpu_buffer->reader_lock);
3269 event = rb_buffer_peek(cpu_buffer, ts, lost_events); 3334 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3270 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3335 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3271 rb_advance_reader(cpu_buffer); 3336 rb_advance_reader(cpu_buffer);
3272 if (dolock) 3337 if (dolock)
3273 spin_unlock(&cpu_buffer->reader_lock); 3338 raw_spin_unlock(&cpu_buffer->reader_lock);
3274 local_irq_restore(flags); 3339 local_irq_restore(flags);
3275 3340
3276 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3341 if (event && event->type_len == RINGBUF_TYPE_PADDING)
@@ -3295,9 +3360,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3295 unsigned long flags; 3360 unsigned long flags;
3296 3361
3297 again: 3362 again:
3298 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3363 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3299 event = rb_iter_peek(iter, ts); 3364 event = rb_iter_peek(iter, ts);
3300 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3365 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3301 3366
3302 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3367 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3303 goto again; 3368 goto again;
@@ -3337,7 +3402,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3337 cpu_buffer = buffer->buffers[cpu]; 3402 cpu_buffer = buffer->buffers[cpu];
3338 local_irq_save(flags); 3403 local_irq_save(flags);
3339 if (dolock) 3404 if (dolock)
3340 spin_lock(&cpu_buffer->reader_lock); 3405 raw_spin_lock(&cpu_buffer->reader_lock);
3341 3406
3342 event = rb_buffer_peek(cpu_buffer, ts, lost_events); 3407 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3343 if (event) { 3408 if (event) {
@@ -3346,7 +3411,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3346 } 3411 }
3347 3412
3348 if (dolock) 3413 if (dolock)
3349 spin_unlock(&cpu_buffer->reader_lock); 3414 raw_spin_unlock(&cpu_buffer->reader_lock);
3350 local_irq_restore(flags); 3415 local_irq_restore(flags);
3351 3416
3352 out: 3417 out:
@@ -3438,11 +3503,11 @@ ring_buffer_read_start(struct ring_buffer_iter *iter)
3438 3503
3439 cpu_buffer = iter->cpu_buffer; 3504 cpu_buffer = iter->cpu_buffer;
3440 3505
3441 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3506 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3442 arch_spin_lock(&cpu_buffer->lock); 3507 arch_spin_lock(&cpu_buffer->lock);
3443 rb_iter_reset(iter); 3508 rb_iter_reset(iter);
3444 arch_spin_unlock(&cpu_buffer->lock); 3509 arch_spin_unlock(&cpu_buffer->lock);
3445 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3510 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3446} 3511}
3447EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3512EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3448 3513
@@ -3477,7 +3542,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
3477 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3542 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3478 unsigned long flags; 3543 unsigned long flags;
3479 3544
3480 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3545 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3481 again: 3546 again:
3482 event = rb_iter_peek(iter, ts); 3547 event = rb_iter_peek(iter, ts);
3483 if (!event) 3548 if (!event)
@@ -3488,7 +3553,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
3488 3553
3489 rb_advance_iter(iter); 3554 rb_advance_iter(iter);
3490 out: 3555 out:
3491 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3556 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3492 3557
3493 return event; 3558 return event;
3494} 3559}
@@ -3527,11 +3592,13 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3527 cpu_buffer->reader_page->read = 0; 3592 cpu_buffer->reader_page->read = 0;
3528 3593
3529 local_set(&cpu_buffer->commit_overrun, 0); 3594 local_set(&cpu_buffer->commit_overrun, 0);
3595 local_set(&cpu_buffer->entries_bytes, 0);
3530 local_set(&cpu_buffer->overrun, 0); 3596 local_set(&cpu_buffer->overrun, 0);
3531 local_set(&cpu_buffer->entries, 0); 3597 local_set(&cpu_buffer->entries, 0);
3532 local_set(&cpu_buffer->committing, 0); 3598 local_set(&cpu_buffer->committing, 0);
3533 local_set(&cpu_buffer->commits, 0); 3599 local_set(&cpu_buffer->commits, 0);
3534 cpu_buffer->read = 0; 3600 cpu_buffer->read = 0;
3601 cpu_buffer->read_bytes = 0;
3535 3602
3536 cpu_buffer->write_stamp = 0; 3603 cpu_buffer->write_stamp = 0;
3537 cpu_buffer->read_stamp = 0; 3604 cpu_buffer->read_stamp = 0;
@@ -3557,7 +3624,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3557 3624
3558 atomic_inc(&cpu_buffer->record_disabled); 3625 atomic_inc(&cpu_buffer->record_disabled);
3559 3626
3560 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3627 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3561 3628
3562 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3629 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3563 goto out; 3630 goto out;
@@ -3569,7 +3636,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3569 arch_spin_unlock(&cpu_buffer->lock); 3636 arch_spin_unlock(&cpu_buffer->lock);
3570 3637
3571 out: 3638 out:
3572 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3639 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3573 3640
3574 atomic_dec(&cpu_buffer->record_disabled); 3641 atomic_dec(&cpu_buffer->record_disabled);
3575} 3642}
@@ -3607,10 +3674,10 @@ int ring_buffer_empty(struct ring_buffer *buffer)
3607 cpu_buffer = buffer->buffers[cpu]; 3674 cpu_buffer = buffer->buffers[cpu];
3608 local_irq_save(flags); 3675 local_irq_save(flags);
3609 if (dolock) 3676 if (dolock)
3610 spin_lock(&cpu_buffer->reader_lock); 3677 raw_spin_lock(&cpu_buffer->reader_lock);
3611 ret = rb_per_cpu_empty(cpu_buffer); 3678 ret = rb_per_cpu_empty(cpu_buffer);
3612 if (dolock) 3679 if (dolock)
3613 spin_unlock(&cpu_buffer->reader_lock); 3680 raw_spin_unlock(&cpu_buffer->reader_lock);
3614 local_irq_restore(flags); 3681 local_irq_restore(flags);
3615 3682
3616 if (!ret) 3683 if (!ret)
@@ -3641,10 +3708,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
3641 cpu_buffer = buffer->buffers[cpu]; 3708 cpu_buffer = buffer->buffers[cpu];
3642 local_irq_save(flags); 3709 local_irq_save(flags);
3643 if (dolock) 3710 if (dolock)
3644 spin_lock(&cpu_buffer->reader_lock); 3711 raw_spin_lock(&cpu_buffer->reader_lock);
3645 ret = rb_per_cpu_empty(cpu_buffer); 3712 ret = rb_per_cpu_empty(cpu_buffer);
3646 if (dolock) 3713 if (dolock)
3647 spin_unlock(&cpu_buffer->reader_lock); 3714 raw_spin_unlock(&cpu_buffer->reader_lock);
3648 local_irq_restore(flags); 3715 local_irq_restore(flags);
3649 3716
3650 return ret; 3717 return ret;
@@ -3841,7 +3908,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3841 if (!bpage) 3908 if (!bpage)
3842 goto out; 3909 goto out;
3843 3910
3844 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3911 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3845 3912
3846 reader = rb_get_reader_page(cpu_buffer); 3913 reader = rb_get_reader_page(cpu_buffer);
3847 if (!reader) 3914 if (!reader)
@@ -3918,6 +3985,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3918 } else { 3985 } else {
3919 /* update the entry counter */ 3986 /* update the entry counter */
3920 cpu_buffer->read += rb_page_entries(reader); 3987 cpu_buffer->read += rb_page_entries(reader);
3988 cpu_buffer->read_bytes += BUF_PAGE_SIZE;
3921 3989
3922 /* swap the pages */ 3990 /* swap the pages */
3923 rb_init_page(bpage); 3991 rb_init_page(bpage);
@@ -3964,7 +4032,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3964 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); 4032 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
3965 4033
3966 out_unlock: 4034 out_unlock:
3967 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 4035 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3968 4036
3969 out: 4037 out:
3970 return ret; 4038 return ret;
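The ring_buffer.c hunks above switch reader_lock from spinlock_t to raw_spinlock_t, so the lock keeps spinning rather than turning into a sleeping lock under preempt-rt, and add byte-level accounting via the new read_bytes field. A minimal sketch of the lock-conversion pattern, using a hypothetical my_buffer structure rather than the real ring-buffer types:

#include <linux/spinlock.h>

/* Hypothetical stand-in for struct ring_buffer_per_cpu; initialise the
 * lock elsewhere with raw_spin_lock_init(&b->lock). */
struct my_buffer {
        raw_spinlock_t  lock;           /* raw_: never becomes a sleeping lock on -rt */
        unsigned long   entries;
        unsigned long   read_bytes;     /* byte accounting as in the new read_bytes */
};

static void my_buffer_consume(struct my_buffer *b, unsigned long bytes)
{
        unsigned long flags;

        /* the raw_ variants mirror the plain spin_lock_irqsave() API one for one */
        raw_spin_lock_irqsave(&b->lock, flags);
        b->entries--;
        b->read_bytes += bytes;
        raw_spin_unlock_irqrestore(&b->lock, flags);
}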
diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c
new file mode 100644
index 000000000000..4b3b5eaf94d1
--- /dev/null
+++ b/kernel/trace/rpm-traces.c
@@ -0,0 +1,20 @@
1/*
2 * Power trace points
3 *
4 * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com>
5 */
6
7#include <linux/string.h>
8#include <linux/types.h>
9#include <linux/workqueue.h>
10#include <linux/sched.h>
11#include <linux/module.h>
12#include <linux/usb.h>
13
14#define CREATE_TRACE_POINTS
15#include <trace/events/rpm.h>
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int);
18EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle);
19EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend);
20EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume);
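The new rpm-traces.c only instantiates the tracepoints declared in trace/events/rpm.h (via CREATE_TRACE_POINTS) and exports them, presumably so modular code in the runtime-PM paths can fire them. Once built in, the events should appear under the rpm subsystem in tracefs. A hedged userspace sketch that enables them and dumps the trace; the mount point is an assumption (newer kernels also expose /sys/kernel/tracing):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define TRACEFS "/sys/kernel/debug/tracing"

/* write a short string to a tracefs control file */
static int write_str(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0)
                return -1;
        if (write(fd, val, strlen(val)) < 0) {
                close(fd);
                return -1;
        }
        close(fd);
        return 0;
}

int main(void)
{
        char buf[4096];
        ssize_t n;
        int fd;

        /* enable every event in the rpm subsystem and make sure tracing is on */
        if (write_str(TRACEFS "/events/rpm/enable", "1") ||
            write_str(TRACEFS "/tracing_on", "1"))
                return 1;

        sleep(5);       /* give runtime PM a chance to suspend/resume something */

        fd = open(TRACEFS "/trace", O_RDONLY);
        if (fd < 0)
                return 1;
        while ((n = read(fd, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, n, stdout);
        close(fd);
        return 0;
}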
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e5df02c69b1d..a3f1bc5d2a00 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -338,10 +338,11 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
338/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
342 TRACE_ITER_IRQ_INFO;
342 343
343static int trace_stop_count; 344static int trace_stop_count;
344static DEFINE_SPINLOCK(tracing_start_lock); 345static DEFINE_RAW_SPINLOCK(tracing_start_lock);
345 346
346static void wakeup_work_handler(struct work_struct *work) 347static void wakeup_work_handler(struct work_struct *work)
347{ 348{
@@ -426,6 +427,7 @@ static const char *trace_options[] = {
426 "record-cmd", 427 "record-cmd",
427 "overwrite", 428 "overwrite",
428 "disable_on_free", 429 "disable_on_free",
430 "irq-info",
429 NULL 431 NULL
430}; 432};
431 433
@@ -435,6 +437,7 @@ static struct {
435} trace_clocks[] = { 437} trace_clocks[] = {
436 { trace_clock_local, "local" }, 438 { trace_clock_local, "local" },
437 { trace_clock_global, "global" }, 439 { trace_clock_global, "global" },
440 { trace_clock_counter, "counter" },
438}; 441};
439 442
440int trace_clock_id; 443int trace_clock_id;
@@ -960,7 +963,7 @@ void tracing_start(void)
960 if (tracing_disabled) 963 if (tracing_disabled)
961 return; 964 return;
962 965
963 spin_lock_irqsave(&tracing_start_lock, flags); 966 raw_spin_lock_irqsave(&tracing_start_lock, flags);
964 if (--trace_stop_count) { 967 if (--trace_stop_count) {
965 if (trace_stop_count < 0) { 968 if (trace_stop_count < 0) {
966 /* Someone screwed up their debugging */ 969 /* Someone screwed up their debugging */
@@ -985,7 +988,7 @@ void tracing_start(void)
985 988
986 ftrace_start(); 989 ftrace_start();
987 out: 990 out:
988 spin_unlock_irqrestore(&tracing_start_lock, flags); 991 raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
989} 992}
990 993
991/** 994/**
@@ -1000,7 +1003,7 @@ void tracing_stop(void)
1000 unsigned long flags; 1003 unsigned long flags;
1001 1004
1002 ftrace_stop(); 1005 ftrace_stop();
1003 spin_lock_irqsave(&tracing_start_lock, flags); 1006 raw_spin_lock_irqsave(&tracing_start_lock, flags);
1004 if (trace_stop_count++) 1007 if (trace_stop_count++)
1005 goto out; 1008 goto out;
1006 1009
@@ -1018,7 +1021,7 @@ void tracing_stop(void)
1018 arch_spin_unlock(&ftrace_max_lock); 1021 arch_spin_unlock(&ftrace_max_lock);
1019 1022
1020 out: 1023 out:
1021 spin_unlock_irqrestore(&tracing_start_lock, flags); 1024 raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
1022} 1025}
1023 1026
1024void trace_stop_cmdline_recording(void); 1027void trace_stop_cmdline_recording(void);
@@ -1842,6 +1845,33 @@ static void s_stop(struct seq_file *m, void *p)
1842 trace_event_read_unlock(); 1845 trace_event_read_unlock();
1843} 1846}
1844 1847
1848static void
1849get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries)
1850{
1851 unsigned long count;
1852 int cpu;
1853
1854 *total = 0;
1855 *entries = 0;
1856
1857 for_each_tracing_cpu(cpu) {
1858 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1859 /*
1860 * If this buffer has skipped entries, then we hold all
1861 * entries for the trace and we need to ignore the
1862 * ones before the time stamp.
1863 */
1864 if (tr->data[cpu]->skipped_entries) {
1865 count -= tr->data[cpu]->skipped_entries;
1866 /* total is the same as the entries */
1867 *total += count;
1868 } else
1869 *total += count +
1870 ring_buffer_overrun_cpu(tr->buffer, cpu);
1871 *entries += count;
1872 }
1873}
1874
1845static void print_lat_help_header(struct seq_file *m) 1875static void print_lat_help_header(struct seq_file *m)
1846{ 1876{
1847 seq_puts(m, "# _------=> CPU# \n"); 1877 seq_puts(m, "# _------=> CPU# \n");
@@ -1854,12 +1884,35 @@ static void print_lat_help_header(struct seq_file *m)
1854 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1884 seq_puts(m, "# \\ / ||||| \\ | / \n");
1855} 1885}
1856 1886
1857static void print_func_help_header(struct seq_file *m) 1887static void print_event_info(struct trace_array *tr, struct seq_file *m)
1888{
1889 unsigned long total;
1890 unsigned long entries;
1891
1892 get_total_entries(tr, &total, &entries);
1893 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
1894 entries, total, num_online_cpus());
1895 seq_puts(m, "#\n");
1896}
1897
1898static void print_func_help_header(struct trace_array *tr, struct seq_file *m)
1858{ 1899{
1859 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 1900 print_event_info(tr, m);
1901 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
1860 seq_puts(m, "# | | | | |\n"); 1902 seq_puts(m, "# | | | | |\n");
1861} 1903}
1862 1904
1905static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m)
1906{
1907 print_event_info(tr, m);
1908 seq_puts(m, "# _-----=> irqs-off\n");
1909 seq_puts(m, "# / _----=> need-resched\n");
1910 seq_puts(m, "# | / _---=> hardirq/softirq\n");
1911 seq_puts(m, "# || / _--=> preempt-depth\n");
1912 seq_puts(m, "# ||| / delay\n");
1913 seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
1914 seq_puts(m, "# | | | |||| | |\n");
1915}
1863 1916
1864void 1917void
1865print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1918print_trace_header(struct seq_file *m, struct trace_iterator *iter)
@@ -1868,32 +1921,14 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1868 struct trace_array *tr = iter->tr; 1921 struct trace_array *tr = iter->tr;
1869 struct trace_array_cpu *data = tr->data[tr->cpu]; 1922 struct trace_array_cpu *data = tr->data[tr->cpu];
1870 struct tracer *type = current_trace; 1923 struct tracer *type = current_trace;
1871 unsigned long entries = 0; 1924 unsigned long entries;
1872 unsigned long total = 0; 1925 unsigned long total;
1873 unsigned long count;
1874 const char *name = "preemption"; 1926 const char *name = "preemption";
1875 int cpu;
1876 1927
1877 if (type) 1928 if (type)
1878 name = type->name; 1929 name = type->name;
1879 1930
1880 1931 get_total_entries(tr, &total, &entries);
1881 for_each_tracing_cpu(cpu) {
1882 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1883 /*
1884 * If this buffer has skipped entries, then we hold all
1885 * entries for the trace and we need to ignore the
1886 * ones before the time stamp.
1887 */
1888 if (tr->data[cpu]->skipped_entries) {
1889 count -= tr->data[cpu]->skipped_entries;
1890 /* total is the same as the entries */
1891 total += count;
1892 } else
1893 total += count +
1894 ring_buffer_overrun_cpu(tr->buffer, cpu);
1895 entries += count;
1896 }
1897 1932
1898 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 1933 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1899 name, UTS_RELEASE); 1934 name, UTS_RELEASE);
@@ -2139,6 +2174,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2139 return print_trace_fmt(iter); 2174 return print_trace_fmt(iter);
2140} 2175}
2141 2176
2177void trace_latency_header(struct seq_file *m)
2178{
2179 struct trace_iterator *iter = m->private;
2180
2181 /* print nothing if the buffers are empty */
2182 if (trace_empty(iter))
2183 return;
2184
2185 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
2186 print_trace_header(m, iter);
2187
2188 if (!(trace_flags & TRACE_ITER_VERBOSE))
2189 print_lat_help_header(m);
2190}
2191
2142void trace_default_header(struct seq_file *m) 2192void trace_default_header(struct seq_file *m)
2143{ 2193{
2144 struct trace_iterator *iter = m->private; 2194 struct trace_iterator *iter = m->private;
@@ -2154,11 +2204,23 @@ void trace_default_header(struct seq_file *m)
2154 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2204 if (!(trace_flags & TRACE_ITER_VERBOSE))
2155 print_lat_help_header(m); 2205 print_lat_help_header(m);
2156 } else { 2206 } else {
2157 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2207 if (!(trace_flags & TRACE_ITER_VERBOSE)) {
2158 print_func_help_header(m); 2208 if (trace_flags & TRACE_ITER_IRQ_INFO)
2209 print_func_help_header_irq(iter->tr, m);
2210 else
2211 print_func_help_header(iter->tr, m);
2212 }
2159 } 2213 }
2160} 2214}
2161 2215
2216static void test_ftrace_alive(struct seq_file *m)
2217{
2218 if (!ftrace_is_dead())
2219 return;
2220 seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n");
2221 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
2222}
2223
2162static int s_show(struct seq_file *m, void *v) 2224static int s_show(struct seq_file *m, void *v)
2163{ 2225{
2164 struct trace_iterator *iter = v; 2226 struct trace_iterator *iter = v;
@@ -2168,6 +2230,7 @@ static int s_show(struct seq_file *m, void *v)
2168 if (iter->tr) { 2230 if (iter->tr) {
2169 seq_printf(m, "# tracer: %s\n", iter->trace->name); 2231 seq_printf(m, "# tracer: %s\n", iter->trace->name);
2170 seq_puts(m, "#\n"); 2232 seq_puts(m, "#\n");
2233 test_ftrace_alive(m);
2171 } 2234 }
2172 if (iter->trace && iter->trace->print_header) 2235 if (iter->trace && iter->trace->print_header)
2173 iter->trace->print_header(m); 2236 iter->trace->print_header(m);
@@ -2710,9 +2773,9 @@ static const char readme_msg[] =
2710 "# cat /sys/kernel/debug/tracing/trace_options\n" 2773 "# cat /sys/kernel/debug/tracing/trace_options\n"
2711 "noprint-parent nosym-offset nosym-addr noverbose\n" 2774 "noprint-parent nosym-offset nosym-addr noverbose\n"
2712 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" 2775 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
2713 "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n" 2776 "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n"
2714 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" 2777 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
2715 "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n" 2778 "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n"
2716; 2779;
2717 2780
2718static ssize_t 2781static ssize_t
@@ -3569,6 +3632,30 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3569} 3632}
3570 3633
3571static ssize_t 3634static ssize_t
3635tracing_total_entries_read(struct file *filp, char __user *ubuf,
3636 size_t cnt, loff_t *ppos)
3637{
3638 struct trace_array *tr = filp->private_data;
3639 char buf[64];
3640 int r, cpu;
3641 unsigned long size = 0, expanded_size = 0;
3642
3643 mutex_lock(&trace_types_lock);
3644 for_each_tracing_cpu(cpu) {
3645 size += tr->entries >> 10;
3646 if (!ring_buffer_expanded)
3647 expanded_size += trace_buf_size >> 10;
3648 }
3649 if (ring_buffer_expanded)
3650 r = sprintf(buf, "%lu\n", size);
3651 else
3652 r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size);
3653 mutex_unlock(&trace_types_lock);
3654
3655 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3656}
3657
3658static ssize_t
3572tracing_free_buffer_write(struct file *filp, const char __user *ubuf, 3659tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3573 size_t cnt, loff_t *ppos) 3660 size_t cnt, loff_t *ppos)
3574{ 3661{
@@ -3594,22 +3681,24 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
3594 return 0; 3681 return 0;
3595} 3682}
3596 3683
3597static int mark_printk(const char *fmt, ...)
3598{
3599 int ret;
3600 va_list args;
3601 va_start(args, fmt);
3602 ret = trace_vprintk(0, fmt, args);
3603 va_end(args);
3604 return ret;
3605}
3606
3607static ssize_t 3684static ssize_t
3608tracing_mark_write(struct file *filp, const char __user *ubuf, 3685tracing_mark_write(struct file *filp, const char __user *ubuf,
3609 size_t cnt, loff_t *fpos) 3686 size_t cnt, loff_t *fpos)
3610{ 3687{
3611 char *buf; 3688 unsigned long addr = (unsigned long)ubuf;
3612 size_t written; 3689 struct ring_buffer_event *event;
3690 struct ring_buffer *buffer;
3691 struct print_entry *entry;
3692 unsigned long irq_flags;
3693 struct page *pages[2];
3694 int nr_pages = 1;
3695 ssize_t written;
3696 void *page1;
3697 void *page2;
3698 int offset;
3699 int size;
3700 int len;
3701 int ret;
3613 3702
3614 if (tracing_disabled) 3703 if (tracing_disabled)
3615 return -EINVAL; 3704 return -EINVAL;
@@ -3617,28 +3706,81 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3617 if (cnt > TRACE_BUF_SIZE) 3706 if (cnt > TRACE_BUF_SIZE)
3618 cnt = TRACE_BUF_SIZE; 3707 cnt = TRACE_BUF_SIZE;
3619 3708
3620 buf = kmalloc(cnt + 2, GFP_KERNEL); 3709 /*
3621 if (buf == NULL) 3710 * Userspace is injecting traces into the kernel trace buffer.
3622 return -ENOMEM; 3711 * We want to be as non intrusive as possible.
3712 * To do so, we do not want to allocate any special buffers
3713 * or take any locks, but instead write the userspace data
3714 * straight into the ring buffer.
3715 *
 3716	 * First we need to pin the userspace buffer into memory,
 3717	 * which it most likely already is, because the caller just referenced it.
3718 * But there's no guarantee that it is. By using get_user_pages_fast()
3719 * and kmap_atomic/kunmap_atomic() we can get access to the
3720 * pages directly. We then write the data directly into the
3721 * ring buffer.
3722 */
3723 BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
3623 3724
3624 if (copy_from_user(buf, ubuf, cnt)) { 3725 /* check if we cross pages */
3625 kfree(buf); 3726 if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
3626 return -EFAULT; 3727 nr_pages = 2;
3728
3729 offset = addr & (PAGE_SIZE - 1);
3730 addr &= PAGE_MASK;
3731
3732 ret = get_user_pages_fast(addr, nr_pages, 0, pages);
3733 if (ret < nr_pages) {
3734 while (--ret >= 0)
3735 put_page(pages[ret]);
3736 written = -EFAULT;
3737 goto out;
3627 } 3738 }
3628 if (buf[cnt-1] != '\n') { 3739
3629 buf[cnt] = '\n'; 3740 page1 = kmap_atomic(pages[0]);
3630 buf[cnt+1] = '\0'; 3741 if (nr_pages == 2)
3742 page2 = kmap_atomic(pages[1]);
3743
3744 local_save_flags(irq_flags);
3745 size = sizeof(*entry) + cnt + 2; /* possible \n added */
3746 buffer = global_trace.buffer;
3747 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
3748 irq_flags, preempt_count());
3749 if (!event) {
3750 /* Ring buffer disabled, return as if not open for write */
3751 written = -EBADF;
3752 goto out_unlock;
3753 }
3754
3755 entry = ring_buffer_event_data(event);
3756 entry->ip = _THIS_IP_;
3757
3758 if (nr_pages == 2) {
3759 len = PAGE_SIZE - offset;
3760 memcpy(&entry->buf, page1 + offset, len);
3761 memcpy(&entry->buf[len], page2, cnt - len);
3631 } else 3762 } else
3632 buf[cnt] = '\0'; 3763 memcpy(&entry->buf, page1 + offset, cnt);
3633 3764
3634 written = mark_printk("%s", buf); 3765 if (entry->buf[cnt - 1] != '\n') {
3635 kfree(buf); 3766 entry->buf[cnt] = '\n';
3636 *fpos += written; 3767 entry->buf[cnt + 1] = '\0';
3768 } else
3769 entry->buf[cnt] = '\0';
3770
3771 ring_buffer_unlock_commit(buffer, event);
3772
3773 written = cnt;
3637 3774
3638 /* don't tell userspace we wrote more - it might confuse them */ 3775 *fpos += written;
3639 if (written > cnt)
3640 written = cnt;
3641 3776
3777 out_unlock:
3778 if (nr_pages == 2)
3779 kunmap_atomic(page2);
3780 kunmap_atomic(page1);
3781 while (nr_pages > 0)
3782 put_page(pages[--nr_pages]);
3783 out:
3642 return written; 3784 return written;
3643} 3785}
3644 3786
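The rewritten tracing_mark_write() above pins the user pages with get_user_pages_fast() and copies the message straight into a TRACE_PRINT event, so a write to trace_marker now costs one ring-buffer reservation and at most two page copies, with no temporary kmalloc(). From userspace nothing changes, except that a single write larger than TRACE_BUF_SIZE is truncated. A small sketch; the tracefs path is an assumption:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *msg = "hello from userspace\n";
        int fd = open("/sys/kernel/debug/tracing/trace_marker", O_WRONLY);

        if (fd < 0) {
                perror("trace_marker");
                return 1;
        }
        /* one write() becomes one TRACE_PRINT event in the ring buffer;
         * writes longer than TRACE_BUF_SIZE are truncated by the kernel */
        if (write(fd, msg, strlen(msg)) < 0)
                perror("write");
        close(fd);
        return 0;
}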
@@ -3739,6 +3881,12 @@ static const struct file_operations tracing_entries_fops = {
3739 .llseek = generic_file_llseek, 3881 .llseek = generic_file_llseek,
3740}; 3882};
3741 3883
3884static const struct file_operations tracing_total_entries_fops = {
3885 .open = tracing_open_generic,
3886 .read = tracing_total_entries_read,
3887 .llseek = generic_file_llseek,
3888};
3889
3742static const struct file_operations tracing_free_buffer_fops = { 3890static const struct file_operations tracing_free_buffer_fops = {
3743 .write = tracing_free_buffer_write, 3891 .write = tracing_free_buffer_write,
3744 .release = tracing_free_buffer_release, 3892 .release = tracing_free_buffer_release,
@@ -3808,8 +3956,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3808 if (info->read < PAGE_SIZE) 3956 if (info->read < PAGE_SIZE)
3809 goto read; 3957 goto read;
3810 3958
3811 info->read = 0;
3812
3813 trace_access_lock(info->cpu); 3959 trace_access_lock(info->cpu);
3814 ret = ring_buffer_read_page(info->tr->buffer, 3960 ret = ring_buffer_read_page(info->tr->buffer,
3815 &info->spare, 3961 &info->spare,
@@ -3819,6 +3965,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3819 if (ret < 0) 3965 if (ret < 0)
3820 return 0; 3966 return 0;
3821 3967
3968 info->read = 0;
3969
3822read: 3970read:
3823 size = PAGE_SIZE - info->read; 3971 size = PAGE_SIZE - info->read;
3824 if (size > count) 3972 if (size > count)
@@ -4026,6 +4174,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4026 struct trace_array *tr = &global_trace; 4174 struct trace_array *tr = &global_trace;
4027 struct trace_seq *s; 4175 struct trace_seq *s;
4028 unsigned long cnt; 4176 unsigned long cnt;
4177 unsigned long long t;
4178 unsigned long usec_rem;
4029 4179
4030 s = kmalloc(sizeof(*s), GFP_KERNEL); 4180 s = kmalloc(sizeof(*s), GFP_KERNEL);
4031 if (!s) 4181 if (!s)
@@ -4042,6 +4192,17 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4042 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 4192 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
4043 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 4193 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
4044 4194
4195 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
4196 trace_seq_printf(s, "bytes: %ld\n", cnt);
4197
4198 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
4199 usec_rem = do_div(t, USEC_PER_SEC);
4200 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem);
4201
4202 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
4203 usec_rem = do_div(t, USEC_PER_SEC);
4204 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4205
4045 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 4206 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
4046 4207
4047 kfree(s); 4208 kfree(s);
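tracing_stats_read() now also reports a byte count and two timestamps; the timestamps are converted from nanoseconds with ns2usecs() and then split into seconds plus a microsecond remainder by do_div(), which leaves the quotient in place and returns the remainder. A userspace analogue of that formatting, with plain 64-bit division standing in for the kernel-only do_div():

#include <inttypes.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL
#define USEC_PER_SEC  1000000ULL

/* mirror of the "oldest event ts"/"now ts" output: ns -> "sec.usec" */
static void print_ts(const char *label, uint64_t ns)
{
        uint64_t usecs = ns / NSEC_PER_USEC;            /* roughly ns2usecs() */
        uint64_t sec = usecs / USEC_PER_SEC;            /* quotient do_div() leaves in t */
        uint64_t usec_rem = usecs % USEC_PER_SEC;       /* remainder do_div() returns */

        printf("%s: %5" PRIu64 ".%06" PRIu64 "\n", label, sec, usec_rem);
}

int main(void)
{
        /* arbitrary example inputs */
        print_ts("oldest event ts", 1234567890123ULL);
        print_ts("now ts", 1234569990456ULL);
        return 0;
}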
@@ -4277,7 +4438,7 @@ static const struct file_operations trace_options_core_fops = {
4277}; 4438};
4278 4439
4279struct dentry *trace_create_file(const char *name, 4440struct dentry *trace_create_file(const char *name,
4280 mode_t mode, 4441 umode_t mode,
4281 struct dentry *parent, 4442 struct dentry *parent,
4282 void *data, 4443 void *data,
4283 const struct file_operations *fops) 4444 const struct file_operations *fops)
@@ -4450,6 +4611,9 @@ static __init int tracer_init_debugfs(void)
4450 trace_create_file("buffer_size_kb", 0644, d_tracer, 4611 trace_create_file("buffer_size_kb", 0644, d_tracer,
4451 &global_trace, &tracing_entries_fops); 4612 &global_trace, &tracing_entries_fops);
4452 4613
4614 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
4615 &global_trace, &tracing_total_entries_fops);
4616
4453 trace_create_file("free_buffer", 0644, d_tracer, 4617 trace_create_file("free_buffer", 0644, d_tracer,
4454 &global_trace, &tracing_free_buffer_fops); 4618 &global_trace, &tracing_free_buffer_fops);
4455 4619
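The new buffer_total_size_kb file reports the ring-buffer size summed over all tracing CPUs and, until the buffer has actually been expanded, also shows what it will grow to, in the form "size (expanded: size)". A minimal reader; the path is an assumption, as above:

#include <stdio.h>

int main(void)
{
        char line[128];
        FILE *f = fopen("/sys/kernel/debug/tracing/buffer_total_size_kb", "r");

        if (!f) {
                perror("buffer_total_size_kb");
                return 1;
        }
        /* prints either "<kb>" or "<kb> (expanded: <kb>)" before expansion */
        if (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}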
@@ -4566,6 +4730,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4566 4730
4567 tracing_off(); 4731 tracing_off();
4568 4732
4733 /* Did function tracer already get disabled? */
4734 if (ftrace_is_dead()) {
4735 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
4736 printk("# MAY BE MISSING FUNCTION EVENTS\n");
4737 }
4738
4569 if (disable_tracing) 4739 if (disable_tracing)
4570 ftrace_kill(); 4740 ftrace_kill();
4571 4741
@@ -4658,6 +4828,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4658{ 4828{
4659 __ftrace_dump(true, oops_dump_mode); 4829 __ftrace_dump(true, oops_dump_mode);
4660} 4830}
4831EXPORT_SYMBOL_GPL(ftrace_dump);
4661 4832
4662__init static int tracer_alloc_buffers(void) 4833__init static int tracer_alloc_buffers(void)
4663{ 4834{
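With ftrace_dump() now exported (GPL-only), modules can dump the trace buffer from their own failure paths instead of relying on the core oops handling. A hedged sketch of a hypothetical module that dumps the whole buffer from a panic notifier; the notifier wiring is standard kernel API, not part of this patch:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

/* hypothetical example module: dump the ftrace buffer when the box panics */
static int dump_on_panic(struct notifier_block *nb, unsigned long ev, void *p)
{
        ftrace_dump(DUMP_ALL);          /* now callable from modules */
        return NOTIFY_DONE;
}

static struct notifier_block dump_nb = {
        .notifier_call = dump_on_panic,
};

static int __init dump_init(void)
{
        return atomic_notifier_chain_register(&panic_notifier_list, &dump_nb);
}

static void __exit dump_exit(void)
{
        atomic_notifier_chain_unregister(&panic_notifier_list, &dump_nb);
}

module_init(dump_init);
module_exit(dump_exit);
MODULE_LICENSE("GPL");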
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 616846bcfee5..b93ecbadad6d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -312,7 +312,7 @@ void tracing_reset_current(int cpu);
312void tracing_reset_current_online_cpus(void); 312void tracing_reset_current_online_cpus(void);
313int tracing_open_generic(struct inode *inode, struct file *filp); 313int tracing_open_generic(struct inode *inode, struct file *filp);
314struct dentry *trace_create_file(const char *name, 314struct dentry *trace_create_file(const char *name,
315 mode_t mode, 315 umode_t mode,
316 struct dentry *parent, 316 struct dentry *parent,
317 void *data, 317 void *data,
318 const struct file_operations *fops); 318 const struct file_operations *fops);
@@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr,
370 unsigned long ip, 370 unsigned long ip,
371 unsigned long parent_ip, 371 unsigned long parent_ip,
372 unsigned long flags, int pc); 372 unsigned long flags, int pc);
373void trace_latency_header(struct seq_file *m);
373void trace_default_header(struct seq_file *m); 374void trace_default_header(struct seq_file *m);
374void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 375void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
375int trace_empty(struct trace_iterator *iter); 376int trace_empty(struct trace_iterator *iter);
@@ -579,11 +580,13 @@ static inline int ftrace_trace_task(struct task_struct *task)
579 580
580 return test_tsk_trace_trace(task); 581 return test_tsk_trace_trace(task);
581} 582}
583extern int ftrace_is_dead(void);
582#else 584#else
583static inline int ftrace_trace_task(struct task_struct *task) 585static inline int ftrace_trace_task(struct task_struct *task)
584{ 586{
585 return 1; 587 return 1;
586} 588}
589static inline int ftrace_is_dead(void) { return 0; }
587#endif 590#endif
588 591
589/* 592/*
@@ -652,6 +655,7 @@ enum trace_iterator_flags {
652 TRACE_ITER_RECORD_CMD = 0x100000, 655 TRACE_ITER_RECORD_CMD = 0x100000,
653 TRACE_ITER_OVERWRITE = 0x200000, 656 TRACE_ITER_OVERWRITE = 0x200000,
654 TRACE_ITER_STOP_ON_FREE = 0x400000, 657 TRACE_ITER_STOP_ON_FREE = 0x400000,
658 TRACE_ITER_IRQ_INFO = 0x800000,
655}; 659};
656 660
657/* 661/*
@@ -761,16 +765,10 @@ struct filter_pred {
761 filter_pred_fn_t fn; 765 filter_pred_fn_t fn;
762 u64 val; 766 u64 val;
763 struct regex regex; 767 struct regex regex;
764 /* 768 unsigned short *ops;
765 * Leaf nodes use field_name, ops is used by AND and OR 769#ifdef CONFIG_FTRACE_STARTUP_TEST
766 * nodes. The field_name is always freed when freeing a pred. 770 struct ftrace_event_field *field;
767 * We can overload field_name for ops and have it freed 771#endif
768 * as well.
769 */
770 union {
771 char *field_name;
772 unsigned short *ops;
773 };
774 int offset; 772 int offset;
775 int not; 773 int not;
776 int op; 774 int op;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 6302747a1398..394783531cbb 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -113,3 +113,15 @@ u64 notrace trace_clock_global(void)
113 113
114 return now; 114 return now;
115} 115}
116
117static atomic64_t trace_counter;
118
119/*
120 * trace_clock_counter(): simply an atomic counter.
121 * Use the trace_counter "counter" for cases where you do not care
122 * about timings, but are interested in strict ordering.
123 */
124u64 notrace trace_clock_counter(void)
125{
126 return atomic64_add_return(1, &trace_counter);
127}
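trace_clock_counter() gives up wall-clock meaning entirely: each call returns the next value of a global atomic counter, which is what you want when the global order of events matters but their timing does not (selectable via the new "counter" entry in trace_clock). A userspace analogue using C11 atomics:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint_least64_t trace_counter;

/* same idea as trace_clock_counter(): every call returns a unique, strictly
 * increasing value, so events can be totally ordered across threads even
 * though the value says nothing about real time */
static uint64_t clock_counter(void)
{
        return atomic_fetch_add(&trace_counter, 1) + 1;
}

int main(void)
{
        printf("%llu %llu %llu\n",
               (unsigned long long)clock_counter(),
               (unsigned long long)clock_counter(),
               (unsigned long long)clock_counter());    /* 1 2 3 */
        return 0;
}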
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 581876f9f387..c212a7f934ec 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1078,7 +1078,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1078 /* First see if we did not already create this dir */ 1078 /* First see if we did not already create this dir */
1079 list_for_each_entry(system, &event_subsystems, list) { 1079 list_for_each_entry(system, &event_subsystems, list) {
1080 if (strcmp(system->name, name) == 0) { 1080 if (strcmp(system->name, name) == 0) {
1081 __get_system(system);
1082 system->nr_events++; 1081 system->nr_events++;
1083 return system->entry; 1082 return system->entry;
1084 } 1083 }
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 256764ecccd6..f04cc3136bd3 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -27,6 +27,12 @@
27#include "trace.h" 27#include "trace.h"
28#include "trace_output.h" 28#include "trace_output.h"
29 29
30#define DEFAULT_SYS_FILTER_MESSAGE \
31 "### global filter ###\n" \
32 "# Use this to set filters for multiple events.\n" \
33 "# Only events with the given fields will be affected.\n" \
34 "# If no events are modified, an error message will be displayed here"
35
30enum filter_op_ids 36enum filter_op_ids
31{ 37{
32 OP_OR, 38 OP_OR,
@@ -381,6 +387,63 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
381 return pred; 387 return pred;
382} 388}
383 389
390enum walk_return {
391 WALK_PRED_ABORT,
392 WALK_PRED_PARENT,
393 WALK_PRED_DEFAULT,
394};
395
396typedef int (*filter_pred_walkcb_t) (enum move_type move,
397 struct filter_pred *pred,
398 int *err, void *data);
399
400static int walk_pred_tree(struct filter_pred *preds,
401 struct filter_pred *root,
402 filter_pred_walkcb_t cb, void *data)
403{
404 struct filter_pred *pred = root;
405 enum move_type move = MOVE_DOWN;
406 int done = 0;
407
408 if (!preds)
409 return -EINVAL;
410
411 do {
412 int err = 0, ret;
413
414 ret = cb(move, pred, &err, data);
415 if (ret == WALK_PRED_ABORT)
416 return err;
417 if (ret == WALK_PRED_PARENT)
418 goto get_parent;
419
420 switch (move) {
421 case MOVE_DOWN:
422 if (pred->left != FILTER_PRED_INVALID) {
423 pred = &preds[pred->left];
424 continue;
425 }
426 goto get_parent;
427 case MOVE_UP_FROM_LEFT:
428 pred = &preds[pred->right];
429 move = MOVE_DOWN;
430 continue;
431 case MOVE_UP_FROM_RIGHT:
432 get_parent:
433 if (pred == root)
434 break;
435 pred = get_pred_parent(pred, preds,
436 pred->parent,
437 &move);
438 continue;
439 }
440 done = 1;
441 } while (!done);
442
443 /* We are fine. */
444 return 0;
445}
446
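walk_pred_tree() collapses the several copies of the MOVE_DOWN / MOVE_UP_FROM_LEFT / MOVE_UP_FROM_RIGHT state machine that used to live in filter_match_preds(), check_pred_tree(), count_leafs(), fold_pred() and fold_pred_tree() into one iterative walker driven by a callback. The sketch below shows the same stackless pattern on a toy pointer-based tree (hypothetical names; the real code walks an array of preds by index and uses get_pred_parent()):

#include <stdio.h>

enum move_type { MOVE_DOWN, MOVE_UP_FROM_LEFT, MOVE_UP_FROM_RIGHT };
enum walk_ret  { WALK_DEFAULT, WALK_PARENT, WALK_ABORT };

struct node {
        int value;
        struct node *left, *right, *parent;
};

typedef enum walk_ret (*walk_cb)(enum move_type move, struct node *n, void *data);

/* Iterative, stackless tree walk: the parent pointer plus the direction we
 * came from replaces recursion, just as pred->parent does in walk_pred_tree(). */
static void walk_tree(struct node *root, walk_cb cb, void *data)
{
        struct node *n = root;
        enum move_type move = MOVE_DOWN;

        for (;;) {
                enum walk_ret ret = cb(move, n, data);

                if (ret == WALK_ABORT)
                        return;
                if (ret != WALK_PARENT) {
                        if (move == MOVE_DOWN && n->left) {
                                n = n->left;
                                continue;
                        }
                        if (move == MOVE_UP_FROM_LEFT) {
                                n = n->right;
                                move = MOVE_DOWN;
                                continue;
                        }
                }
                /* climb back to the parent, remembering which child we were */
                if (n == root)
                        return;
                move = (n == n->parent->left) ? MOVE_UP_FROM_LEFT
                                              : MOVE_UP_FROM_RIGHT;
                n = n->parent;
        }
}

static enum walk_ret print_leaves(enum move_type move, struct node *n, void *data)
{
        (void)data;
        if (move == MOVE_DOWN && !n->left)
                printf("leaf %d\n", n->value);
        return WALK_DEFAULT;
}

int main(void)
{
        struct node leaves[3] = { { .value = 1 }, { .value = 2 }, { .value = 3 } };
        struct node op2 = { .left = &leaves[1], .right = &leaves[2] };
        struct node root = { .left = &leaves[0], .right = &op2 };

        leaves[0].parent = &root;
        op2.parent = &root;
        leaves[1].parent = leaves[2].parent = &op2;

        walk_tree(&root, print_leaves, NULL);   /* visits leaves 1, 2, 3 */
        return 0;
}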
384/* 447/*
385 * A series of AND or ORs where found together. Instead of 448 * A series of AND or ORs where found together. Instead of
386 * climbing up and down the tree branches, an array of the 449 * climbing up and down the tree branches, an array of the
@@ -410,99 +473,91 @@ static int process_ops(struct filter_pred *preds,
410 473
411 for (i = 0; i < op->val; i++) { 474 for (i = 0; i < op->val; i++) {
412 pred = &preds[op->ops[i]]; 475 pred = &preds[op->ops[i]];
413 match = pred->fn(pred, rec); 476 if (!WARN_ON_ONCE(!pred->fn))
477 match = pred->fn(pred, rec);
414 if (!!match == type) 478 if (!!match == type)
415 return match; 479 return match;
416 } 480 }
417 return match; 481 return match;
418} 482}
419 483
484struct filter_match_preds_data {
485 struct filter_pred *preds;
486 int match;
487 void *rec;
488};
489
490static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
491 int *err, void *data)
492{
493 struct filter_match_preds_data *d = data;
494
495 *err = 0;
496 switch (move) {
497 case MOVE_DOWN:
498 /* only AND and OR have children */
499 if (pred->left != FILTER_PRED_INVALID) {
500 /* If ops is set, then it was folded. */
501 if (!pred->ops)
502 return WALK_PRED_DEFAULT;
503 /* We can treat folded ops as a leaf node */
504 d->match = process_ops(d->preds, pred, d->rec);
505 } else {
506 if (!WARN_ON_ONCE(!pred->fn))
507 d->match = pred->fn(pred, d->rec);
508 }
509
510 return WALK_PRED_PARENT;
511 case MOVE_UP_FROM_LEFT:
512 /*
513 * Check for short circuits.
514 *
515 * Optimization: !!match == (pred->op == OP_OR)
516 * is the same as:
517 * if ((match && pred->op == OP_OR) ||
518 * (!match && pred->op == OP_AND))
519 */
520 if (!!d->match == (pred->op == OP_OR))
521 return WALK_PRED_PARENT;
522 break;
523 case MOVE_UP_FROM_RIGHT:
524 break;
525 }
526
527 return WALK_PRED_DEFAULT;
528}
529
420/* return 1 if event matches, 0 otherwise (discard) */ 530/* return 1 if event matches, 0 otherwise (discard) */
421int filter_match_preds(struct event_filter *filter, void *rec) 531int filter_match_preds(struct event_filter *filter, void *rec)
422{ 532{
423 int match = -1;
424 enum move_type move = MOVE_DOWN;
425 struct filter_pred *preds; 533 struct filter_pred *preds;
426 struct filter_pred *pred;
427 struct filter_pred *root; 534 struct filter_pred *root;
428 int n_preds; 535 struct filter_match_preds_data data = {
429 int done = 0; 536 /* match is currently meaningless */
537 .match = -1,
538 .rec = rec,
539 };
540 int n_preds, ret;
430 541
431 /* no filter is considered a match */ 542 /* no filter is considered a match */
432 if (!filter) 543 if (!filter)
433 return 1; 544 return 1;
434 545
435 n_preds = filter->n_preds; 546 n_preds = filter->n_preds;
436
437 if (!n_preds) 547 if (!n_preds)
438 return 1; 548 return 1;
439 549
440 /* 550 /*
441 * n_preds, root and filter->preds are protect with preemption disabled. 551 * n_preds, root and filter->preds are protect with preemption disabled.
442 */ 552 */
443 preds = rcu_dereference_sched(filter->preds);
444 root = rcu_dereference_sched(filter->root); 553 root = rcu_dereference_sched(filter->root);
445 if (!root) 554 if (!root)
446 return 1; 555 return 1;
447 556
448 pred = root; 557 data.preds = preds = rcu_dereference_sched(filter->preds);
449 558 ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data);
450 /* match is currently meaningless */ 559 WARN_ON(ret);
451 match = -1; 560 return data.match;
452
453 do {
454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
460 /* keep going to down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
500 continue;
501 }
502 done = 1;
503 } while (!done);
504
505 return match;
506} 561}
507EXPORT_SYMBOL_GPL(filter_match_preds); 562EXPORT_SYMBOL_GPL(filter_match_preds);
508 563
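The short-circuit check carried over into filter_match_preds_cb(), !!match == (pred->op == OP_OR), folds the two classic early-exit cases (an OR that is already true, an AND that is already false) into one comparison. A tiny standalone check that the compact form agrees with the long form for every combination:

#include <assert.h>
#include <stdio.h>

enum { OP_OR, OP_AND };         /* only the two ops the optimization cares about */

int main(void)
{
        int ops[] = { OP_OR, OP_AND };

        for (int m = 0; m <= 1; m++) {
                for (int i = 0; i < 2; i++) {
                        int op = ops[i];
                        int shortcut = (!!m == (op == OP_OR));
                        int longhand = (m && op == OP_OR) || (!m && op == OP_AND);

                        assert(shortcut == longhand);
                }
        }
        printf("!!match == (op == OP_OR) matches the long form in all cases\n");
        return 0;
}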
@@ -597,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
597 if (filter && filter->filter_string) 652 if (filter && filter->filter_string)
598 trace_seq_printf(s, "%s\n", filter->filter_string); 653 trace_seq_printf(s, "%s\n", filter->filter_string);
599 else 654 else
600 trace_seq_printf(s, "none\n"); 655 trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n");
601 mutex_unlock(&event_mutex); 656 mutex_unlock(&event_mutex);
602} 657}
603 658
@@ -628,22 +683,6 @@ find_event_field(struct ftrace_event_call *call, char *name)
628 return __find_event_field(head, name); 683 return __find_event_field(head, name);
629} 684}
630 685
631static void filter_free_pred(struct filter_pred *pred)
632{
633 if (!pred)
634 return;
635
636 kfree(pred->field_name);
637 kfree(pred);
638}
639
640static void filter_clear_pred(struct filter_pred *pred)
641{
642 kfree(pred->field_name);
643 pred->field_name = NULL;
644 pred->regex.len = 0;
645}
646
647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 686static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
648{ 687{
649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); 688 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
@@ -689,20 +728,13 @@ __pop_pred_stack(struct pred_stack *stack)
689static int filter_set_pred(struct event_filter *filter, 728static int filter_set_pred(struct event_filter *filter,
690 int idx, 729 int idx,
691 struct pred_stack *stack, 730 struct pred_stack *stack,
692 struct filter_pred *src, 731 struct filter_pred *src)
693 filter_pred_fn_t fn)
694{ 732{
695 struct filter_pred *dest = &filter->preds[idx]; 733 struct filter_pred *dest = &filter->preds[idx];
696 struct filter_pred *left; 734 struct filter_pred *left;
697 struct filter_pred *right; 735 struct filter_pred *right;
698 736
699 *dest = *src; 737 *dest = *src;
700 if (src->field_name) {
701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
702 if (!dest->field_name)
703 return -ENOMEM;
704 }
705 dest->fn = fn;
706 dest->index = idx; 738 dest->index = idx;
707 739
708 if (dest->op == OP_OR || dest->op == OP_AND) { 740 if (dest->op == OP_OR || dest->op == OP_AND) {
@@ -743,11 +775,7 @@ static int filter_set_pred(struct event_filter *filter,
743 775
744static void __free_preds(struct event_filter *filter) 776static void __free_preds(struct event_filter *filter)
745{ 777{
746 int i;
747
748 if (filter->preds) { 778 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
751 kfree(filter->preds); 779 kfree(filter->preds);
752 filter->preds = NULL; 780 filter->preds = NULL;
753 } 781 }
@@ -840,23 +868,19 @@ static void filter_free_subsystem_filters(struct event_subsystem *system)
840 } 868 }
841} 869}
842 870
843static int filter_add_pred_fn(struct filter_parse_state *ps, 871static int filter_add_pred(struct filter_parse_state *ps,
844 struct ftrace_event_call *call, 872 struct event_filter *filter,
845 struct event_filter *filter, 873 struct filter_pred *pred,
846 struct filter_pred *pred, 874 struct pred_stack *stack)
847 struct pred_stack *stack,
848 filter_pred_fn_t fn)
849{ 875{
850 int idx, err; 876 int err;
851 877
852 if (WARN_ON(filter->n_preds == filter->a_preds)) { 878 if (WARN_ON(filter->n_preds == filter->a_preds)) {
853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 879 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
854 return -ENOSPC; 880 return -ENOSPC;
855 } 881 }
856 882
857 idx = filter->n_preds; 883 err = filter_set_pred(filter, filter->n_preds, stack, pred);
858 filter_clear_pred(&filter->preds[idx]);
859 err = filter_set_pred(filter, idx, stack, pred, fn);
860 if (err) 884 if (err)
861 return err; 885 return err;
862 886
@@ -937,31 +961,15 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
937 return fn; 961 return fn;
938} 962}
939 963
940static int filter_add_pred(struct filter_parse_state *ps, 964static int init_pred(struct filter_parse_state *ps,
941 struct ftrace_event_call *call, 965 struct ftrace_event_field *field,
942 struct event_filter *filter, 966 struct filter_pred *pred)
943 struct filter_pred *pred, 967
944 struct pred_stack *stack,
945 bool dry_run)
946{ 968{
947 struct ftrace_event_field *field; 969 filter_pred_fn_t fn = filter_pred_none;
948 filter_pred_fn_t fn;
949 unsigned long long val; 970 unsigned long long val;
950 int ret; 971 int ret;
951 972
952 fn = pred->fn = filter_pred_none;
953
954 if (pred->op == OP_AND)
955 goto add_pred_fn;
956 else if (pred->op == OP_OR)
957 goto add_pred_fn;
958
959 field = find_event_field(call, pred->field_name);
960 if (!field) {
961 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
962 return -EINVAL;
963 }
964
965 pred->offset = field->offset; 973 pred->offset = field->offset;
966 974
967 if (!is_legal_op(field, pred->op)) { 975 if (!is_legal_op(field, pred->op)) {
@@ -1001,9 +1009,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
1001 if (pred->op == OP_NE) 1009 if (pred->op == OP_NE)
1002 pred->not = 1; 1010 pred->not = 1;
1003 1011
1004add_pred_fn: 1012 pred->fn = fn;
1005 if (!dry_run)
1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
1007 return 0; 1013 return 0;
1008} 1014}
1009 1015
@@ -1302,39 +1308,37 @@ parse_operand:
1302 return 0; 1308 return 0;
1303} 1309}
1304 1310
1305static struct filter_pred *create_pred(int op, char *operand1, char *operand2) 1311static struct filter_pred *create_pred(struct filter_parse_state *ps,
1312 struct ftrace_event_call *call,
1313 int op, char *operand1, char *operand2)
1306{ 1314{
1307 struct filter_pred *pred; 1315 struct ftrace_event_field *field;
1316 static struct filter_pred pred;
1308 1317
1309 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 1318 memset(&pred, 0, sizeof(pred));
1310 if (!pred) 1319 pred.op = op;
1311 return NULL;
1312 1320
1313 pred->field_name = kstrdup(operand1, GFP_KERNEL); 1321 if (op == OP_AND || op == OP_OR)
1314 if (!pred->field_name) { 1322 return &pred;
1315 kfree(pred); 1323
1324 if (!operand1 || !operand2) {
1325 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1316 return NULL; 1326 return NULL;
1317 } 1327 }
1318 1328
1319 strcpy(pred->regex.pattern, operand2); 1329 field = find_event_field(call, operand1);
1320 pred->regex.len = strlen(pred->regex.pattern); 1330 if (!field) {
1321 1331 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
1322 pred->op = op;
1323
1324 return pred;
1325}
1326
1327static struct filter_pred *create_logical_pred(int op)
1328{
1329 struct filter_pred *pred;
1330
1331 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
1332 if (!pred)
1333 return NULL; 1332 return NULL;
1333 }
1334 1334
1335 pred->op = op; 1335 strcpy(pred.regex.pattern, operand2);
1336 pred.regex.len = strlen(pred.regex.pattern);
1336 1337
1337 return pred; 1338#ifdef CONFIG_FTRACE_STARTUP_TEST
1339 pred.field = field;
1340#endif
1341 return init_pred(ps, field, &pred) ? NULL : &pred;
1338} 1342}
1339 1343
1340static int check_preds(struct filter_parse_state *ps) 1344static int check_preds(struct filter_parse_state *ps)
@@ -1375,6 +1379,23 @@ static int count_preds(struct filter_parse_state *ps)
1375 return n_preds; 1379 return n_preds;
1376} 1380}
1377 1381
1382struct check_pred_data {
1383 int count;
1384 int max;
1385};
1386
1387static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1388 int *err, void *data)
1389{
1390 struct check_pred_data *d = data;
1391
1392 if (WARN_ON(d->count++ > d->max)) {
1393 *err = -EINVAL;
1394 return WALK_PRED_ABORT;
1395 }
1396 return WALK_PRED_DEFAULT;
1397}
1398
1378/* 1399/*
1379 * The tree is walked at filtering of an event. If the tree is not correctly 1400 * The tree is walked at filtering of an event. If the tree is not correctly
1380 * built, it may cause an infinite loop. Check here that the tree does 1401 * built, it may cause an infinite loop. Check here that the tree does
@@ -1383,107 +1404,76 @@ static int count_preds(struct filter_parse_state *ps)
1383static int check_pred_tree(struct event_filter *filter, 1404static int check_pred_tree(struct event_filter *filter,
1384 struct filter_pred *root) 1405 struct filter_pred *root)
1385{ 1406{
1386 struct filter_pred *preds; 1407 struct check_pred_data data = {
1387 struct filter_pred *pred; 1408 /*
1388 enum move_type move = MOVE_DOWN; 1409 * The max that we can hit a node is three times.
1389 int count = 0; 1410 * Once going down, once coming up from left, and
1390 int done = 0; 1411 * once coming up from right. This is more than enough
1391 int max; 1412 * since leafs are only hit a single time.
1392 1413 */
1393 /* 1414 .max = 3 * filter->n_preds,
1394 * The max that we can hit a node is three times. 1415 .count = 0,
1395 * Once going down, once coming up from left, and 1416 };
1396 * once coming up from right. This is more than enough
1397 * since leafs are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1400 1417
1401 preds = filter->preds; 1418 return walk_pred_tree(filter->preds, root,
1402 if (!preds) 1419 check_pred_tree_cb, &data);
1403 return -EINVAL; 1420}
1404 pred = root;
1405 1421
1406 do { 1422static int count_leafs_cb(enum move_type move, struct filter_pred *pred,
1407 if (WARN_ON(count++ > max)) 1423 int *err, void *data)
1408 return -EINVAL; 1424{
1425 int *count = data;
1409 1426
1410 switch (move) { 1427 if ((move == MOVE_DOWN) &&
1411 case MOVE_DOWN: 1428 (pred->left == FILTER_PRED_INVALID))
1412 if (pred->left != FILTER_PRED_INVALID) { 1429 (*count)++;
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1435 1430
1436 /* We are fine. */ 1431 return WALK_PRED_DEFAULT;
1437 return 0;
1438} 1432}
1439 1433
1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root) 1434static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1441{ 1435{
1442 struct filter_pred *pred; 1436 int count = 0, ret;
1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1446 1437
1447 pred = root; 1438 ret = walk_pred_tree(preds, root, count_leafs_cb, &count);
1439 WARN_ON(ret);
1440 return count;
1441}
1448 1442
1449 do { 1443struct fold_pred_data {
1450 switch (move) { 1444 struct filter_pred *root;
1451 case MOVE_DOWN: 1445 int count;
1452 if (pred->left != FILTER_PRED_INVALID) { 1446 int children;
1453 pred = &preds[pred->left]; 1447};
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1476 1448
1477 return count; 1449static int fold_pred_cb(enum move_type move, struct filter_pred *pred,
1450 int *err, void *data)
1451{
1452 struct fold_pred_data *d = data;
1453 struct filter_pred *root = d->root;
1454
1455 if (move != MOVE_DOWN)
1456 return WALK_PRED_DEFAULT;
1457 if (pred->left != FILTER_PRED_INVALID)
1458 return WALK_PRED_DEFAULT;
1459
1460 if (WARN_ON(d->count == d->children)) {
1461 *err = -EINVAL;
1462 return WALK_PRED_ABORT;
1463 }
1464
1465 pred->index &= ~FILTER_PRED_FOLD;
1466 root->ops[d->count++] = pred->index;
1467 return WALK_PRED_DEFAULT;
1478} 1468}
1479 1469
1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root) 1470static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1481{ 1471{
1482 struct filter_pred *pred; 1472 struct fold_pred_data data = {
1483 enum move_type move = MOVE_DOWN; 1473 .root = root,
1484 int count = 0; 1474 .count = 0,
1475 };
1485 int children; 1476 int children;
1486 int done = 0;
1487 1477
1488 /* No need to keep the fold flag */ 1478 /* No need to keep the fold flag */
1489 root->index &= ~FILTER_PRED_FOLD; 1479 root->index &= ~FILTER_PRED_FOLD;
@@ -1501,37 +1491,26 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1501 return -ENOMEM; 1491 return -ENOMEM;
1502 1492
1503 root->val = children; 1493 root->val = children;
1494 data.children = children;
1495 return walk_pred_tree(preds, root, fold_pred_cb, &data);
1496}
1504 1497
1505 pred = root; 1498static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1506 do { 1499 int *err, void *data)
1507 switch (move) { 1500{
1508 case MOVE_DOWN: 1501 struct filter_pred *preds = data;
1509 if (pred->left != FILTER_PRED_INVALID) {
1510 pred = &preds[pred->left];
1511 continue;
1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1533 1502
1534 return 0; 1503 if (move != MOVE_DOWN)
1504 return WALK_PRED_DEFAULT;
1505 if (!(pred->index & FILTER_PRED_FOLD))
1506 return WALK_PRED_DEFAULT;
1507
1508 *err = fold_pred(preds, pred);
1509 if (*err)
1510 return WALK_PRED_ABORT;
1511
 1512		/* everything below is folded, continue with parent */
1513 return WALK_PRED_PARENT;
1535} 1514}
1536 1515
1537/* 1516/*
@@ -1542,51 +1521,8 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1542static int fold_pred_tree(struct event_filter *filter, 1521static int fold_pred_tree(struct event_filter *filter,
1543 struct filter_pred *root) 1522 struct filter_pred *root)
1544{ 1523{
1545 struct filter_pred *preds; 1524 return walk_pred_tree(filter->preds, root, fold_pred_tree_cb,
1546 struct filter_pred *pred; 1525 filter->preds);
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
1563 /* Folded nodes are like leafs */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1590} 1526}
1591 1527
1592static int replace_preds(struct ftrace_event_call *call, 1528static int replace_preds(struct ftrace_event_call *call,
@@ -1643,27 +1579,17 @@ static int replace_preds(struct ftrace_event_call *call,
1643 goto fail; 1579 goto fail;
1644 } 1580 }
1645 1581
1646 if (elt->op == OP_AND || elt->op == OP_OR) { 1582 pred = create_pred(ps, call, elt->op, operand1, operand2);
1647 pred = create_logical_pred(elt->op); 1583 if (!pred) {
1648 goto add_pred;
1649 }
1650
1651 if (!operand1 || !operand2) {
1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1653 err = -EINVAL; 1584 err = -EINVAL;
1654 goto fail; 1585 goto fail;
1655 } 1586 }
1656 1587
1657 pred = create_pred(elt->op, operand1, operand2); 1588 if (!dry_run) {
1658add_pred: 1589 err = filter_add_pred(ps, filter, pred, &stack);
1659 if (!pred) { 1590 if (err)
1660 err = -ENOMEM; 1591 goto fail;
1661 goto fail;
1662 } 1592 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1664 filter_free_pred(pred);
1665 if (err)
1666 goto fail;
1667 1593
1668 operand1 = operand2 = NULL; 1594 operand1 = operand2 = NULL;
1669 } 1595 }
@@ -1729,7 +1655,9 @@ static int replace_system_preds(struct event_subsystem *system,
1729 */ 1655 */
1730 err = replace_preds(call, NULL, ps, filter_string, true); 1656 err = replace_preds(call, NULL, ps, filter_string, true);
1731 if (err) 1657 if (err)
1732 goto fail; 1658 call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
1659 else
1660 call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
1733 } 1661 }
1734 1662
1735 list_for_each_entry(call, &ftrace_events, list) { 1663 list_for_each_entry(call, &ftrace_events, list) {
@@ -1738,6 +1666,9 @@ static int replace_system_preds(struct event_subsystem *system,
1738 if (strcmp(call->class->system, system->name) != 0) 1666 if (strcmp(call->class->system, system->name) != 0)
1739 continue; 1667 continue;
1740 1668
1669 if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)
1670 continue;
1671
1741 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); 1672 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1742 if (!filter_item) 1673 if (!filter_item)
1743 goto fail_mem; 1674 goto fail_mem;
@@ -1766,7 +1697,7 @@ static int replace_system_preds(struct event_subsystem *system,
1766 * replace the filter for the call. 1697 * replace the filter for the call.
1767 */ 1698 */
1768 filter = call->filter; 1699 filter = call->filter;
1769 call->filter = filter_item->filter; 1700 rcu_assign_pointer(call->filter, filter_item->filter);
1770 filter_item->filter = filter; 1701 filter_item->filter = filter;
1771 1702
1772 fail = false; 1703 fail = false;
@@ -1821,7 +1752,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1821 filter = call->filter; 1752 filter = call->filter;
1822 if (!filter) 1753 if (!filter)
1823 goto out_unlock; 1754 goto out_unlock;
1824 call->filter = NULL; 1755 RCU_INIT_POINTER(call->filter, NULL);
1825 /* Make sure the filter is not being used */ 1756 /* Make sure the filter is not being used */
1826 synchronize_sched(); 1757 synchronize_sched();
1827 __free_filter(filter); 1758 __free_filter(filter);
@@ -1862,7 +1793,7 @@ out:
1862 * string 1793 * string
1863 */ 1794 */
1864 tmp = call->filter; 1795 tmp = call->filter;
1865 call->filter = filter; 1796 rcu_assign_pointer(call->filter, filter);
1866 if (tmp) { 1797 if (tmp) {
1867 /* Make sure the call is done with the filter */ 1798 /* Make sure the call is done with the filter */
1868 synchronize_sched(); 1799 synchronize_sched();
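These hunks publish and retire call->filter with rcu_assign_pointer()/RCU_INIT_POINTER() plus synchronize_sched(), so filter_match_preds(), which runs with preemption disabled, always sees either the complete old filter or the complete new one, never a half-initialized pointer. A hedged sketch of the general publish/wait/free pattern with generic names; it uses rcu_read_lock()/synchronize_rcu() where the filter code relies on preempt-disabled sections and synchronize_sched():

#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
        int threshold;
};

static struct cfg __rcu *active_cfg;

/* reader: enter an RCU section, dereference once, use, drop */
static int cfg_threshold(void)
{
        struct cfg *c;
        int ret = 0;

        rcu_read_lock();
        c = rcu_dereference(active_cfg);
        if (c)
                ret = c->threshold;
        rcu_read_unlock();
        return ret;
}

/* writer: publish the new object, wait out readers, then free the old one */
static int cfg_replace(int threshold)
{
        struct cfg *new, *old;

        new = kmalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return -ENOMEM;
        new->threshold = threshold;

        old = rcu_dereference_protected(active_cfg, 1); /* caller serializes updates */
        rcu_assign_pointer(active_cfg, new);            /* publish with a barrier */
        if (old) {
                synchronize_rcu();                      /* wait for current readers */
                kfree(old);
        }
        return 0;
}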
@@ -1913,7 +1844,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1913 if (!filter) 1844 if (!filter)
1914 goto out; 1845 goto out;
1915 1846
1916 replace_filter_string(filter, filter_string); 1847 /* System filters just show a default message */
1848 kfree(filter->filter_string);
1849 filter->filter_string = NULL;
1850
1917 /* 1851 /*
1918 * No event actually uses the system filter 1852 * No event actually uses the system filter
1919 * we can free it without synchronize_sched(). 1853 * we can free it without synchronize_sched().
@@ -1923,14 +1857,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1923 1857
1924 parse_init(ps, filter_ops, filter_string); 1858 parse_init(ps, filter_ops, filter_string);
1925 err = filter_parse(ps); 1859 err = filter_parse(ps);
1926 if (err) { 1860 if (err)
1927 append_filter_err(ps, system->filter); 1861 goto err_filter;
1928 goto out;
1929 }
1930 1862
1931 err = replace_system_preds(system, ps, filter_string); 1863 err = replace_system_preds(system, ps, filter_string);
1932 if (err) 1864 if (err)
1933 append_filter_err(ps, system->filter); 1865 goto err_filter;
1934 1866
1935out: 1867out:
1936 filter_opstack_clear(ps); 1868 filter_opstack_clear(ps);
@@ -1940,6 +1872,11 @@ out_unlock:
1940 mutex_unlock(&event_mutex); 1872 mutex_unlock(&event_mutex);
1941 1873
1942 return err; 1874 return err;
1875
1876err_filter:
1877 replace_filter_string(filter, filter_string);
1878 append_filter_err(ps, system->filter);
1879 goto out;
1943} 1880}
1944 1881
1945#ifdef CONFIG_PERF_EVENTS 1882#ifdef CONFIG_PERF_EVENTS
@@ -1958,17 +1895,14 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1958 int err; 1895 int err;
1959 struct event_filter *filter; 1896 struct event_filter *filter;
1960 struct filter_parse_state *ps; 1897 struct filter_parse_state *ps;
1961 struct ftrace_event_call *call = NULL; 1898 struct ftrace_event_call *call;
1962 1899
1963 mutex_lock(&event_mutex); 1900 mutex_lock(&event_mutex);
1964 1901
1965 list_for_each_entry(call, &ftrace_events, list) { 1902 call = event->tp_event;
1966 if (call->event.type == event_id)
1967 break;
1968 }
1969 1903
1970 err = -EINVAL; 1904 err = -EINVAL;
1971 if (&call->list == &ftrace_events) 1905 if (!call)
1972 goto out_unlock; 1906 goto out_unlock;
1973 1907
1974 err = -EEXIST; 1908 err = -EEXIST;
@@ -2012,3 +1946,215 @@ out_unlock:
2012 1946
2013#endif /* CONFIG_PERF_EVENTS */ 1947#endif /* CONFIG_PERF_EVENTS */
2014 1948
1949#ifdef CONFIG_FTRACE_STARTUP_TEST
1950
1951#include <linux/types.h>
1952#include <linux/tracepoint.h>
1953
1954#define CREATE_TRACE_POINTS
1955#include "trace_events_filter_test.h"
1956
1957static int test_get_filter(char *filter_str, struct ftrace_event_call *call,
1958 struct event_filter **pfilter)
1959{
1960 struct event_filter *filter;
1961 struct filter_parse_state *ps;
1962 int err = -ENOMEM;
1963
1964 filter = __alloc_filter();
1965 if (!filter)
1966 goto out;
1967
1968 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1969 if (!ps)
1970 goto free_filter;
1971
1972 parse_init(ps, filter_ops, filter_str);
1973 err = filter_parse(ps);
1974 if (err)
1975 goto free_ps;
1976
1977 err = replace_preds(call, filter, ps, filter_str, false);
1978 if (!err)
1979 *pfilter = filter;
1980
1981 free_ps:
1982 filter_opstack_clear(ps);
1983 postfix_clear(ps);
1984 kfree(ps);
1985
1986 free_filter:
1987 if (err)
1988 __free_filter(filter);
1989
1990 out:
1991 return err;
1992}
1993
1994#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
1995{ \
1996 .filter = FILTER, \
1997 .rec = { .a = va, .b = vb, .c = vc, .d = vd, \
1998 .e = ve, .f = vf, .g = vg, .h = vh }, \
1999 .match = m, \
2000 .not_visited = nvisit, \
2001}
2002#define YES 1
2003#define NO 0
2004
2005static struct test_filter_data_t {
2006 char *filter;
2007 struct ftrace_raw_ftrace_test_filter rec;
2008 int match;
2009 char *not_visited;
2010} test_filter_data[] = {
2011#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \
2012 "e == 1 && f == 1 && g == 1 && h == 1"
2013 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""),
2014 DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"),
2015 DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""),
2016#undef FILTER
2017#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \
2018 "e == 1 || f == 1 || g == 1 || h == 1"
2019 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
2020 DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2021 DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"),
2022#undef FILTER
2023#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \
2024 "(e == 1 || f == 1) && (g == 1 || h == 1)"
2025 DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"),
2026 DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2027 DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"),
2028 DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"),
2029#undef FILTER
2030#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \
2031 "(e == 1 && f == 1) || (g == 1 && h == 1)"
2032 DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"),
2033 DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""),
2034 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2035#undef FILTER
2036#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \
2037 "(e == 1 && f == 1) || (g == 1 && h == 1)"
2038 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"),
2039 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2040 DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""),
2041#undef FILTER
2042#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \
2043 "(e == 1 || f == 1)) && (g == 1 || h == 1)"
2044 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"),
2045 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
2046 DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"),
2047#undef FILTER
2048#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \
2049 "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))"
2050 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"),
2051 DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2052 DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""),
2053#undef FILTER
2054#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \
2055 "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))"
2056 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"),
2057 DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2058 DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"),
2059};
2060
2061#undef DATA_REC
2062#undef FILTER
2063#undef YES
2064#undef NO
2065
2066#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t))
2067
2068static int test_pred_visited;
2069
2070static int test_pred_visited_fn(struct filter_pred *pred, void *event)
2071{
2072 struct ftrace_event_field *field = pred->field;
2073
2074 test_pred_visited = 1;
2075 printk(KERN_INFO "\npred visited %s\n", field->name);
2076 return 1;
2077}
2078
2079static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred,
2080 int *err, void *data)
2081{
2082 char *fields = data;
2083
2084 if ((move == MOVE_DOWN) &&
2085 (pred->left == FILTER_PRED_INVALID)) {
2086 struct ftrace_event_field *field = pred->field;
2087
2088 if (!field) {
 2089			WARN(1, "all leaves should have a field defined");
2090 return WALK_PRED_DEFAULT;
2091 }
2092 if (!strchr(fields, *field->name))
2093 return WALK_PRED_DEFAULT;
2094
2095 WARN_ON(!pred->fn);
2096 pred->fn = test_pred_visited_fn;
2097 }
2098 return WALK_PRED_DEFAULT;
2099}
2100
2101static __init int ftrace_test_event_filter(void)
2102{
2103 int i;
2104
2105 printk(KERN_INFO "Testing ftrace filter: ");
2106
2107 for (i = 0; i < DATA_CNT; i++) {
2108 struct event_filter *filter = NULL;
2109 struct test_filter_data_t *d = &test_filter_data[i];
2110 int err;
2111
2112 err = test_get_filter(d->filter, &event_ftrace_test_filter,
2113 &filter);
2114 if (err) {
2115 printk(KERN_INFO
2116 "Failed to get filter for '%s', err %d\n",
2117 d->filter, err);
2118 break;
2119 }
2120
2121 /*
2122 * The preemption disabling is not really needed for self
2123 * tests, but the rcu dereference will complain without it.
2124 */
2125 preempt_disable();
2126 if (*d->not_visited)
2127 walk_pred_tree(filter->preds, filter->root,
2128 test_walk_pred_cb,
2129 d->not_visited);
2130
2131 test_pred_visited = 0;
2132 err = filter_match_preds(filter, &d->rec);
2133 preempt_enable();
2134
2135 __free_filter(filter);
2136
2137 if (test_pred_visited) {
2138 printk(KERN_INFO
2139 "Failed, unwanted pred visited for filter %s\n",
2140 d->filter);
2141 break;
2142 }
2143
2144 if (err != d->match) {
2145 printk(KERN_INFO
2146 "Failed to match filter '%s', expected %d\n",
2147 d->filter, d->match);
2148 break;
2149 }
2150 }
2151
2152 if (i == DATA_CNT)
2153 printk(KERN_CONT "OK\n");
2154
2155 return 0;
2156}
2157
2158late_initcall(ftrace_test_event_filter);
2159
2160#endif /* CONFIG_FTRACE_STARTUP_TEST */
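The self-test added above is table-driven: each DATA_REC() entry pairs a sample record with the result filter_match_preds() is expected to return, plus a list of fields that must not be visited. For readers who want the shape of that pattern outside the kernel, here is a minimal userspace sketch of a table-driven filter test; the struct layout, the evaluate() stand-in and the sample expression are illustrative assumptions, not the kernel's API.

/*
 * Table-driven test sketch: each entry carries an input record and the
 * expected result of evaluating a fixed boolean filter against it.
 */
#include <stdio.h>

struct rec { int a, b, c, d; };

struct test_case {
	const char *filter;	/* human-readable description only */
	struct rec rec;		/* input record */
	int match;		/* expected result */
};

/* Stand-in for the real predicate tree: "a == 1 && (b == 1 || c == 1)" */
static int evaluate(const struct rec *r)
{
	return r->a == 1 && (r->b == 1 || r->c == 1);
}

static const struct test_case cases[] = {
	{ "a == 1 && (b == 1 || c == 1)", { 1, 1, 0, 0 }, 1 },
	{ "a == 1 && (b == 1 || c == 1)", { 1, 0, 0, 0 }, 0 },
	{ "a == 1 && (b == 1 || c == 1)", { 0, 1, 1, 0 }, 0 },
};

int main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
		int got = evaluate(&cases[i].rec);

		if (got != cases[i].match) {
			printf("FAIL: '%s', expected %d got %d\n",
			       cases[i].filter, cases[i].match, got);
			return 1;
		}
	}
	printf("OK\n");
	return 0;
}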
diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h
new file mode 100644
index 000000000000..bfd4dba0d603
--- /dev/null
+++ b/kernel/trace/trace_events_filter_test.h
@@ -0,0 +1,50 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM test
3
4#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_TEST_H
6
7#include <linux/tracepoint.h>
8
9TRACE_EVENT(ftrace_test_filter,
10
11 TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h),
12
13 TP_ARGS(a, b, c, d, e, f, g, h),
14
15 TP_STRUCT__entry(
16 __field(int, a)
17 __field(int, b)
18 __field(int, c)
19 __field(int, d)
20 __field(int, e)
21 __field(int, f)
22 __field(int, g)
23 __field(int, h)
24 ),
25
26 TP_fast_assign(
27 __entry->a = a;
28 __entry->b = b;
29 __entry->c = c;
30 __entry->d = d;
31 __entry->e = e;
32 __entry->f = f;
33 __entry->g = g;
34 __entry->h = h;
35 ),
36
37 TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d",
38 __entry->a, __entry->b, __entry->c, __entry->d,
39 __entry->e, __entry->f, __entry->g, __entry->h)
40);
41
42#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */
43
44#undef TRACE_INCLUDE_PATH
45#undef TRACE_INCLUDE_FILE
46#define TRACE_INCLUDE_PATH .
47#define TRACE_INCLUDE_FILE trace_events_filter_test
48
49/* This part must be outside protection */
50#include <trace/define_trace.h>
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 667aa8cc0cfc..99d20e920368 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -23,7 +23,7 @@ static int tracer_enabled __read_mostly;
23 23
24static DEFINE_PER_CPU(int, tracing_cpu); 24static DEFINE_PER_CPU(int, tracing_cpu);
25 25
26static DEFINE_SPINLOCK(max_trace_lock); 26static DEFINE_RAW_SPINLOCK(max_trace_lock);
27 27
28enum { 28enum {
29 TRACER_IRQS_OFF = (1 << 1), 29 TRACER_IRQS_OFF = (1 << 1),
@@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
280} 280}
281 281
282static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } 282static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
283static void irqsoff_print_header(struct seq_file *s) { }
284static void irqsoff_trace_open(struct trace_iterator *iter) { } 283static void irqsoff_trace_open(struct trace_iterator *iter) { }
285static void irqsoff_trace_close(struct trace_iterator *iter) { } 284static void irqsoff_trace_close(struct trace_iterator *iter) { }
285
286#ifdef CONFIG_FUNCTION_TRACER
287static void irqsoff_print_header(struct seq_file *s)
288{
289 trace_default_header(s);
290}
291#else
292static void irqsoff_print_header(struct seq_file *s)
293{
294 trace_latency_header(s);
295}
296#endif /* CONFIG_FUNCTION_TRACER */
286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 297#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
287 298
288/* 299/*
@@ -321,7 +332,7 @@ check_critical_timing(struct trace_array *tr,
321 if (!report_latency(delta)) 332 if (!report_latency(delta))
322 goto out; 333 goto out;
323 334
324 spin_lock_irqsave(&max_trace_lock, flags); 335 raw_spin_lock_irqsave(&max_trace_lock, flags);
325 336
326 /* check if we are still the max latency */ 337 /* check if we are still the max latency */
327 if (!report_latency(delta)) 338 if (!report_latency(delta))
@@ -344,7 +355,7 @@ check_critical_timing(struct trace_array *tr,
344 max_sequence++; 355 max_sequence++;
345 356
346out_unlock: 357out_unlock:
347 spin_unlock_irqrestore(&max_trace_lock, flags); 358 raw_spin_unlock_irqrestore(&max_trace_lock, flags);
348 359
349out: 360out:
350 data->critical_sequence = max_sequence; 361 data->critical_sequence = max_sequence;
@@ -505,13 +516,13 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
505#ifdef CONFIG_PREEMPT_TRACER 516#ifdef CONFIG_PREEMPT_TRACER
506void trace_preempt_on(unsigned long a0, unsigned long a1) 517void trace_preempt_on(unsigned long a0, unsigned long a1)
507{ 518{
508 if (preempt_trace()) 519 if (preempt_trace() && !irq_trace())
509 stop_critical_timing(a0, a1); 520 stop_critical_timing(a0, a1);
510} 521}
511 522
512void trace_preempt_off(unsigned long a0, unsigned long a1) 523void trace_preempt_off(unsigned long a0, unsigned long a1)
513{ 524{
514 if (preempt_trace()) 525 if (preempt_trace() && !irq_trace())
515 start_critical_timing(a0, a1); 526 start_critical_timing(a0, a1);
516} 527}
517#endif /* CONFIG_PREEMPT_TRACER */ 528#endif /* CONFIG_PREEMPT_TRACER */
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5fb3697bf0e5..00d527c945a4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -836,11 +836,17 @@ static void __unregister_trace_probe(struct trace_probe *tp)
836} 836}
837 837
838/* Unregister a trace_probe and probe_event: call with locking probe_lock */ 838/* Unregister a trace_probe and probe_event: call with locking probe_lock */
839static void unregister_trace_probe(struct trace_probe *tp) 839static int unregister_trace_probe(struct trace_probe *tp)
840{ 840{
841 /* Enabled event can not be unregistered */
842 if (trace_probe_is_enabled(tp))
843 return -EBUSY;
844
841 __unregister_trace_probe(tp); 845 __unregister_trace_probe(tp);
842 list_del(&tp->list); 846 list_del(&tp->list);
843 unregister_probe_event(tp); 847 unregister_probe_event(tp);
848
849 return 0;
844} 850}
845 851
846/* Register a trace_probe and probe_event */ 852/* Register a trace_probe and probe_event */
@@ -854,7 +860,9 @@ static int register_trace_probe(struct trace_probe *tp)
854 /* Delete old (same name) event if exist */ 860 /* Delete old (same name) event if exist */
855 old_tp = find_trace_probe(tp->call.name, tp->call.class->system); 861 old_tp = find_trace_probe(tp->call.name, tp->call.class->system);
856 if (old_tp) { 862 if (old_tp) {
857 unregister_trace_probe(old_tp); 863 ret = unregister_trace_probe(old_tp);
864 if (ret < 0)
865 goto end;
858 free_trace_probe(old_tp); 866 free_trace_probe(old_tp);
859 } 867 }
860 868
@@ -892,6 +900,7 @@ static int trace_probe_module_callback(struct notifier_block *nb,
892 mutex_lock(&probe_lock); 900 mutex_lock(&probe_lock);
893 list_for_each_entry(tp, &probe_list, list) { 901 list_for_each_entry(tp, &probe_list, list) {
894 if (trace_probe_within_module(tp, mod)) { 902 if (trace_probe_within_module(tp, mod)) {
903 /* Don't need to check busy - this should have gone. */
895 __unregister_trace_probe(tp); 904 __unregister_trace_probe(tp);
896 ret = __register_trace_probe(tp); 905 ret = __register_trace_probe(tp);
897 if (ret) 906 if (ret)
@@ -1205,10 +1214,11 @@ static int create_trace_probe(int argc, char **argv)
1205 return -ENOENT; 1214 return -ENOENT;
1206 } 1215 }
1207 /* delete an event */ 1216 /* delete an event */
1208 unregister_trace_probe(tp); 1217 ret = unregister_trace_probe(tp);
1209 free_trace_probe(tp); 1218 if (ret == 0)
1219 free_trace_probe(tp);
1210 mutex_unlock(&probe_lock); 1220 mutex_unlock(&probe_lock);
1211 return 0; 1221 return ret;
1212 } 1222 }
1213 1223
1214 if (argc < 2) { 1224 if (argc < 2) {
@@ -1317,18 +1327,29 @@ error:
1317 return ret; 1327 return ret;
1318} 1328}
1319 1329
1320static void release_all_trace_probes(void) 1330static int release_all_trace_probes(void)
1321{ 1331{
1322 struct trace_probe *tp; 1332 struct trace_probe *tp;
1333 int ret = 0;
1323 1334
1324 mutex_lock(&probe_lock); 1335 mutex_lock(&probe_lock);
1336 /* Ensure no probe is in use. */
1337 list_for_each_entry(tp, &probe_list, list)
1338 if (trace_probe_is_enabled(tp)) {
1339 ret = -EBUSY;
1340 goto end;
1341 }
1325 /* TODO: Use batch unregistration */ 1342 /* TODO: Use batch unregistration */
1326 while (!list_empty(&probe_list)) { 1343 while (!list_empty(&probe_list)) {
1327 tp = list_entry(probe_list.next, struct trace_probe, list); 1344 tp = list_entry(probe_list.next, struct trace_probe, list);
1328 unregister_trace_probe(tp); 1345 unregister_trace_probe(tp);
1329 free_trace_probe(tp); 1346 free_trace_probe(tp);
1330 } 1347 }
1348
1349end:
1331 mutex_unlock(&probe_lock); 1350 mutex_unlock(&probe_lock);
1351
1352 return ret;
1332} 1353}
1333 1354
1334/* Probes listing interfaces */ 1355/* Probes listing interfaces */
@@ -1380,9 +1401,13 @@ static const struct seq_operations probes_seq_op = {
1380 1401
1381static int probes_open(struct inode *inode, struct file *file) 1402static int probes_open(struct inode *inode, struct file *file)
1382{ 1403{
1383 if ((file->f_mode & FMODE_WRITE) && 1404 int ret;
1384 (file->f_flags & O_TRUNC)) 1405
1385 release_all_trace_probes(); 1406 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
1407 ret = release_all_trace_probes();
1408 if (ret < 0)
1409 return ret;
1410 }
1386 1411
1387 return seq_open(file, &probes_seq_op); 1412 return seq_open(file, &probes_seq_op);
1388} 1413}
@@ -2055,6 +2080,21 @@ static __init int kprobe_trace_self_tests_init(void)
2055 2080
2056 ret = target(1, 2, 3, 4, 5, 6); 2081 ret = target(1, 2, 3, 4, 5, 6);
2057 2082
 2083	/* Disable trace points before removing them */
2084 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
2085 if (WARN_ON_ONCE(tp == NULL)) {
2086 pr_warning("error on getting test probe.\n");
2087 warn++;
2088 } else
2089 disable_trace_probe(tp, TP_FLAG_TRACE);
2090
2091 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
2092 if (WARN_ON_ONCE(tp == NULL)) {
2093 pr_warning("error on getting 2nd test probe.\n");
2094 warn++;
2095 } else
2096 disable_trace_probe(tp, TP_FLAG_TRACE);
2097
2058 ret = command_trace_probe("-:testprobe"); 2098 ret = command_trace_probe("-:testprobe");
2059 if (WARN_ON_ONCE(ret)) { 2099 if (WARN_ON_ONCE(ret)) {
2060 pr_warning("error on deleting a probe.\n"); 2100 pr_warning("error on deleting a probe.\n");
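The trace_kprobe.c changes above make every unregister path fail with -EBUSY while a probe is still enabled, instead of silently tearing it down. A minimal sketch of that guard, using a toy probe structure rather than the kernel's probe_list/mutex machinery (all names here are illustrative):

#include <errno.h>
#include <stdio.h>

struct probe {
	const char *name;
	int enabled;		/* stands in for trace_probe_is_enabled() */
	int registered;
};

/* Refuse to tear down a probe that is still in use. */
static int unregister_probe(struct probe *p)
{
	if (p->enabled)
		return -EBUSY;
	p->registered = 0;
	return 0;
}

int main(void)
{
	struct probe p = { .name = "testprobe", .enabled = 1, .registered = 1 };

	if (unregister_probe(&p) == -EBUSY)
		printf("%s is busy, disable it first\n", p.name);

	p.enabled = 0;			/* like disable_trace_probe() */
	if (unregister_probe(&p) == 0)
		printf("%s unregistered\n", p.name);
	return 0;
}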
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 51999309a6cf..0d6ff3555942 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -627,11 +627,23 @@ int trace_print_context(struct trace_iterator *iter)
627 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 627 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
628 unsigned long secs = (unsigned long)t; 628 unsigned long secs = (unsigned long)t;
629 char comm[TASK_COMM_LEN]; 629 char comm[TASK_COMM_LEN];
630 int ret;
630 631
631 trace_find_cmdline(entry->pid, comm); 632 trace_find_cmdline(entry->pid, comm);
632 633
633 return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", 634 ret = trace_seq_printf(s, "%16s-%-5d [%03d] ",
634 comm, entry->pid, iter->cpu, secs, usec_rem); 635 comm, entry->pid, iter->cpu);
636 if (!ret)
637 return 0;
638
639 if (trace_flags & TRACE_ITER_IRQ_INFO) {
640 ret = trace_print_lat_fmt(s, entry);
641 if (!ret)
642 return 0;
643 }
644
645 return trace_seq_printf(s, " %5lu.%06lu: ",
646 secs, usec_rem);
635} 647}
636 648
637int trace_print_lat_context(struct trace_iterator *iter) 649int trace_print_lat_context(struct trace_iterator *iter)
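The trace_print_context() change splits one trace_seq_printf() call into stages so each return value can be checked and the function bails out as soon as the sequence buffer is full. Below is a rough userspace analogue of that "stop on the first failed write" pattern, with a bounded buffer standing in for struct trace_seq; the names and the sample output are illustrative, not the kernel's API.

#include <stdarg.h>
#include <stdio.h>

struct seq_buf {
	char buf[64];
	size_t len;
};

/* Returns 1 on success, 0 once the buffer is full - like trace_seq_printf(). */
static int seq_printf(struct seq_buf *s, const char *fmt, ...)
{
	va_list ap;
	int n;

	va_start(ap, fmt);
	n = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
	va_end(ap);

	if (n < 0 || (size_t)n >= sizeof(s->buf) - s->len)
		return 0;
	s->len += n;
	return 1;
}

static int print_context(struct seq_buf *s, int irq_info)
{
	if (!seq_printf(s, "%16s-%-5d [%03d] ", "bash", 1234, 0))
		return 0;
	if (irq_info && !seq_printf(s, "%s", "d..1"))
		return 0;
	return seq_printf(s, " %5lu.%06lu: ", 17UL, 42UL);
}

int main(void)
{
	struct seq_buf s = { .len = 0 };

	printf("ok=%d seq='%s'\n", print_context(&s, 1), s.buf);
	return 0;
}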
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 1f06468a10d7..6fd4ffd042f9 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -59,18 +59,19 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
59 continue; 59 continue;
60 } 60 }
61 61
62 fmt = NULL;
62 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); 63 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
63 if (tb_fmt) 64 if (tb_fmt) {
64 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); 65 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
65 if (tb_fmt && fmt) { 66 if (fmt) {
66 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); 67 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
67 strcpy(fmt, *iter); 68 strcpy(fmt, *iter);
68 tb_fmt->fmt = fmt; 69 tb_fmt->fmt = fmt;
69 *iter = tb_fmt->fmt; 70 } else
70 } else { 71 kfree(tb_fmt);
71 kfree(tb_fmt);
72 *iter = NULL;
73 } 72 }
73 *iter = fmt;
74
74 } 75 }
75 mutex_unlock(&btrace_mutex); 76 mutex_unlock(&btrace_mutex);
76} 77}
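The trace_printk.c hunk reworks an allocation sequence so the list node is only linked (and the format only duplicated) when both allocations succeed, and *iter is assigned exactly once from a single variable. A standalone sketch of that pattern with generic names (not the kernel's trace_bprintk machinery):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fmt_node {
	struct fmt_node *next;
	char *fmt;
};

static struct fmt_node *fmt_list;

/*
 * Duplicate 'src' into a list node.  On any failure nothing is linked,
 * nothing leaks, and NULL is returned - mirroring the fixed flow above
 * where *iter is set from one 'fmt' variable at the end.
 */
static const char *hold_format(const char *src)
{
	char *fmt = NULL;
	struct fmt_node *node;

	node = malloc(sizeof(*node));
	if (node) {
		fmt = malloc(strlen(src) + 1);
		if (fmt) {
			strcpy(fmt, src);
			node->fmt = fmt;
			node->next = fmt_list;
			fmt_list = node;
		} else {
			free(node);	/* second allocation failed: undo the first */
		}
	}
	return fmt;
}

int main(void)
{
	const char *held = hold_format("hello %s\n");

	printf("held: %s", held ? held : "(allocation failed)\n");
	return 0;
}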
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e4a70c0c71b6..ff791ea48b57 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
280} 280}
281 281
282static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } 282static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
283static void wakeup_print_header(struct seq_file *s) { }
284static void wakeup_trace_open(struct trace_iterator *iter) { } 283static void wakeup_trace_open(struct trace_iterator *iter) { }
285static void wakeup_trace_close(struct trace_iterator *iter) { } 284static void wakeup_trace_close(struct trace_iterator *iter) { }
285
286#ifdef CONFIG_FUNCTION_TRACER
287static void wakeup_print_header(struct seq_file *s)
288{
289 trace_default_header(s);
290}
291#else
292static void wakeup_print_header(struct seq_file *s)
293{
294 trace_latency_header(s);
295}
296#endif /* CONFIG_FUNCTION_TRACER */
286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 297#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
287 298
288/* 299/*
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index ee7b5a0bb9f8..cb654542c1a1 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -2,6 +2,7 @@
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/slab.h> 3#include <linux/slab.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
5#include <linux/ftrace.h> 6#include <linux/ftrace.h>
6#include <linux/perf_event.h> 7#include <linux/perf_event.h>
7#include <asm/syscall.h> 8#include <asm/syscall.h>
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index b219f1449c54..db110b8ae030 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -34,11 +34,16 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[];
34static const int tracepoint_debug; 34static const int tracepoint_debug;
35 35
36/* 36/*
37 * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the 37 * Tracepoints mutex protects the builtin and module tracepoints and the hash
38 * builtin and module tracepoints and the hash table. 38 * table, as well as the local module list.
39 */ 39 */
40static DEFINE_MUTEX(tracepoints_mutex); 40static DEFINE_MUTEX(tracepoints_mutex);
41 41
42#ifdef CONFIG_MODULES
43/* Local list of struct module */
44static LIST_HEAD(tracepoint_module_list);
45#endif /* CONFIG_MODULES */
46
42/* 47/*
43 * Tracepoint hash table, containing the active tracepoints. 48 * Tracepoint hash table, containing the active tracepoints.
44 * Protected by tracepoints_mutex. 49 * Protected by tracepoints_mutex.
@@ -292,9 +297,10 @@ static void disable_tracepoint(struct tracepoint *elem)
292 * @end: end of the range 297 * @end: end of the range
293 * 298 *
294 * Updates the probe callback corresponding to a range of tracepoints. 299 * Updates the probe callback corresponding to a range of tracepoints.
300 * Called with tracepoints_mutex held.
295 */ 301 */
296void tracepoint_update_probe_range(struct tracepoint * const *begin, 302static void tracepoint_update_probe_range(struct tracepoint * const *begin,
297 struct tracepoint * const *end) 303 struct tracepoint * const *end)
298{ 304{
299 struct tracepoint * const *iter; 305 struct tracepoint * const *iter;
300 struct tracepoint_entry *mark_entry; 306 struct tracepoint_entry *mark_entry;
@@ -302,7 +308,6 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin,
302 if (!begin) 308 if (!begin)
303 return; 309 return;
304 310
305 mutex_lock(&tracepoints_mutex);
306 for (iter = begin; iter < end; iter++) { 311 for (iter = begin; iter < end; iter++) {
307 mark_entry = get_tracepoint((*iter)->name); 312 mark_entry = get_tracepoint((*iter)->name);
308 if (mark_entry) { 313 if (mark_entry) {
@@ -312,11 +317,27 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin,
312 disable_tracepoint(*iter); 317 disable_tracepoint(*iter);
313 } 318 }
314 } 319 }
315 mutex_unlock(&tracepoints_mutex);
316} 320}
317 321
322#ifdef CONFIG_MODULES
323void module_update_tracepoints(void)
324{
325 struct tp_module *tp_mod;
326
327 list_for_each_entry(tp_mod, &tracepoint_module_list, list)
328 tracepoint_update_probe_range(tp_mod->tracepoints_ptrs,
329 tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints);
330}
331#else /* CONFIG_MODULES */
332void module_update_tracepoints(void)
333{
334}
335#endif /* CONFIG_MODULES */
336
337
318/* 338/*
319 * Update probes, removing the faulty probes. 339 * Update probes, removing the faulty probes.
340 * Called with tracepoints_mutex held.
320 */ 341 */
321static void tracepoint_update_probes(void) 342static void tracepoint_update_probes(void)
322{ 343{
@@ -359,11 +380,12 @@ int tracepoint_probe_register(const char *name, void *probe, void *data)
359 380
360 mutex_lock(&tracepoints_mutex); 381 mutex_lock(&tracepoints_mutex);
361 old = tracepoint_add_probe(name, probe, data); 382 old = tracepoint_add_probe(name, probe, data);
362 mutex_unlock(&tracepoints_mutex); 383 if (IS_ERR(old)) {
363 if (IS_ERR(old)) 384 mutex_unlock(&tracepoints_mutex);
364 return PTR_ERR(old); 385 return PTR_ERR(old);
365 386 }
366 tracepoint_update_probes(); /* may update entry */ 387 tracepoint_update_probes(); /* may update entry */
388 mutex_unlock(&tracepoints_mutex);
367 release_probes(old); 389 release_probes(old);
368 return 0; 390 return 0;
369} 391}
@@ -402,11 +424,12 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data)
402 424
403 mutex_lock(&tracepoints_mutex); 425 mutex_lock(&tracepoints_mutex);
404 old = tracepoint_remove_probe(name, probe, data); 426 old = tracepoint_remove_probe(name, probe, data);
405 mutex_unlock(&tracepoints_mutex); 427 if (IS_ERR(old)) {
406 if (IS_ERR(old)) 428 mutex_unlock(&tracepoints_mutex);
407 return PTR_ERR(old); 429 return PTR_ERR(old);
408 430 }
409 tracepoint_update_probes(); /* may update entry */ 431 tracepoint_update_probes(); /* may update entry */
432 mutex_unlock(&tracepoints_mutex);
410 release_probes(old); 433 release_probes(old);
411 return 0; 434 return 0;
412} 435}
@@ -489,9 +512,8 @@ void tracepoint_probe_update_all(void)
489 if (!list_empty(&old_probes)) 512 if (!list_empty(&old_probes))
490 list_replace_init(&old_probes, &release_probes); 513 list_replace_init(&old_probes, &release_probes);
491 need_update = 0; 514 need_update = 0;
492 mutex_unlock(&tracepoints_mutex);
493
494 tracepoint_update_probes(); 515 tracepoint_update_probes();
516 mutex_unlock(&tracepoints_mutex);
495 list_for_each_entry_safe(pos, next, &release_probes, u.list) { 517 list_for_each_entry_safe(pos, next, &release_probes, u.list) {
496 list_del(&pos->u.list); 518 list_del(&pos->u.list);
497 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); 519 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
@@ -509,7 +531,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
509 * Will return the first tracepoint in the range if the input tracepoint is 531 * Will return the first tracepoint in the range if the input tracepoint is
510 * NULL. 532 * NULL.
511 */ 533 */
512int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, 534static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
513 struct tracepoint * const *begin, struct tracepoint * const *end) 535 struct tracepoint * const *begin, struct tracepoint * const *end)
514{ 536{
515 if (!*tracepoint && begin != end) { 537 if (!*tracepoint && begin != end) {
@@ -520,11 +542,12 @@ int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
520 return 1; 542 return 1;
521 return 0; 543 return 0;
522} 544}
523EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
524 545
546#ifdef CONFIG_MODULES
525static void tracepoint_get_iter(struct tracepoint_iter *iter) 547static void tracepoint_get_iter(struct tracepoint_iter *iter)
526{ 548{
527 int found = 0; 549 int found = 0;
550 struct tp_module *iter_mod;
528 551
529 /* Core kernel tracepoints */ 552 /* Core kernel tracepoints */
530 if (!iter->module) { 553 if (!iter->module) {
@@ -534,12 +557,43 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
534 if (found) 557 if (found)
535 goto end; 558 goto end;
536 } 559 }
537 /* tracepoints in modules. */ 560 /* Tracepoints in modules */
538 found = module_get_iter_tracepoints(iter); 561 mutex_lock(&tracepoints_mutex);
562 list_for_each_entry(iter_mod, &tracepoint_module_list, list) {
563 /*
564 * Sorted module list
565 */
566 if (iter_mod < iter->module)
567 continue;
568 else if (iter_mod > iter->module)
569 iter->tracepoint = NULL;
570 found = tracepoint_get_iter_range(&iter->tracepoint,
571 iter_mod->tracepoints_ptrs,
572 iter_mod->tracepoints_ptrs
573 + iter_mod->num_tracepoints);
574 if (found) {
575 iter->module = iter_mod;
576 break;
577 }
578 }
579 mutex_unlock(&tracepoints_mutex);
539end: 580end:
540 if (!found) 581 if (!found)
541 tracepoint_iter_reset(iter); 582 tracepoint_iter_reset(iter);
542} 583}
584#else /* CONFIG_MODULES */
585static void tracepoint_get_iter(struct tracepoint_iter *iter)
586{
587 int found = 0;
588
589 /* Core kernel tracepoints */
590 found = tracepoint_get_iter_range(&iter->tracepoint,
591 __start___tracepoints_ptrs,
592 __stop___tracepoints_ptrs);
593 if (!found)
594 tracepoint_iter_reset(iter);
595}
596#endif /* CONFIG_MODULES */
543 597
544void tracepoint_iter_start(struct tracepoint_iter *iter) 598void tracepoint_iter_start(struct tracepoint_iter *iter)
545{ 599{
@@ -566,26 +620,98 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
566 620
567void tracepoint_iter_reset(struct tracepoint_iter *iter) 621void tracepoint_iter_reset(struct tracepoint_iter *iter)
568{ 622{
623#ifdef CONFIG_MODULES
569 iter->module = NULL; 624 iter->module = NULL;
625#endif /* CONFIG_MODULES */
570 iter->tracepoint = NULL; 626 iter->tracepoint = NULL;
571} 627}
572EXPORT_SYMBOL_GPL(tracepoint_iter_reset); 628EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
573 629
574#ifdef CONFIG_MODULES 630#ifdef CONFIG_MODULES
631static int tracepoint_module_coming(struct module *mod)
632{
633 struct tp_module *tp_mod, *iter;
634 int ret = 0;
635
636 /*
 637	 * We skip modules that taint the kernel, especially those with a different
638 * module header (for forced load), to make sure we don't cause a crash.
639 */
640 if (mod->taints)
641 return 0;
642 mutex_lock(&tracepoints_mutex);
643 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
644 if (!tp_mod) {
645 ret = -ENOMEM;
646 goto end;
647 }
648 tp_mod->num_tracepoints = mod->num_tracepoints;
649 tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs;
650
651 /*
652 * tracepoint_module_list is kept sorted by struct module pointer
653 * address for iteration on tracepoints from a seq_file that can release
654 * the mutex between calls.
655 */
656 list_for_each_entry_reverse(iter, &tracepoint_module_list, list) {
657 BUG_ON(iter == tp_mod); /* Should never be in the list twice */
658 if (iter < tp_mod) {
659 /* We belong to the location right after iter. */
660 list_add(&tp_mod->list, &iter->list);
661 goto module_added;
662 }
663 }
664 /* We belong to the beginning of the list */
665 list_add(&tp_mod->list, &tracepoint_module_list);
666module_added:
667 tracepoint_update_probe_range(mod->tracepoints_ptrs,
668 mod->tracepoints_ptrs + mod->num_tracepoints);
669end:
670 mutex_unlock(&tracepoints_mutex);
671 return ret;
672}
673
674static int tracepoint_module_going(struct module *mod)
675{
676 struct tp_module *pos;
677
678 mutex_lock(&tracepoints_mutex);
679 tracepoint_update_probe_range(mod->tracepoints_ptrs,
680 mod->tracepoints_ptrs + mod->num_tracepoints);
681 list_for_each_entry(pos, &tracepoint_module_list, list) {
682 if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) {
683 list_del(&pos->list);
684 kfree(pos);
685 break;
686 }
687 }
688 /*
689 * In the case of modules that were tainted at "coming", we'll simply
690 * walk through the list without finding it. We cannot use the "tainted"
691 * flag on "going", in case a module taints the kernel only after being
692 * loaded.
693 */
694 mutex_unlock(&tracepoints_mutex);
695 return 0;
696}
575 697
576int tracepoint_module_notify(struct notifier_block *self, 698int tracepoint_module_notify(struct notifier_block *self,
577 unsigned long val, void *data) 699 unsigned long val, void *data)
578{ 700{
579 struct module *mod = data; 701 struct module *mod = data;
702 int ret = 0;
580 703
581 switch (val) { 704 switch (val) {
582 case MODULE_STATE_COMING: 705 case MODULE_STATE_COMING:
706 ret = tracepoint_module_coming(mod);
707 break;
708 case MODULE_STATE_LIVE:
709 break;
583 case MODULE_STATE_GOING: 710 case MODULE_STATE_GOING:
584 tracepoint_update_probe_range(mod->tracepoints_ptrs, 711 ret = tracepoint_module_going(mod);
585 mod->tracepoints_ptrs + mod->num_tracepoints);
586 break; 712 break;
587 } 713 }
588 return 0; 714 return ret;
589} 715}
590 716
591struct notifier_block tracepoint_module_nb = { 717struct notifier_block tracepoint_module_nb = {
@@ -598,7 +724,6 @@ static int init_tracepoints(void)
598 return register_module_notifier(&tracepoint_module_nb); 724 return register_module_notifier(&tracepoint_module_nb);
599} 725}
600__initcall(init_tracepoints); 726__initcall(init_tracepoints);
601
602#endif /* CONFIG_MODULES */ 727#endif /* CONFIG_MODULES */
603 728
604#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS 729#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
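tracepoint_module_coming() above keeps tracepoint_module_list sorted by struct module pointer address so a seq_file iterator can drop the mutex between calls and still resume deterministically. Here is a small userspace sketch of inserting into a singly linked list kept sorted by node address; it uses a hand-rolled list rather than the kernel's list_head, and every name is illustrative.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct tp_mod {
	struct tp_mod *next;
	const char *name;
};

static struct tp_mod *mod_list;	/* kept sorted by ascending node address */

static void mod_list_insert(struct tp_mod *new)
{
	struct tp_mod **pos = &mod_list;

	/*
	 * Walk until the next node's address is larger than ours.  The
	 * kernel compares the struct module pointers directly; the casts
	 * just keep the comparison well defined in portable C.
	 */
	while (*pos && (uintptr_t)*pos < (uintptr_t)new)
		pos = &(*pos)->next;
	new->next = *pos;
	*pos = new;
}

int main(void)
{
	const char *names[] = { "mod_a", "mod_b", "mod_c" };
	struct tp_mod *m;
	int i;

	for (i = 0; i < 3; i++) {
		m = malloc(sizeof(*m));
		m->name = names[i];
		mod_list_insert(m);
	}

	for (m = mod_list; m; m = m->next)
		printf("%p %s\n", (void *)m, m->name);
	return 0;
}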
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 5bbfac85866e..23b4d784ebdd 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -127,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk)
127 127
128 local_irq_save(flags); 128 local_irq_save(flags);
129 time = tsk->stime + tsk->utime; 129 time = tsk->stime + tsk->utime;
130 dtime = cputime_sub(time, tsk->acct_timexpd); 130 dtime = time - tsk->acct_timexpd;
131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
132 delta = value.tv_sec; 132 delta = value.tv_sec;
133 delta = delta * USEC_PER_SEC + value.tv_usec; 133 delta = delta * USEC_PER_SEC + value.tv_usec;
diff --git a/kernel/up.c b/kernel/up.c
index 1ff27a28bb7d..c54c75e9faf7 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -4,7 +4,7 @@
4 4
5#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h> 6#include <linux/kernel.h>
7#include <linux/module.h> 7#include <linux/export.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9 9
10int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 10int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 92cb706c7fc8..1744bb80f1fb 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -2,7 +2,7 @@
2#include <linux/user-return-notifier.h> 2#include <linux/user-return-notifier.h>
3#include <linux/percpu.h> 3#include <linux/percpu.h>
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/module.h> 5#include <linux/export.h>
6 6
7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); 7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
8 8
diff --git a/kernel/user.c b/kernel/user.c
index 9e03e9c1df8d..71dd2363ab0f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -14,7 +14,7 @@
14#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <linux/key.h> 15#include <linux/key.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19 19
20/* 20/*
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9da289c34f22..3b906e98b1db 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -5,7 +5,7 @@
5 * License. 5 * License.
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/export.h>
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index bff131b9510a..405caf91aad5 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -9,7 +9,7 @@
9 * License. 9 * License.
10 */ 10 */
11 11
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/err.h> 15#include <linux/err.h>
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index a2cd77e70d4d..63da38c2d820 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -9,10 +9,11 @@
9 * License. 9 * License.
10 */ 10 */
11 11
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/wait.h>
16 17
17static void *get_uts(ctl_table *table, int write) 18static void *get_uts(ctl_table *table, int write)
18{ 19{
@@ -51,12 +52,19 @@ static int proc_do_uts_string(ctl_table *table, int write,
51 uts_table.data = get_uts(table, write); 52 uts_table.data = get_uts(table, write);
52 r = proc_dostring(&uts_table,write,buffer,lenp, ppos); 53 r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
53 put_uts(table, write, uts_table.data); 54 put_uts(table, write, uts_table.data);
55
56 if (write)
57 proc_sys_poll_notify(table->poll);
58
54 return r; 59 return r;
55} 60}
56#else 61#else
57#define proc_do_uts_string NULL 62#define proc_do_uts_string NULL
58#endif 63#endif
59 64
65static DEFINE_CTL_TABLE_POLL(hostname_poll);
66static DEFINE_CTL_TABLE_POLL(domainname_poll);
67
60static struct ctl_table uts_kern_table[] = { 68static struct ctl_table uts_kern_table[] = {
61 { 69 {
62 .procname = "ostype", 70 .procname = "ostype",
@@ -85,6 +93,7 @@ static struct ctl_table uts_kern_table[] = {
85 .maxlen = sizeof(init_uts_ns.name.nodename), 93 .maxlen = sizeof(init_uts_ns.name.nodename),
86 .mode = 0644, 94 .mode = 0644,
87 .proc_handler = proc_do_uts_string, 95 .proc_handler = proc_do_uts_string,
96 .poll = &hostname_poll,
88 }, 97 },
89 { 98 {
90 .procname = "domainname", 99 .procname = "domainname",
@@ -92,6 +101,7 @@ static struct ctl_table uts_kern_table[] = {
92 .maxlen = sizeof(init_uts_ns.name.domainname), 101 .maxlen = sizeof(init_uts_ns.name.domainname),
93 .mode = 0644, 102 .mode = 0644,
94 .proc_handler = proc_do_uts_string, 103 .proc_handler = proc_do_uts_string,
104 .poll = &domainname_poll,
95 }, 105 },
96 {} 106 {}
97}; 107};
@@ -105,6 +115,19 @@ static struct ctl_table uts_root_table[] = {
105 {} 115 {}
106}; 116};
107 117
118#ifdef CONFIG_PROC_SYSCTL
119/*
120 * Notify userspace about a change in a certain entry of uts_kern_table,
121 * identified by the parameter proc.
122 */
123void uts_proc_notify(enum uts_proc proc)
124{
125 struct ctl_table *table = &uts_kern_table[proc];
126
127 proc_sys_poll_notify(table->poll);
128}
129#endif
130
108static int __init utsname_sysctl_init(void) 131static int __init utsname_sysctl_init(void)
109{ 132{
110 register_sysctl_table(uts_root_table); 133 register_sysctl_table(uts_root_table);
diff --git a/kernel/wait.c b/kernel/wait.c
index f45ea8d2a1ce..7fdd9eaca2c3 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -4,16 +4,16 @@
4 * (C) 2004 William Irwin, Oracle 4 * (C) 2004 William Irwin, Oracle
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/module.h> 7#include <linux/export.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) 13void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
14{ 14{
15 spin_lock_init(&q->lock); 15 spin_lock_init(&q->lock);
16 lockdep_set_class(&q->lock, key); 16 lockdep_set_class_and_name(&q->lock, key, name);
17 INIT_LIST_HEAD(&q->task_list); 17 INIT_LIST_HEAD(&q->task_list);
18} 18}
19 19
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 36491cd5b7d4..1d7bca7f4f52 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -321,7 +321,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
321 */ 321 */
322static int watchdog(void *unused) 322static int watchdog(void *unused)
323{ 323{
324 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 324 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
326 326
327 sched_setscheduler(current, SCHED_FIFO, &param); 327 sched_setscheduler(current, SCHED_FIFO, &param);
@@ -350,7 +350,8 @@ static int watchdog(void *unused)
350 set_current_state(TASK_INTERRUPTIBLE); 350 set_current_state(TASK_INTERRUPTIBLE);
351 } 351 }
352 __set_current_state(TASK_RUNNING); 352 __set_current_state(TASK_RUNNING);
353 353 param.sched_priority = 0;
354 sched_setscheduler(current, SCHED_NORMAL, &param);
354 return 0; 355 return 0;
355} 356}
356 357
@@ -438,7 +439,7 @@ static int watchdog_enable(int cpu)
438 439
439 /* create the watchdog thread */ 440 /* create the watchdog thread */
440 if (!p) { 441 if (!p) {
441 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 442 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
442 if (IS_ERR(p)) { 443 if (IS_ERR(p)) {
443 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 444 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
444 if (!err) { 445 if (!err) {
@@ -480,6 +481,8 @@ static void watchdog_disable(int cpu)
480 } 481 }
481} 482}
482 483
484/* sysctl functions */
485#ifdef CONFIG_SYSCTL
483static void watchdog_enable_all_cpus(void) 486static void watchdog_enable_all_cpus(void)
484{ 487{
485 int cpu; 488 int cpu;
@@ -509,8 +512,6 @@ static void watchdog_disable_all_cpus(void)
509} 512}
510 513
511 514
512/* sysctl functions */
513#ifdef CONFIG_SYSCTL
514/* 515/*
515 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh 516 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
516 */ 517 */
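The watchdog() change above drops the kthread back to SCHED_NORMAL before it returns, rather than exiting while still at the highest RT priority. A hedged userspace sketch of the same raise-then-restore idea using sched_setscheduler(); getting SCHED_FIFO needs CAP_SYS_NICE, so the calls are checked and the program merely reports failure otherwise. This illustrates the pattern only, not the kernel thread code.

#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

static void do_watchdog_work(void)
{
	/* placeholder for the real periodic work */
}

int main(void)
{
	struct sched_param param = {
		.sched_priority = sched_get_priority_max(SCHED_FIFO)
	};

	/* Raise to a real-time policy for the duration of the work. */
	if (sched_setscheduler(0, SCHED_FIFO, &param))
		fprintf(stderr, "SCHED_FIFO: %s (need CAP_SYS_NICE)\n",
			strerror(errno));

	do_watchdog_work();

	/* Restore the default policy before exiting, as the patch does. */
	param.sched_priority = 0;
	if (sched_setscheduler(0, SCHED_OTHER, &param))
		fprintf(stderr, "SCHED_OTHER: %s\n", strerror(errno));

	return 0;
}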
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1783aabc6128..bec7b5b53e03 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -23,7 +23,7 @@
23 * Please read Documentation/workqueue.txt for details. 23 * Please read Documentation/workqueue.txt for details.
24 */ 24 */
25 25
26#include <linux/module.h> 26#include <linux/export.h>
27#include <linux/kernel.h> 27#include <linux/kernel.h>
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/init.h> 29#include <linux/init.h>
@@ -242,10 +242,10 @@ struct workqueue_struct {
242 242
243 int nr_drainers; /* W: drain in progress */ 243 int nr_drainers; /* W: drain in progress */
244 int saved_max_active; /* W: saved cwq max_active */ 244 int saved_max_active; /* W: saved cwq max_active */
245 const char *name; /* I: workqueue name */
246#ifdef CONFIG_LOCKDEP 245#ifdef CONFIG_LOCKDEP
247 struct lockdep_map lockdep_map; 246 struct lockdep_map lockdep_map;
248#endif 247#endif
248 char name[]; /* I: workqueue name */
249}; 249};
250 250
251struct workqueue_struct *system_wq __read_mostly; 251struct workqueue_struct *system_wq __read_mostly;
@@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
2954 return clamp_val(max_active, 1, lim); 2954 return clamp_val(max_active, 1, lim);
2955} 2955}
2956 2956
2957struct workqueue_struct *__alloc_workqueue_key(const char *name, 2957struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
2958 unsigned int flags, 2958 unsigned int flags,
2959 int max_active, 2959 int max_active,
2960 struct lock_class_key *key, 2960 struct lock_class_key *key,
2961 const char *lock_name) 2961 const char *lock_name, ...)
2962{ 2962{
2963 va_list args, args1;
2963 struct workqueue_struct *wq; 2964 struct workqueue_struct *wq;
2964 unsigned int cpu; 2965 unsigned int cpu;
2966 size_t namelen;
2967
2968 /* determine namelen, allocate wq and format name */
2969 va_start(args, lock_name);
2970 va_copy(args1, args);
2971 namelen = vsnprintf(NULL, 0, fmt, args) + 1;
2972
2973 wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
2974 if (!wq)
2975 goto err;
2976
2977 vsnprintf(wq->name, namelen, fmt, args1);
2978 va_end(args);
2979 va_end(args1);
2965 2980
2966 /* 2981 /*
2967 * Workqueues which may be used during memory reclaim should 2982 * Workqueues which may be used during memory reclaim should
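The __alloc_workqueue_key() rework in the hunk above sizes the allocation by running vsnprintf() once with a NULL buffer, then formats the name directly into a flexible array member at the end of the struct. A compact userspace sketch of that measure-allocate-format pattern follows; the struct and function names are made up for illustration.

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

struct named_obj {
	int flags;
	char name[];			/* flexible array member, like wq->name */
};

static struct named_obj *alloc_named(int flags, const char *fmt, ...)
{
	va_list args, args1;
	struct named_obj *obj;
	size_t namelen;

	va_start(args, fmt);
	va_copy(args1, args);

	/* First pass only measures: vsnprintf(NULL, 0, ...) returns the length. */
	namelen = vsnprintf(NULL, 0, fmt, args) + 1;

	obj = malloc(sizeof(*obj) + namelen);
	if (obj) {
		obj->flags = flags;
		vsnprintf(obj->name, namelen, fmt, args1);	/* second pass formats */
	}

	va_end(args);
	va_end(args1);
	return obj;
}

int main(void)
{
	struct named_obj *obj = alloc_named(0, "events/%d", 3);

	if (obj) {
		printf("%s\n", obj->name);
		free(obj);
	}
	return 0;
}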
@@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2978 flags |= WQ_HIGHPRI; 2993 flags |= WQ_HIGHPRI;
2979 2994
2980 max_active = max_active ?: WQ_DFL_ACTIVE; 2995 max_active = max_active ?: WQ_DFL_ACTIVE;
2981 max_active = wq_clamp_max_active(max_active, flags, name); 2996 max_active = wq_clamp_max_active(max_active, flags, wq->name);
2982
2983 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
2984 if (!wq)
2985 goto err;
2986 2997
2998 /* init wq */
2987 wq->flags = flags; 2999 wq->flags = flags;
2988 wq->saved_max_active = max_active; 3000 wq->saved_max_active = max_active;
2989 mutex_init(&wq->flush_mutex); 3001 mutex_init(&wq->flush_mutex);
@@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2991 INIT_LIST_HEAD(&wq->flusher_queue); 3003 INIT_LIST_HEAD(&wq->flusher_queue);
2992 INIT_LIST_HEAD(&wq->flusher_overflow); 3004 INIT_LIST_HEAD(&wq->flusher_overflow);
2993 3005
2994 wq->name = name;
2995 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 3006 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
2996 INIT_LIST_HEAD(&wq->list); 3007 INIT_LIST_HEAD(&wq->list);
2997 3008
@@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
3020 if (!rescuer) 3031 if (!rescuer)
3021 goto err; 3032 goto err;
3022 3033
3023 rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); 3034 rescuer->task = kthread_create(rescuer_thread, wq, "%s",
3035 wq->name);
3024 if (IS_ERR(rescuer->task)) 3036 if (IS_ERR(rescuer->task))
3025 goto err; 3037 goto err;
3026 3038