author    Frederic Weisbecker <fweisbec@gmail.com>  2010-05-12 17:19:01 -0400
committer Frederic Weisbecker <fweisbec@gmail.com>  2010-05-12 17:20:33 -0400
commit    a9aa1d02de36b450990b0e25a88fc2ff1c3e6b94 (patch)
tree      1f9d19f1642d263e65906a916a48be9339accc73 /kernel
parent    5671a10e2bc7f99d9157c6044faf8be2ef302361 (diff)
parent    b57f95a38233a2e73b679bea4a5453a1cc2a1cc9 (diff)
Merge commit 'v2.6.34-rc7' into perf/nmi
Merge reason: catch up with latest softlockup detector changes.
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 7
-rw-r--r--  kernel/acct.c | 10
-rw-r--r--  kernel/async.c | 1
-rw-r--r--  kernel/audit.c | 3
-rw-r--r--  kernel/audit_tree.c | 101
-rw-r--r--  kernel/audit_watch.c | 1
-rw-r--r--  kernel/auditfilter.c | 1
-rw-r--r--  kernel/auditsc.c | 10
-rw-r--r--  kernel/capability.c | 4
-rw-r--r--  kernel/cgroup.c | 730
-rw-r--r--  kernel/cgroup_freezer.c | 15
-rw-r--r--  kernel/compat.c | 1
-rw-r--r--  kernel/cpu.c | 13
-rw-r--r--  kernel/cpuset.c | 106
-rw-r--r--  kernel/cred.c | 13
-rw-r--r--  kernel/early_res.c | 584
-rw-r--r--  kernel/elfcore.c | 28
-rw-r--r--  kernel/exit.c | 20
-rw-r--r--  kernel/fork.c | 83
-rw-r--r--  kernel/futex.c | 30
-rw-r--r--  kernel/futex_compat.c | 6
-rw-r--r--  kernel/hw_breakpoint.c | 65
-rw-r--r--  kernel/irq/chip.c | 89
-rw-r--r--  kernel/irq/devres.c | 4
-rw-r--r--  kernel/irq/handle.c | 58
-rw-r--r--  kernel/irq/internals.h | 6
-rw-r--r--  kernel/irq/manage.c | 32
-rw-r--r--  kernel/irq/numa_migrate.c | 5
-rw-r--r--  kernel/irq/proc.c | 1
-rw-r--r--  kernel/kallsyms.c | 1
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kfifo.c | 6
-rw-r--r--  kernel/kgdb.c | 206
-rw-r--r--  kernel/kprobes.c | 648
-rw-r--r--  kernel/ksysfs.c | 10
-rw-r--r--  kernel/kthread.c | 4
-rw-r--r--  kernel/latencytop.c | 1
-rw-r--r--  kernel/lockdep.c | 52
-rw-r--r--  kernel/module.c | 161
-rw-r--r--  kernel/notifier.c | 6
-rw-r--r--  kernel/nsproxy.c | 14
-rw-r--r--  kernel/padata.c | 697
-rw-r--r--  kernel/panic.c | 46
-rw-r--r--  kernel/params.c | 12
-rw-r--r--  kernel/perf_event.c | 231
-rw-r--r--  kernel/pid.c | 6
-rw-r--r--  kernel/pid_namespace.c | 8
-rw-r--r--  kernel/posix-cpu-timers.c | 46
-rw-r--r--  kernel/posix-timers.c | 2
-rw-r--r--  kernel/power/Kconfig | 19
-rw-r--r--  kernel/power/hibernate.c | 10
-rw-r--r--  kernel/power/hibernate_nvs.c | 1
-rw-r--r--  kernel/power/main.c | 31
-rw-r--r--  kernel/power/process.c | 5
-rw-r--r--  kernel/power/snapshot.c | 5
-rw-r--r--  kernel/power/suspend.c | 4
-rw-r--r--  kernel/power/swap.c | 5
-rw-r--r--  kernel/power/swsusp.c | 58
-rw-r--r--  kernel/power/user.c | 25
-rw-r--r--  kernel/printk.c | 55
-rw-r--r--  kernel/ptrace.c | 88
-rw-r--r--  kernel/range.c | 163
-rw-r--r--  kernel/rcupdate.c | 70
-rw-r--r--  kernel/rcutorture.c | 102
-rw-r--r--  kernel/rcutree.c | 268
-rw-r--r--  kernel/rcutree.h | 82
-rw-r--r--  kernel/rcutree_plugin.h | 231
-rw-r--r--  kernel/rcutree_trace.c | 14
-rw-r--r--  kernel/relay.c | 5
-rw-r--r--  kernel/res_counter.c | 1
-rw-r--r--  kernel/resource.c | 110
-rw-r--r--  kernel/sched.c | 2268
-rw-r--r--  kernel/sched_cpupri.c | 7
-rw-r--r--  kernel/sched_debug.c | 6
-rw-r--r--  kernel/sched_fair.c | 1699
-rw-r--r--  kernel/sched_idletask.c | 23
-rw-r--r--  kernel/sched_rt.c | 66
-rw-r--r--  kernel/signal.c | 45
-rw-r--r--  kernel/slow-work.c | 2
-rw-r--r--  kernel/slow-work.h | 8
-rw-r--r--  kernel/smp.c | 9
-rw-r--r--  kernel/softirq.c | 15
-rw-r--r--  kernel/softlockup.c | 19
-rw-r--r--  kernel/srcu.c | 53
-rw-r--r--  kernel/stop_machine.c | 2
-rw-r--r--  kernel/sys.c | 78
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 51
-rw-r--r--  kernel/sysctl_binary.c | 8
-rw-r--r--  kernel/taskstats.c | 7
-rw-r--r--  kernel/time.c | 1
-rw-r--r--  kernel/time/clocksource.c | 36
-rw-r--r--  kernel/time/ntp.c | 10
-rw-r--r--  kernel/time/tick-oneshot.c | 52
-rw-r--r--  kernel/time/timecompare.c | 1
-rw-r--r--  kernel/time/timekeeping.c | 6
-rw-r--r--  kernel/time/timer_list.c | 3
-rw-r--r--  kernel/timer.c | 2
-rw-r--r--  kernel/trace/Kconfig | 15
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 6
-rw-r--r--  kernel/trace/ftrace.c | 82
-rw-r--r--  kernel/trace/power-traces.c | 1
-rw-r--r--  kernel/trace/ring_buffer.c | 64
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 1
-rw-r--r--  kernel/trace/trace.c | 206
-rw-r--r--  kernel/trace/trace.h | 11
-rw-r--r--  kernel/trace/trace_branch.c | 19
-rw-r--r--  kernel/trace/trace_clock.c | 5
-rw-r--r--  kernel/trace/trace_event_perf.c (renamed from kernel/trace/trace_event_profile.c) | 63
-rw-r--r--  kernel/trace/trace_events.c | 84
-rw-r--r--  kernel/trace/trace_events_filter.c | 1
-rw-r--r--  kernel/trace/trace_export.c | 87
-rw-r--r--  kernel/trace/trace_functions_graph.c | 108
-rw-r--r--  kernel/trace/trace_kprobe.c | 139
-rw-r--r--  kernel/trace/trace_ksym.c | 1
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 1
-rw-r--r--  kernel/trace/trace_selftest.c | 1
-rw-r--r--  kernel/trace/trace_stack.c | 24
-rw-r--r--  kernel/trace/trace_stat.c | 1
-rw-r--r--  kernel/trace/trace_syscalls.c | 186
-rw-r--r--  kernel/trace/trace_workqueue.c | 1
-rw-r--r--  kernel/tsacct.c | 1
-rw-r--r--  kernel/user.c | 305
-rw-r--r--  kernel/workqueue.c | 2
125 files changed, 7261 insertions, 4172 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 8a5abe53ebad..d5c30060ac14 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o 13 async.o range.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
14obj-y += groups.o 15obj-y += groups.o
15 16
16ifdef CONFIG_FUNCTION_TRACER 17ifdef CONFIG_FUNCTION_TRACER
@@ -91,6 +92,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
91obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 92obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
92obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 93obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
93obj-$(CONFIG_LATENCYTOP) += latencytop.o 94obj-$(CONFIG_LATENCYTOP) += latencytop.o
95obj-$(CONFIG_BINFMT_ELF) += elfcore.o
96obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
97obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
94obj-$(CONFIG_FUNCTION_TRACER) += trace/ 98obj-$(CONFIG_FUNCTION_TRACER) += trace/
95obj-$(CONFIG_TRACING) += trace/ 99obj-$(CONFIG_TRACING) += trace/
96obj-$(CONFIG_X86_DS) += trace/ 100obj-$(CONFIG_X86_DS) += trace/
@@ -101,6 +105,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
101obj-$(CONFIG_PERF_EVENTS) += perf_event.o 105obj-$(CONFIG_PERF_EVENTS) += perf_event.o
102obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 106obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
103obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
108obj-$(CONFIG_PADATA) += padata.o
104 109
105ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 110ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
106# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 111# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index a6605ca921b6..24f8c81fc48d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -588,16 +588,6 @@ out:
588} 588}
589 589
590/** 590/**
591 * acct_init_pacct - initialize a new pacct_struct
592 * @pacct: per-process accounting info struct to initialize
593 */
594void acct_init_pacct(struct pacct_struct *pacct)
595{
596 memset(pacct, 0, sizeof(struct pacct_struct));
597 pacct->ac_utime = pacct->ac_stime = cputime_zero;
598}
599
600/**
601 * acct_collect - collect accounting information into pacct_struct 591 * acct_collect - collect accounting information into pacct_struct
602 * @exitcode: task exit code 592 * @exitcode: task exit code
603 * @group_dead: not 0, if this thread is the last one in the process. 593 * @group_dead: not 0, if this thread is the last one in the process.
diff --git a/kernel/async.c b/kernel/async.c
index 27235f5de198..15319d6c18fe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -56,6 +56,7 @@ asynchronous and synchronous parts of the kernel.
56#include <linux/init.h> 56#include <linux/init.h>
57#include <linux/kthread.h> 57#include <linux/kthread.h>
58#include <linux/delay.h> 58#include <linux/delay.h>
59#include <linux/slab.h>
59#include <asm/atomic.h> 60#include <asm/atomic.h>
60 61
61static async_cookie_t next_cookie = 1; 62static async_cookie_t next_cookie = 1;
diff --git a/kernel/audit.c b/kernel/audit.c
index 5feed232be9d..c71bd26631a2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -46,6 +46,7 @@
46#include <asm/atomic.h> 46#include <asm/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/slab.h>
49#include <linux/err.h> 50#include <linux/err.h>
50#include <linux/kthread.h> 51#include <linux/kthread.h>
51 52
@@ -398,7 +399,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
398 skb_get(skb); 399 skb_get(skb);
399 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); 400 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
400 if (err < 0) { 401 if (err < 0) {
401 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ 402 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 403 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_log_lost("auditd dissapeared\n"); 404 audit_log_lost("auditd dissapeared\n");
404 audit_pid = 0; 405 audit_pid = 0;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 4b05bd9479db..46a57b57a335 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -3,6 +3,7 @@
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h> 5#include <linux/kthread.h>
6#include <linux/slab.h>
6 7
7struct audit_tree; 8struct audit_tree;
8struct audit_chunk; 9struct audit_chunk;
@@ -548,6 +549,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
548 return 0; 549 return 0;
549} 550}
550 551
552static int compare_root(struct vfsmount *mnt, void *arg)
553{
554 return mnt->mnt_root->d_inode == arg;
555}
556
551void audit_trim_trees(void) 557void audit_trim_trees(void)
552{ 558{
553 struct list_head cursor; 559 struct list_head cursor;
@@ -559,7 +565,6 @@ void audit_trim_trees(void)
559 struct path path; 565 struct path path;
560 struct vfsmount *root_mnt; 566 struct vfsmount *root_mnt;
561 struct node *node; 567 struct node *node;
562 struct list_head list;
563 int err; 568 int err;
564 569
565 tree = container_of(cursor.next, struct audit_tree, list); 570 tree = container_of(cursor.next, struct audit_tree, list);
@@ -577,24 +582,16 @@ void audit_trim_trees(void)
577 if (!root_mnt) 582 if (!root_mnt)
578 goto skip_it; 583 goto skip_it;
579 584
580 list_add_tail(&list, &root_mnt->mnt_list);
581 spin_lock(&hash_lock); 585 spin_lock(&hash_lock);
582 list_for_each_entry(node, &tree->chunks, list) { 586 list_for_each_entry(node, &tree->chunks, list) {
583 struct audit_chunk *chunk = find_chunk(node); 587 struct inode *inode = find_chunk(node)->watch.inode;
584 struct inode *inode = chunk->watch.inode;
585 struct vfsmount *mnt;
586 node->index |= 1U<<31; 588 node->index |= 1U<<31;
587 list_for_each_entry(mnt, &list, mnt_list) { 589 if (iterate_mounts(compare_root, inode, root_mnt))
588 if (mnt->mnt_root->d_inode == inode) { 590 node->index &= ~(1U<<31);
589 node->index &= ~(1U<<31);
590 break;
591 }
592 }
593 } 591 }
594 spin_unlock(&hash_lock); 592 spin_unlock(&hash_lock);
595 trim_marked(tree); 593 trim_marked(tree);
596 put_tree(tree); 594 put_tree(tree);
597 list_del_init(&list);
598 drop_collected_mounts(root_mnt); 595 drop_collected_mounts(root_mnt);
599skip_it: 596skip_it:
600 mutex_lock(&audit_filter_mutex); 597 mutex_lock(&audit_filter_mutex);
@@ -603,22 +600,6 @@ skip_it:
603 mutex_unlock(&audit_filter_mutex); 600 mutex_unlock(&audit_filter_mutex);
604} 601}
605 602
606static int is_under(struct vfsmount *mnt, struct dentry *dentry,
607 struct path *path)
608{
609 if (mnt != path->mnt) {
610 for (;;) {
611 if (mnt->mnt_parent == mnt)
612 return 0;
613 if (mnt->mnt_parent == path->mnt)
614 break;
615 mnt = mnt->mnt_parent;
616 }
617 dentry = mnt->mnt_mountpoint;
618 }
619 return is_subdir(dentry, path->dentry);
620}
621
622int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) 603int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
623{ 604{
624 605
@@ -638,13 +619,17 @@ void audit_put_tree(struct audit_tree *tree)
638 put_tree(tree); 619 put_tree(tree);
639} 620}
640 621
622static int tag_mount(struct vfsmount *mnt, void *arg)
623{
624 return tag_chunk(mnt->mnt_root->d_inode, arg);
625}
626
641/* called with audit_filter_mutex */ 627/* called with audit_filter_mutex */
642int audit_add_tree_rule(struct audit_krule *rule) 628int audit_add_tree_rule(struct audit_krule *rule)
643{ 629{
644 struct audit_tree *seed = rule->tree, *tree; 630 struct audit_tree *seed = rule->tree, *tree;
645 struct path path; 631 struct path path;
646 struct vfsmount *mnt, *p; 632 struct vfsmount *mnt;
647 struct list_head list;
648 int err; 633 int err;
649 634
650 list_for_each_entry(tree, &tree_list, list) { 635 list_for_each_entry(tree, &tree_list, list) {
@@ -670,16 +655,9 @@ int audit_add_tree_rule(struct audit_krule *rule)
670 err = -ENOMEM; 655 err = -ENOMEM;
671 goto Err; 656 goto Err;
672 } 657 }
673 list_add_tail(&list, &mnt->mnt_list);
674 658
675 get_tree(tree); 659 get_tree(tree);
676 list_for_each_entry(p, &list, mnt_list) { 660 err = iterate_mounts(tag_mount, tree, mnt);
677 err = tag_chunk(p->mnt_root->d_inode, tree);
678 if (err)
679 break;
680 }
681
682 list_del(&list);
683 drop_collected_mounts(mnt); 661 drop_collected_mounts(mnt);
684 662
685 if (!err) { 663 if (!err) {
@@ -714,31 +692,23 @@ int audit_tag_tree(char *old, char *new)
714{ 692{
715 struct list_head cursor, barrier; 693 struct list_head cursor, barrier;
716 int failed = 0; 694 int failed = 0;
717 struct path path; 695 struct path path1, path2;
718 struct vfsmount *tagged; 696 struct vfsmount *tagged;
719 struct list_head list;
720 struct vfsmount *mnt;
721 struct dentry *dentry;
722 int err; 697 int err;
723 698
724 err = kern_path(new, 0, &path); 699 err = kern_path(new, 0, &path2);
725 if (err) 700 if (err)
726 return err; 701 return err;
727 tagged = collect_mounts(&path); 702 tagged = collect_mounts(&path2);
728 path_put(&path); 703 path_put(&path2);
729 if (!tagged) 704 if (!tagged)
730 return -ENOMEM; 705 return -ENOMEM;
731 706
732 err = kern_path(old, 0, &path); 707 err = kern_path(old, 0, &path1);
733 if (err) { 708 if (err) {
734 drop_collected_mounts(tagged); 709 drop_collected_mounts(tagged);
735 return err; 710 return err;
736 } 711 }
737 mnt = mntget(path.mnt);
738 dentry = dget(path.dentry);
739 path_put(&path);
740
741 list_add_tail(&list, &tagged->mnt_list);
742 712
743 mutex_lock(&audit_filter_mutex); 713 mutex_lock(&audit_filter_mutex);
744 list_add(&barrier, &tree_list); 714 list_add(&barrier, &tree_list);
@@ -746,7 +716,7 @@ int audit_tag_tree(char *old, char *new)
746 716
747 while (cursor.next != &tree_list) { 717 while (cursor.next != &tree_list) {
748 struct audit_tree *tree; 718 struct audit_tree *tree;
749 struct vfsmount *p; 719 int good_one = 0;
750 720
751 tree = container_of(cursor.next, struct audit_tree, list); 721 tree = container_of(cursor.next, struct audit_tree, list);
752 get_tree(tree); 722 get_tree(tree);
@@ -754,30 +724,19 @@ int audit_tag_tree(char *old, char *new)
754 list_add(&cursor, &tree->list); 724 list_add(&cursor, &tree->list);
755 mutex_unlock(&audit_filter_mutex); 725 mutex_unlock(&audit_filter_mutex);
756 726
757 err = kern_path(tree->pathname, 0, &path); 727 err = kern_path(tree->pathname, 0, &path2);
758 if (err) { 728 if (!err) {
759 put_tree(tree); 729 good_one = path_is_under(&path1, &path2);
760 mutex_lock(&audit_filter_mutex); 730 path_put(&path2);
761 continue;
762 } 731 }
763 732
764 spin_lock(&vfsmount_lock); 733 if (!good_one) {
765 if (!is_under(mnt, dentry, &path)) {
766 spin_unlock(&vfsmount_lock);
767 path_put(&path);
768 put_tree(tree); 734 put_tree(tree);
769 mutex_lock(&audit_filter_mutex); 735 mutex_lock(&audit_filter_mutex);
770 continue; 736 continue;
771 } 737 }
772 spin_unlock(&vfsmount_lock);
773 path_put(&path);
774
775 list_for_each_entry(p, &list, mnt_list) {
776 failed = tag_chunk(p->mnt_root->d_inode, tree);
777 if (failed)
778 break;
779 }
780 738
739 failed = iterate_mounts(tag_mount, tree, tagged);
781 if (failed) { 740 if (failed) {
782 put_tree(tree); 741 put_tree(tree);
783 mutex_lock(&audit_filter_mutex); 742 mutex_lock(&audit_filter_mutex);
@@ -818,10 +777,8 @@ int audit_tag_tree(char *old, char *new)
818 } 777 }
819 list_del(&barrier); 778 list_del(&barrier);
820 list_del(&cursor); 779 list_del(&cursor);
821 list_del(&list);
822 mutex_unlock(&audit_filter_mutex); 780 mutex_unlock(&audit_filter_mutex);
823 dput(dentry); 781 path_put(&path1);
824 mntput(mnt);
825 drop_collected_mounts(tagged); 782 drop_collected_mounts(tagged);
826 return failed; 783 return failed;
827} 784}
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index cc7e87936cbc..8df43696f4ba 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -27,6 +27,7 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/slab.h>
30#include <linux/inotify.h> 31#include <linux/inotify.h>
31#include <linux/security.h> 32#include <linux/security.h>
32#include "audit.h" 33#include "audit.h"
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a70604047f3c..ce08041f578d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,6 +27,7 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/slab.h>
30#include <linux/security.h> 31#include <linux/security.h>
31#include "audit.h" 32#include "audit.h"
32 33
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fc0f928167e7..3828ad5fb8f1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -49,6 +49,7 @@
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
51#include <linux/module.h> 51#include <linux/module.h>
52#include <linux/slab.h>
52#include <linux/mount.h> 53#include <linux/mount.h>
53#include <linux/socket.h> 54#include <linux/socket.h>
54#include <linux/mqueue.h> 55#include <linux/mqueue.h>
@@ -1893,7 +1894,7 @@ static int audit_inc_name_count(struct audit_context *context,
1893{ 1894{
1894 if (context->name_count >= AUDIT_NAMES) { 1895 if (context->name_count >= AUDIT_NAMES) {
1895 if (inode) 1896 if (inode)
1896 printk(KERN_DEBUG "name_count maxed, losing inode data: " 1897 printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
1897 "dev=%02x:%02x, inode=%lu\n", 1898 "dev=%02x:%02x, inode=%lu\n",
1898 MAJOR(inode->i_sb->s_dev), 1899 MAJOR(inode->i_sb->s_dev),
1899 MINOR(inode->i_sb->s_dev), 1900 MINOR(inode->i_sb->s_dev),
@@ -1988,7 +1989,6 @@ void __audit_inode(const char *name, const struct dentry *dentry)
1988 1989
1989/** 1990/**
1990 * audit_inode_child - collect inode info for created/removed objects 1991 * audit_inode_child - collect inode info for created/removed objects
1991 * @dname: inode's dentry name
1992 * @dentry: dentry being audited 1992 * @dentry: dentry being audited
1993 * @parent: inode of dentry parent 1993 * @parent: inode of dentry parent
1994 * 1994 *
@@ -2000,13 +2000,14 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2000 * must be hooked prior, in order to capture the target inode during 2000 * must be hooked prior, in order to capture the target inode during
2001 * unsuccessful attempts. 2001 * unsuccessful attempts.
2002 */ 2002 */
2003void __audit_inode_child(const char *dname, const struct dentry *dentry, 2003void __audit_inode_child(const struct dentry *dentry,
2004 const struct inode *parent) 2004 const struct inode *parent)
2005{ 2005{
2006 int idx; 2006 int idx;
2007 struct audit_context *context = current->audit_context; 2007 struct audit_context *context = current->audit_context;
2008 const char *found_parent = NULL, *found_child = NULL; 2008 const char *found_parent = NULL, *found_child = NULL;
2009 const struct inode *inode = dentry->d_inode; 2009 const struct inode *inode = dentry->d_inode;
2010 const char *dname = dentry->d_name.name;
2010 int dirlen = 0; 2011 int dirlen = 0;
2011 2012
2012 if (!context->in_syscall) 2013 if (!context->in_syscall)
@@ -2014,9 +2015,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
2014 2015
2015 if (inode) 2016 if (inode)
2016 handle_one(inode); 2017 handle_one(inode);
2017 /* determine matching parent */
2018 if (!dname)
2019 goto add_names;
2020 2018
2021 /* parent is more likely, look for it first */ 2019 /* parent is more likely, look for it first */
2022 for (idx = 0; idx < context->name_count; idx++) { 2020 for (idx = 0; idx < context->name_count; idx++) {
diff --git a/kernel/capability.c b/kernel/capability.c
index 7f876e60521f..9e4697e9b276 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -135,7 +135,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
135 if (pid && (pid != task_pid_vnr(current))) { 135 if (pid && (pid != task_pid_vnr(current))) {
136 struct task_struct *target; 136 struct task_struct *target;
137 137
138 read_lock(&tasklist_lock); 138 rcu_read_lock();
139 139
140 target = find_task_by_vpid(pid); 140 target = find_task_by_vpid(pid);
141 if (!target) 141 if (!target)
@@ -143,7 +143,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
143 else 143 else
144 ret = security_capget(target, pEp, pIp, pPp); 144 ret = security_capget(target, pEp, pIp, pPp);
145 145
146 read_unlock(&tasklist_lock); 146 rcu_read_unlock();
147 } else 147 } else
148 ret = security_capget(current, pEp, pIp, pPp); 148 ret = security_capget(current, pEp, pIp, pPp);
149 149
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1fbcc748044a..3a53c771e503 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Notifications support
8 * Copyright (C) 2009 Nokia Corporation
9 * Author: Kirill A. Shutemov
10 *
7 * Copyright notices from the original cpuset code: 11 * Copyright notices from the original cpuset code:
8 * -------------------------------------------------- 12 * --------------------------------------------------
9 * Copyright (C) 2003 BULL SA. 13 * Copyright (C) 2003 BULL SA.
@@ -43,6 +47,7 @@
43#include <linux/string.h> 47#include <linux/string.h>
44#include <linux/sort.h> 48#include <linux/sort.h>
45#include <linux/kmod.h> 49#include <linux/kmod.h>
50#include <linux/module.h>
46#include <linux/delayacct.h> 51#include <linux/delayacct.h>
47#include <linux/cgroupstats.h> 52#include <linux/cgroupstats.h>
48#include <linux/hash.h> 53#include <linux/hash.h>
@@ -51,15 +56,21 @@
51#include <linux/pid_namespace.h> 56#include <linux/pid_namespace.h>
52#include <linux/idr.h> 57#include <linux/idr.h>
53#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h>
60#include <linux/poll.h>
54 61
55#include <asm/atomic.h> 62#include <asm/atomic.h>
56 63
57static DEFINE_MUTEX(cgroup_mutex); 64static DEFINE_MUTEX(cgroup_mutex);
58 65
59/* Generate an array of cgroup subsystem pointers */ 66/*
67 * Generate an array of cgroup subsystem pointers. At boot time, this is
68 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
69 * registered after that. The mutable section of this array is protected by
70 * cgroup_mutex.
71 */
60#define SUBSYS(_x) &_x ## _subsys, 72#define SUBSYS(_x) &_x ## _subsys,
61 73static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
62static struct cgroup_subsys *subsys[] = {
63#include <linux/cgroup_subsys.h> 74#include <linux/cgroup_subsys.h>
64}; 75};
65 76
@@ -146,6 +157,35 @@ struct css_id {
146 unsigned short stack[0]; /* Array of Length (depth+1) */ 157 unsigned short stack[0]; /* Array of Length (depth+1) */
147}; 158};
148 159
160/*
161 * cgroup_event represents events which userspace want to recieve.
162 */
163struct cgroup_event {
164 /*
165 * Cgroup which the event belongs to.
166 */
167 struct cgroup *cgrp;
168 /*
169 * Control file which the event associated.
170 */
171 struct cftype *cft;
172 /*
173 * eventfd to signal userspace about the event.
174 */
175 struct eventfd_ctx *eventfd;
176 /*
177 * Each of these stored in a list by the cgroup.
178 */
179 struct list_head list;
180 /*
181 * All fields below needed to unregister event when
182 * userspace closes eventfd.
183 */
184 poll_table pt;
185 wait_queue_head_t *wqh;
186 wait_queue_t wait;
187 struct work_struct remove;
188};
149 189
150/* The list of hierarchy roots */ 190/* The list of hierarchy roots */
151 191
@@ -166,6 +206,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
166 */ 206 */
167static int need_forkexit_callback __read_mostly; 207static int need_forkexit_callback __read_mostly;
168 208
209#ifdef CONFIG_PROVE_LOCKING
210int cgroup_lock_is_held(void)
211{
212 return lockdep_is_held(&cgroup_mutex);
213}
214#else /* #ifdef CONFIG_PROVE_LOCKING */
215int cgroup_lock_is_held(void)
216{
217 return mutex_is_locked(&cgroup_mutex);
218}
219#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
220
221EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
222
169/* convenient tests for these bits */ 223/* convenient tests for these bits */
170inline int cgroup_is_removed(const struct cgroup *cgrp) 224inline int cgroup_is_removed(const struct cgroup *cgrp)
171{ 225{
@@ -235,7 +289,8 @@ struct cg_cgroup_link {
235static struct css_set init_css_set; 289static struct css_set init_css_set;
236static struct cg_cgroup_link init_css_set_link; 290static struct cg_cgroup_link init_css_set_link;
237 291
238static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); 292static int cgroup_init_idr(struct cgroup_subsys *ss,
293 struct cgroup_subsys_state *css);
239 294
240/* css_set_lock protects the list of css_set objects, and the 295/* css_set_lock protects the list of css_set objects, and the
241 * chain of tasks off each css_set. Nests outside task->alloc_lock 296 * chain of tasks off each css_set. Nests outside task->alloc_lock
@@ -433,8 +488,11 @@ static struct css_set *find_existing_css_set(
433 struct hlist_node *node; 488 struct hlist_node *node;
434 struct css_set *cg; 489 struct css_set *cg;
435 490
436 /* Built the set of subsystem state objects that we want to 491 /*
437 * see in the new css_set */ 492 * Build the set of subsystem state objects that we want to see in the
493 * new css_set. while subsystems can change globally, the entries here
494 * won't change, so no need for locking.
495 */
438 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 496 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
439 if (root->subsys_bits & (1UL << i)) { 497 if (root->subsys_bits & (1UL << i)) {
440 /* Subsystem is in this hierarchy. So we want 498 /* Subsystem is in this hierarchy. So we want
@@ -681,6 +739,7 @@ void cgroup_lock(void)
681{ 739{
682 mutex_lock(&cgroup_mutex); 740 mutex_lock(&cgroup_mutex);
683} 741}
742EXPORT_SYMBOL_GPL(cgroup_lock);
684 743
685/** 744/**
686 * cgroup_unlock - release lock on cgroup changes 745 * cgroup_unlock - release lock on cgroup changes
@@ -691,6 +750,7 @@ void cgroup_unlock(void)
691{ 750{
692 mutex_unlock(&cgroup_mutex); 751 mutex_unlock(&cgroup_mutex);
693} 752}
753EXPORT_SYMBOL_GPL(cgroup_unlock);
694 754
695/* 755/*
696 * A couple of forward declarations required, due to cyclic reference loop: 756 * A couple of forward declarations required, due to cyclic reference loop:
@@ -742,6 +802,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
742 if (ret) 802 if (ret)
743 break; 803 break;
744 } 804 }
805
745 return ret; 806 return ret;
746} 807}
747 808
@@ -869,7 +930,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
869 css_put(css); 930 css_put(css);
870} 931}
871 932
872 933/*
934 * Call with cgroup_mutex held. Drops reference counts on modules, including
935 * any duplicate ones that parse_cgroupfs_options took. If this function
936 * returns an error, no reference counts are touched.
937 */
873static int rebind_subsystems(struct cgroupfs_root *root, 938static int rebind_subsystems(struct cgroupfs_root *root,
874 unsigned long final_bits) 939 unsigned long final_bits)
875{ 940{
@@ -877,6 +942,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
877 struct cgroup *cgrp = &root->top_cgroup; 942 struct cgroup *cgrp = &root->top_cgroup;
878 int i; 943 int i;
879 944
945 BUG_ON(!mutex_is_locked(&cgroup_mutex));
946
880 removed_bits = root->actual_subsys_bits & ~final_bits; 947 removed_bits = root->actual_subsys_bits & ~final_bits;
881 added_bits = final_bits & ~root->actual_subsys_bits; 948 added_bits = final_bits & ~root->actual_subsys_bits;
882 /* Check that any added subsystems are currently free */ 949 /* Check that any added subsystems are currently free */
@@ -885,6 +952,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
885 struct cgroup_subsys *ss = subsys[i]; 952 struct cgroup_subsys *ss = subsys[i];
886 if (!(bit & added_bits)) 953 if (!(bit & added_bits))
887 continue; 954 continue;
955 /*
956 * Nobody should tell us to do a subsys that doesn't exist:
957 * parse_cgroupfs_options should catch that case and refcounts
958 * ensure that subsystems won't disappear once selected.
959 */
960 BUG_ON(ss == NULL);
888 if (ss->root != &rootnode) { 961 if (ss->root != &rootnode) {
889 /* Subsystem isn't free */ 962 /* Subsystem isn't free */
890 return -EBUSY; 963 return -EBUSY;
@@ -904,6 +977,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
904 unsigned long bit = 1UL << i; 977 unsigned long bit = 1UL << i;
905 if (bit & added_bits) { 978 if (bit & added_bits) {
906 /* We're binding this subsystem to this hierarchy */ 979 /* We're binding this subsystem to this hierarchy */
980 BUG_ON(ss == NULL);
907 BUG_ON(cgrp->subsys[i]); 981 BUG_ON(cgrp->subsys[i]);
908 BUG_ON(!dummytop->subsys[i]); 982 BUG_ON(!dummytop->subsys[i]);
909 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 983 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -915,8 +989,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
915 if (ss->bind) 989 if (ss->bind)
916 ss->bind(ss, cgrp); 990 ss->bind(ss, cgrp);
917 mutex_unlock(&ss->hierarchy_mutex); 991 mutex_unlock(&ss->hierarchy_mutex);
992 /* refcount was already taken, and we're keeping it */
918 } else if (bit & removed_bits) { 993 } else if (bit & removed_bits) {
919 /* We're removing this subsystem */ 994 /* We're removing this subsystem */
995 BUG_ON(ss == NULL);
920 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 996 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
921 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 997 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
922 mutex_lock(&ss->hierarchy_mutex); 998 mutex_lock(&ss->hierarchy_mutex);
@@ -927,9 +1003,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
927 subsys[i]->root = &rootnode; 1003 subsys[i]->root = &rootnode;
928 list_move(&ss->sibling, &rootnode.subsys_list); 1004 list_move(&ss->sibling, &rootnode.subsys_list);
929 mutex_unlock(&ss->hierarchy_mutex); 1005 mutex_unlock(&ss->hierarchy_mutex);
1006 /* subsystem is now free - drop reference on module */
1007 module_put(ss->module);
930 } else if (bit & final_bits) { 1008 } else if (bit & final_bits) {
931 /* Subsystem state should already exist */ 1009 /* Subsystem state should already exist */
1010 BUG_ON(ss == NULL);
932 BUG_ON(!cgrp->subsys[i]); 1011 BUG_ON(!cgrp->subsys[i]);
1012 /*
1013 * a refcount was taken, but we already had one, so
1014 * drop the extra reference.
1015 */
1016 module_put(ss->module);
1017#ifdef CONFIG_MODULE_UNLOAD
1018 BUG_ON(ss->module && !module_refcount(ss->module));
1019#endif
933 } else { 1020 } else {
934 /* Subsystem state shouldn't exist */ 1021 /* Subsystem state shouldn't exist */
935 BUG_ON(cgrp->subsys[i]); 1022 BUG_ON(cgrp->subsys[i]);
@@ -971,13 +1058,20 @@ struct cgroup_sb_opts {
971 1058
972}; 1059};
973 1060
974/* Convert a hierarchy specifier into a bitmask of subsystems and 1061/*
975 * flags. */ 1062 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
976static int parse_cgroupfs_options(char *data, 1063 * with cgroup_mutex held to protect the subsys[] array. This function takes
977 struct cgroup_sb_opts *opts) 1064 * refcounts on subsystems to be used, unless it returns error, in which case
1065 * no refcounts are taken.
1066 */
1067static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
978{ 1068{
979 char *token, *o = data ?: "all"; 1069 char *token, *o = data ?: "all";
980 unsigned long mask = (unsigned long)-1; 1070 unsigned long mask = (unsigned long)-1;
1071 int i;
1072 bool module_pin_failed = false;
1073
1074 BUG_ON(!mutex_is_locked(&cgroup_mutex));
981 1075
982#ifdef CONFIG_CPUSETS 1076#ifdef CONFIG_CPUSETS
983 mask = ~(1UL << cpuset_subsys_id); 1077 mask = ~(1UL << cpuset_subsys_id);
@@ -990,10 +1084,11 @@ static int parse_cgroupfs_options(char *data,
990 return -EINVAL; 1084 return -EINVAL;
991 if (!strcmp(token, "all")) { 1085 if (!strcmp(token, "all")) {
992 /* Add all non-disabled subsystems */ 1086 /* Add all non-disabled subsystems */
993 int i;
994 opts->subsys_bits = 0; 1087 opts->subsys_bits = 0;
995 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1088 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
996 struct cgroup_subsys *ss = subsys[i]; 1089 struct cgroup_subsys *ss = subsys[i];
1090 if (ss == NULL)
1091 continue;
997 if (!ss->disabled) 1092 if (!ss->disabled)
998 opts->subsys_bits |= 1ul << i; 1093 opts->subsys_bits |= 1ul << i;
999 } 1094 }
@@ -1011,7 +1106,6 @@ static int parse_cgroupfs_options(char *data,
1011 if (!opts->release_agent) 1106 if (!opts->release_agent)
1012 return -ENOMEM; 1107 return -ENOMEM;
1013 } else if (!strncmp(token, "name=", 5)) { 1108 } else if (!strncmp(token, "name=", 5)) {
1014 int i;
1015 const char *name = token + 5; 1109 const char *name = token + 5;
1016 /* Can't specify an empty name */ 1110 /* Can't specify an empty name */
1017 if (!strlen(name)) 1111 if (!strlen(name))
@@ -1035,9 +1129,10 @@ static int parse_cgroupfs_options(char *data,
1035 return -ENOMEM; 1129 return -ENOMEM;
1036 } else { 1130 } else {
1037 struct cgroup_subsys *ss; 1131 struct cgroup_subsys *ss;
1038 int i;
1039 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1132 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1040 ss = subsys[i]; 1133 ss = subsys[i];
1134 if (ss == NULL)
1135 continue;
1041 if (!strcmp(token, ss->name)) { 1136 if (!strcmp(token, ss->name)) {
1042 if (!ss->disabled) 1137 if (!ss->disabled)
1043 set_bit(i, &opts->subsys_bits); 1138 set_bit(i, &opts->subsys_bits);
@@ -1072,9 +1167,54 @@ static int parse_cgroupfs_options(char *data,
1072 if (!opts->subsys_bits && !opts->name) 1167 if (!opts->subsys_bits && !opts->name)
1073 return -EINVAL; 1168 return -EINVAL;
1074 1169
1170 /*
1171 * Grab references on all the modules we'll need, so the subsystems
1172 * don't dance around before rebind_subsystems attaches them. This may
1173 * take duplicate reference counts on a subsystem that's already used,
1174 * but rebind_subsystems handles this case.
1175 */
1176 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1177 unsigned long bit = 1UL << i;
1178
1179 if (!(bit & opts->subsys_bits))
1180 continue;
1181 if (!try_module_get(subsys[i]->module)) {
1182 module_pin_failed = true;
1183 break;
1184 }
1185 }
1186 if (module_pin_failed) {
1187 /*
1188 * oops, one of the modules was going away. this means that we
1189 * raced with a module_delete call, and to the user this is
1190 * essentially a "subsystem doesn't exist" case.
1191 */
1192 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
1193 /* drop refcounts only on the ones we took */
1194 unsigned long bit = 1UL << i;
1195
1196 if (!(bit & opts->subsys_bits))
1197 continue;
1198 module_put(subsys[i]->module);
1199 }
1200 return -ENOENT;
1201 }
1202
1075 return 0; 1203 return 0;
1076} 1204}
1077 1205
1206static void drop_parsed_module_refcounts(unsigned long subsys_bits)
1207{
1208 int i;
1209 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1210 unsigned long bit = 1UL << i;
1211
1212 if (!(bit & subsys_bits))
1213 continue;
1214 module_put(subsys[i]->module);
1215 }
1216}
1217
1078static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1218static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1079{ 1219{
1080 int ret = 0; 1220 int ret = 0;
@@ -1091,21 +1231,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1091 if (ret) 1231 if (ret)
1092 goto out_unlock; 1232 goto out_unlock;
1093 1233
1094 /* Don't allow flags to change at remount */ 1234 /* Don't allow flags or name to change at remount */
1095 if (opts.flags != root->flags) { 1235 if (opts.flags != root->flags ||
1096 ret = -EINVAL; 1236 (opts.name && strcmp(opts.name, root->name))) {
1097 goto out_unlock;
1098 }
1099
1100 /* Don't allow name to change at remount */
1101 if (opts.name && strcmp(opts.name, root->name)) {
1102 ret = -EINVAL; 1237 ret = -EINVAL;
1238 drop_parsed_module_refcounts(opts.subsys_bits);
1103 goto out_unlock; 1239 goto out_unlock;
1104 } 1240 }
1105 1241
1106 ret = rebind_subsystems(root, opts.subsys_bits); 1242 ret = rebind_subsystems(root, opts.subsys_bits);
1107 if (ret) 1243 if (ret) {
1244 drop_parsed_module_refcounts(opts.subsys_bits);
1108 goto out_unlock; 1245 goto out_unlock;
1246 }
1109 1247
1110 /* (re)populate subsystem files */ 1248 /* (re)populate subsystem files */
1111 cgroup_populate_dir(cgrp); 1249 cgroup_populate_dir(cgrp);
@@ -1136,6 +1274,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1136 INIT_LIST_HEAD(&cgrp->release_list); 1274 INIT_LIST_HEAD(&cgrp->release_list);
1137 INIT_LIST_HEAD(&cgrp->pidlists); 1275 INIT_LIST_HEAD(&cgrp->pidlists);
1138 mutex_init(&cgrp->pidlist_mutex); 1276 mutex_init(&cgrp->pidlist_mutex);
1277 INIT_LIST_HEAD(&cgrp->event_list);
1278 spin_lock_init(&cgrp->event_list_lock);
1139} 1279}
1140 1280
1141static void init_cgroup_root(struct cgroupfs_root *root) 1281static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1291,7 +1431,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1291 struct cgroupfs_root *new_root; 1431 struct cgroupfs_root *new_root;
1292 1432
1293 /* First find the desired set of subsystems */ 1433 /* First find the desired set of subsystems */
1434 mutex_lock(&cgroup_mutex);
1294 ret = parse_cgroupfs_options(data, &opts); 1435 ret = parse_cgroupfs_options(data, &opts);
1436 mutex_unlock(&cgroup_mutex);
1295 if (ret) 1437 if (ret)
1296 goto out_err; 1438 goto out_err;
1297 1439
@@ -1302,7 +1444,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1302 new_root = cgroup_root_from_opts(&opts); 1444 new_root = cgroup_root_from_opts(&opts);
1303 if (IS_ERR(new_root)) { 1445 if (IS_ERR(new_root)) {
1304 ret = PTR_ERR(new_root); 1446 ret = PTR_ERR(new_root);
1305 goto out_err; 1447 goto drop_modules;
1306 } 1448 }
1307 opts.new_root = new_root; 1449 opts.new_root = new_root;
1308 1450
@@ -1311,7 +1453,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1311 if (IS_ERR(sb)) { 1453 if (IS_ERR(sb)) {
1312 ret = PTR_ERR(sb); 1454 ret = PTR_ERR(sb);
1313 cgroup_drop_root(opts.new_root); 1455 cgroup_drop_root(opts.new_root);
1314 goto out_err; 1456 goto drop_modules;
1315 } 1457 }
1316 1458
1317 root = sb->s_fs_info; 1459 root = sb->s_fs_info;
@@ -1367,6 +1509,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1367 free_cg_links(&tmp_cg_links); 1509 free_cg_links(&tmp_cg_links);
1368 goto drop_new_super; 1510 goto drop_new_super;
1369 } 1511 }
1512 /*
1513 * There must be no failure case after here, since rebinding
1514 * takes care of subsystems' refcounts, which are explicitly
1515 * dropped in the failure exit path.
1516 */
1370 1517
1371 /* EBUSY should be the only error here */ 1518 /* EBUSY should be the only error here */
1372 BUG_ON(ret); 1519 BUG_ON(ret);
@@ -1405,6 +1552,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1405 * any) is not needed 1552 * any) is not needed
1406 */ 1553 */
1407 cgroup_drop_root(opts.new_root); 1554 cgroup_drop_root(opts.new_root);
1555 /* no subsys rebinding, so refcounts don't change */
1556 drop_parsed_module_refcounts(opts.subsys_bits);
1408 } 1557 }
1409 1558
1410 simple_set_mnt(mnt, sb); 1559 simple_set_mnt(mnt, sb);
@@ -1414,6 +1563,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1414 1563
1415 drop_new_super: 1564 drop_new_super:
1416 deactivate_locked_super(sb); 1565 deactivate_locked_super(sb);
1566 drop_modules:
1567 drop_parsed_module_refcounts(opts.subsys_bits);
1417 out_err: 1568 out_err:
1418 kfree(opts.release_agent); 1569 kfree(opts.release_agent);
1419 kfree(opts.name); 1570 kfree(opts.name);
@@ -1495,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
1495int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1646int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1496{ 1647{
1497 char *start; 1648 char *start;
1498 struct dentry *dentry = rcu_dereference(cgrp->dentry); 1649 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1650 rcu_read_lock_held() ||
1651 cgroup_lock_is_held());
1499 1652
1500 if (!dentry || cgrp == dummytop) { 1653 if (!dentry || cgrp == dummytop) {
1501 /* 1654 /*
@@ -1511,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1511 *--start = '\0'; 1664 *--start = '\0';
1512 for (;;) { 1665 for (;;) {
1513 int len = dentry->d_name.len; 1666 int len = dentry->d_name.len;
1667
1514 if ((start -= len) < buf) 1668 if ((start -= len) < buf)
1515 return -ENAMETOOLONG; 1669 return -ENAMETOOLONG;
1516 memcpy(start, cgrp->dentry->d_name.name, len); 1670 memcpy(start, dentry->d_name.name, len);
1517 cgrp = cgrp->parent; 1671 cgrp = cgrp->parent;
1518 if (!cgrp) 1672 if (!cgrp)
1519 break; 1673 break;
1520 dentry = rcu_dereference(cgrp->dentry); 1674
1675 dentry = rcu_dereference_check(cgrp->dentry,
1676 rcu_read_lock_held() ||
1677 cgroup_lock_is_held());
1521 if (!cgrp->parent) 1678 if (!cgrp->parent)
1522 continue; 1679 continue;
1523 if (--start < buf) 1680 if (--start < buf)
@@ -1527,6 +1684,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1527 memmove(buf, start, buf + buflen - start); 1684 memmove(buf, start, buf + buflen - start);
1528 return 0; 1685 return 0;
1529} 1686}
1687EXPORT_SYMBOL_GPL(cgroup_path);
1530 1688
1531/** 1689/**
1532 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1690 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1539,7 +1697,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1539int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1697int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1540{ 1698{
1541 int retval = 0; 1699 int retval = 0;
1542 struct cgroup_subsys *ss; 1700 struct cgroup_subsys *ss, *failed_ss = NULL;
1543 struct cgroup *oldcgrp; 1701 struct cgroup *oldcgrp;
1544 struct css_set *cg; 1702 struct css_set *cg;
1545 struct css_set *newcg; 1703 struct css_set *newcg;
@@ -1553,8 +1711,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1553 for_each_subsys(root, ss) { 1711 for_each_subsys(root, ss) {
1554 if (ss->can_attach) { 1712 if (ss->can_attach) {
1555 retval = ss->can_attach(ss, cgrp, tsk, false); 1713 retval = ss->can_attach(ss, cgrp, tsk, false);
1556 if (retval) 1714 if (retval) {
1557 return retval; 1715 /*
1716 * Remember on which subsystem the can_attach()
1717 * failed, so that we only call cancel_attach()
1718 * against the subsystems whose can_attach()
1719 * succeeded. (See below)
1720 */
1721 failed_ss = ss;
1722 goto out;
1723 }
1558 } 1724 }
1559 } 1725 }
1560 1726
@@ -1568,14 +1734,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1568 */ 1734 */
1569 newcg = find_css_set(cg, cgrp); 1735 newcg = find_css_set(cg, cgrp);
1570 put_css_set(cg); 1736 put_css_set(cg);
1571 if (!newcg) 1737 if (!newcg) {
1572 return -ENOMEM; 1738 retval = -ENOMEM;
1739 goto out;
1740 }
1573 1741
1574 task_lock(tsk); 1742 task_lock(tsk);
1575 if (tsk->flags & PF_EXITING) { 1743 if (tsk->flags & PF_EXITING) {
1576 task_unlock(tsk); 1744 task_unlock(tsk);
1577 put_css_set(newcg); 1745 put_css_set(newcg);
1578 return -ESRCH; 1746 retval = -ESRCH;
1747 goto out;
1579 } 1748 }
1580 rcu_assign_pointer(tsk->cgroups, newcg); 1749 rcu_assign_pointer(tsk->cgroups, newcg);
1581 task_unlock(tsk); 1750 task_unlock(tsk);
@@ -1601,7 +1770,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1601 * is no longer empty. 1770 * is no longer empty.
1602 */ 1771 */
1603 cgroup_wakeup_rmdir_waiter(cgrp); 1772 cgroup_wakeup_rmdir_waiter(cgrp);
1604 return 0; 1773out:
1774 if (retval) {
1775 for_each_subsys(root, ss) {
1776 if (ss == failed_ss)
1777 /*
1778 * This subsystem was the one that failed the
1779 * can_attach() check earlier, so we don't need
1780 * to call cancel_attach() against it or any
1781 * remaining subsystems.
1782 */
1783 break;
1784 if (ss->cancel_attach)
1785 ss->cancel_attach(ss, cgrp, tsk, false);
1786 }
1787 }
1788 return retval;
1605} 1789}
1606 1790
1607/* 1791/*
@@ -1667,6 +1851,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
1667 } 1851 }
1668 return true; 1852 return true;
1669} 1853}
1854EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
1670 1855
1671static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 1856static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1672 const char *buffer) 1857 const char *buffer)
@@ -1935,6 +2120,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
1935 .rename = cgroup_rename, 2120 .rename = cgroup_rename,
1936}; 2121};
1937 2122
2123/*
2124 * Check if a file is a control file
2125 */
2126static inline struct cftype *__file_cft(struct file *file)
2127{
2128 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2129 return ERR_PTR(-EINVAL);
2130 return __d_cft(file->f_dentry);
2131}
2132
1938static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2133static int cgroup_create_file(struct dentry *dentry, mode_t mode,
1939 struct super_block *sb) 2134 struct super_block *sb)
1940{ 2135{
@@ -2054,6 +2249,7 @@ int cgroup_add_file(struct cgroup *cgrp,
2054 error = PTR_ERR(dentry); 2249 error = PTR_ERR(dentry);
2055 return error; 2250 return error;
2056} 2251}
2252EXPORT_SYMBOL_GPL(cgroup_add_file);
2057 2253
2058int cgroup_add_files(struct cgroup *cgrp, 2254int cgroup_add_files(struct cgroup *cgrp,
2059 struct cgroup_subsys *subsys, 2255 struct cgroup_subsys *subsys,
@@ -2068,6 +2264,7 @@ int cgroup_add_files(struct cgroup *cgrp,
2068 } 2264 }
2069 return 0; 2265 return 0;
2070} 2266}
2267EXPORT_SYMBOL_GPL(cgroup_add_files);
2071 2268
2072/** 2269/**
2073 * cgroup_task_count - count the number of tasks in a cgroup. 2270 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2453,7 +2650,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2453{ 2650{
2454 struct cgroup_pidlist *l; 2651 struct cgroup_pidlist *l;
2455 /* don't need task_nsproxy() if we're looking at ourself */ 2652 /* don't need task_nsproxy() if we're looking at ourself */
2456 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); 2653 struct pid_namespace *ns = current->nsproxy->pid_ns;
2654
2457 /* 2655 /*
2458 * We can't drop the pidlist_mutex before taking the l->mutex in case 2656 * We can't drop the pidlist_mutex before taking the l->mutex in case
2459 * the last ref-holder is trying to remove l from the list at the same 2657 * the last ref-holder is trying to remove l from the list at the same
@@ -2463,8 +2661,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2463 mutex_lock(&cgrp->pidlist_mutex); 2661 mutex_lock(&cgrp->pidlist_mutex);
2464 list_for_each_entry(l, &cgrp->pidlists, links) { 2662 list_for_each_entry(l, &cgrp->pidlists, links) {
2465 if (l->key.type == type && l->key.ns == ns) { 2663 if (l->key.type == type && l->key.ns == ns) {
2466 /* found a matching list - drop the extra refcount */
2467 put_pid_ns(ns);
2468 /* make sure l doesn't vanish out from under us */ 2664 /* make sure l doesn't vanish out from under us */
2469 down_write(&l->mutex); 2665 down_write(&l->mutex);
2470 mutex_unlock(&cgrp->pidlist_mutex); 2666 mutex_unlock(&cgrp->pidlist_mutex);
@@ -2475,13 +2671,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2475 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 2671 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2476 if (!l) { 2672 if (!l) {
2477 mutex_unlock(&cgrp->pidlist_mutex); 2673 mutex_unlock(&cgrp->pidlist_mutex);
2478 put_pid_ns(ns);
2479 return l; 2674 return l;
2480 } 2675 }
2481 init_rwsem(&l->mutex); 2676 init_rwsem(&l->mutex);
2482 down_write(&l->mutex); 2677 down_write(&l->mutex);
2483 l->key.type = type; 2678 l->key.type = type;
2484 l->key.ns = ns; 2679 l->key.ns = get_pid_ns(ns);
2485 l->use_count = 0; /* don't increment here */ 2680 l->use_count = 0; /* don't increment here */
2486 l->list = NULL; 2681 l->list = NULL;
2487 l->owner = cgrp; 2682 l->owner = cgrp;
@@ -2789,6 +2984,174 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2789} 2984}
2790 2985
2791/* 2986/*
2987 * Unregister event and free resources.
2988 *
2989 * Gets called from workqueue.
2990 */
2991static void cgroup_event_remove(struct work_struct *work)
2992{
2993 struct cgroup_event *event = container_of(work, struct cgroup_event,
2994 remove);
2995 struct cgroup *cgrp = event->cgrp;
2996
2997 /* TODO: check return code */
2998 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
2999
3000 eventfd_ctx_put(event->eventfd);
3001 kfree(event);
3002 dput(cgrp->dentry);
3003}
3004
3005/*
3006 * Gets called on POLLHUP on eventfd when user closes it.
3007 *
3008 * Called with wqh->lock held and interrupts disabled.
3009 */
3010static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3011 int sync, void *key)
3012{
3013 struct cgroup_event *event = container_of(wait,
3014 struct cgroup_event, wait);
3015 struct cgroup *cgrp = event->cgrp;
3016 unsigned long flags = (unsigned long)key;
3017
3018 if (flags & POLLHUP) {
3019 remove_wait_queue_locked(event->wqh, &event->wait);
3020 spin_lock(&cgrp->event_list_lock);
3021 list_del(&event->list);
3022 spin_unlock(&cgrp->event_list_lock);
3023 /*
3024 * We are in atomic context, but cgroup_event_remove() may
3025 * sleep, so we have to call it in workqueue.
3026 */
3027 schedule_work(&event->remove);
3028 }
3029
3030 return 0;
3031}
3032
3033static void cgroup_event_ptable_queue_proc(struct file *file,
3034 wait_queue_head_t *wqh, poll_table *pt)
3035{
3036 struct cgroup_event *event = container_of(pt,
3037 struct cgroup_event, pt);
3038
3039 event->wqh = wqh;
3040 add_wait_queue(wqh, &event->wait);
3041}
3042
3043/*
3044 * Parse input and register new cgroup event handler.
3045 *
3046 * Input must be in format '<event_fd> <control_fd> <args>'.
3047 * Interpretation of args is defined by control file implementation.
3048 */
3049static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3050 const char *buffer)
3051{
3052 struct cgroup_event *event = NULL;
3053 unsigned int efd, cfd;
3054 struct file *efile = NULL;
3055 struct file *cfile = NULL;
3056 char *endp;
3057 int ret;
3058
3059 efd = simple_strtoul(buffer, &endp, 10);
3060 if (*endp != ' ')
3061 return -EINVAL;
3062 buffer = endp + 1;
3063
3064 cfd = simple_strtoul(buffer, &endp, 10);
3065 if ((*endp != ' ') && (*endp != '\0'))
3066 return -EINVAL;
3067 buffer = endp + 1;
3068
3069 event = kzalloc(sizeof(*event), GFP_KERNEL);
3070 if (!event)
3071 return -ENOMEM;
3072 event->cgrp = cgrp;
3073 INIT_LIST_HEAD(&event->list);
3074 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3075 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3076 INIT_WORK(&event->remove, cgroup_event_remove);
3077
3078 efile = eventfd_fget(efd);
3079 if (IS_ERR(efile)) {
3080 ret = PTR_ERR(efile);
3081 goto fail;
3082 }
3083
3084 event->eventfd = eventfd_ctx_fileget(efile);
3085 if (IS_ERR(event->eventfd)) {
3086 ret = PTR_ERR(event->eventfd);
3087 goto fail;
3088 }
3089
3090 cfile = fget(cfd);
3091 if (!cfile) {
3092 ret = -EBADF;
3093 goto fail;
3094 }
3095
3096 /* the process need read permission on control file */
3097 ret = file_permission(cfile, MAY_READ);
3098 if (ret < 0)
3099 goto fail;
3100
3101 event->cft = __file_cft(cfile);
3102 if (IS_ERR(event->cft)) {
3103 ret = PTR_ERR(event->cft);
3104 goto fail;
3105 }
3106
3107 if (!event->cft->register_event || !event->cft->unregister_event) {
3108 ret = -EINVAL;
3109 goto fail;
3110 }
3111
3112 ret = event->cft->register_event(cgrp, event->cft,
3113 event->eventfd, buffer);
3114 if (ret)
3115 goto fail;
3116
3117 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3118 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3119 ret = 0;
3120 goto fail;
3121 }
3122
3123 /*
3124 * Events should be removed after rmdir of cgroup directory, but before
3125 * destroying subsystem state objects. Let's take reference to cgroup
3126 * directory dentry to do that.
3127 */
3128 dget(cgrp->dentry);
3129
3130 spin_lock(&cgrp->event_list_lock);
3131 list_add(&event->list, &cgrp->event_list);
3132 spin_unlock(&cgrp->event_list_lock);
3133
3134 fput(cfile);
3135 fput(efile);
3136
3137 return 0;
3138
3139fail:
3140 if (cfile)
3141 fput(cfile);
3142
3143 if (event && event->eventfd && !IS_ERR(event->eventfd))
3144 eventfd_ctx_put(event->eventfd);
3145
3146 if (!IS_ERR_OR_NULL(efile))
3147 fput(efile);
3148
3149 kfree(event);
3150
3151 return ret;
3152}
3153
3154/*
2792 * for the common functions, 'private' gives the type of file 3155 * for the common functions, 'private' gives the type of file
2793 */ 3156 */
2794/* for hysterical raisins, we can't put this on the older files */ 3157/* for hysterical raisins, we can't put this on the older files */
@@ -2813,6 +3176,11 @@ static struct cftype files[] = {
2813 .read_u64 = cgroup_read_notify_on_release, 3176 .read_u64 = cgroup_read_notify_on_release,
2814 .write_u64 = cgroup_write_notify_on_release, 3177 .write_u64 = cgroup_write_notify_on_release,
2815 }, 3178 },
3179 {
3180 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3181 .write_string = cgroup_write_event_control,
3182 .mode = S_IWUGO,
3183 },
2816}; 3184};
2817 3185
2818static struct cftype cft_release_agent = { 3186static struct cftype cft_release_agent = {
@@ -2877,8 +3245,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2877 /* We need to take each hierarchy_mutex in a consistent order */ 3245 /* We need to take each hierarchy_mutex in a consistent order */
2878 int i; 3246 int i;
2879 3247
3248 /*
3249 * No worry about a race with rebind_subsystems that might mess up the
3250 * locking order, since both parties are under cgroup_mutex.
3251 */
2880 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3252 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2881 struct cgroup_subsys *ss = subsys[i]; 3253 struct cgroup_subsys *ss = subsys[i];
3254 if (ss == NULL)
3255 continue;
2882 if (ss->root == root) 3256 if (ss->root == root)
2883 mutex_lock(&ss->hierarchy_mutex); 3257 mutex_lock(&ss->hierarchy_mutex);
2884 } 3258 }
@@ -2890,6 +3264,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2890 3264
2891 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3265 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2892 struct cgroup_subsys *ss = subsys[i]; 3266 struct cgroup_subsys *ss = subsys[i];
3267 if (ss == NULL)
3268 continue;
2893 if (ss->root == root) 3269 if (ss->root == root)
2894 mutex_unlock(&ss->hierarchy_mutex); 3270 mutex_unlock(&ss->hierarchy_mutex);
2895 } 3271 }
@@ -2936,14 +3312,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2936 3312
2937 for_each_subsys(root, ss) { 3313 for_each_subsys(root, ss) {
2938 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3314 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3315
2939 if (IS_ERR(css)) { 3316 if (IS_ERR(css)) {
2940 err = PTR_ERR(css); 3317 err = PTR_ERR(css);
2941 goto err_destroy; 3318 goto err_destroy;
2942 } 3319 }
2943 init_cgroup_css(css, ss, cgrp); 3320 init_cgroup_css(css, ss, cgrp);
2944 if (ss->use_id) 3321 if (ss->use_id) {
2945 if (alloc_css_id(ss, parent, cgrp)) 3322 err = alloc_css_id(ss, parent, cgrp);
3323 if (err)
2946 goto err_destroy; 3324 goto err_destroy;
3325 }
2947 /* At error, ->destroy() callback has to free assigned ID. */ 3326 /* At error, ->destroy() callback has to free assigned ID. */
2948 } 3327 }
2949 3328
@@ -3010,11 +3389,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3010 * synchronization other than RCU, and the subsystem linked 3389 * synchronization other than RCU, and the subsystem linked
3011 * list isn't RCU-safe */ 3390 * list isn't RCU-safe */
3012 int i; 3391 int i;
3392 /*
3393 * We won't need to lock the subsys array, because the subsystems
3394 * we're concerned about aren't going anywhere since our cgroup root
3395 * has a reference on them.
3396 */
3013 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3397 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3014 struct cgroup_subsys *ss = subsys[i]; 3398 struct cgroup_subsys *ss = subsys[i];
3015 struct cgroup_subsys_state *css; 3399 struct cgroup_subsys_state *css;
3016 /* Skip subsystems not in this hierarchy */ 3400 /* Skip subsystems not present or not in this hierarchy */
3017 if (ss->root != cgrp->root) 3401 if (ss == NULL || ss->root != cgrp->root)
3018 continue; 3402 continue;
3019 css = cgrp->subsys[ss->subsys_id]; 3403 css = cgrp->subsys[ss->subsys_id];
3020 /* When called from check_for_release() it's possible 3404 /* When called from check_for_release() it's possible
@@ -3088,6 +3472,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3088 struct dentry *d; 3472 struct dentry *d;
3089 struct cgroup *parent; 3473 struct cgroup *parent;
3090 DEFINE_WAIT(wait); 3474 DEFINE_WAIT(wait);
3475 struct cgroup_event *event, *tmp;
3091 int ret; 3476 int ret;
3092 3477
3093 /* the vfs holds both inode->i_mutex already */ 3478 /* the vfs holds both inode->i_mutex already */
@@ -3171,6 +3556,20 @@ again:
3171 set_bit(CGRP_RELEASABLE, &parent->flags); 3556 set_bit(CGRP_RELEASABLE, &parent->flags);
3172 check_for_release(parent); 3557 check_for_release(parent);
3173 3558
3559 /*
3560 * Unregister events and notify userspace.
3561	 * Notify userspace about cgroup removal only after rmdir of the cgroup
3562	 * directory, to avoid a race between userspace and kernelspace
3563 */
3564 spin_lock(&cgrp->event_list_lock);
3565 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
3566 list_del(&event->list);
3567 remove_wait_queue(event->wqh, &event->wait);
3568 eventfd_signal(event->eventfd, 1);
3569 schedule_work(&event->remove);
3570 }
3571 spin_unlock(&cgrp->event_list_lock);
3572
3174 mutex_unlock(&cgroup_mutex); 3573 mutex_unlock(&cgroup_mutex);
3175 return 0; 3574 return 0;
3176} 3575}
@@ -3205,9 +3604,198 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
3205 mutex_init(&ss->hierarchy_mutex); 3604 mutex_init(&ss->hierarchy_mutex);
3206 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); 3605 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3207 ss->active = 1; 3606 ss->active = 1;
3607
3608 /* this function shouldn't be used with modular subsystems, since they
3609 * need to register a subsys_id, among other things */
3610 BUG_ON(ss->module);
3208} 3611}
3209 3612
3210/** 3613/**
3614 * cgroup_load_subsys: load and register a modular subsystem at runtime
3615 * @ss: the subsystem to load
3616 *
3617 * This function should be called in a modular subsystem's initcall. If the
3618 * subsystem is built as a module, it will be assigned a new subsys_id and set
3619 * up for use. If the subsystem is built-in anyway, work is delegated to the
3620 * simpler cgroup_init_subsys.
3621 */
3622int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
3623{
3624 int i;
3625 struct cgroup_subsys_state *css;
3626
3627 /* check name and function validity */
3628 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
3629 ss->create == NULL || ss->destroy == NULL)
3630 return -EINVAL;
3631
3632 /*
3633 * we don't support callbacks in modular subsystems. this check is
3634 * before the ss->module check for consistency; a subsystem that could
3635 * be a module should still have no callbacks even if the user isn't
3636 * compiling it as one.
3637 */
3638 if (ss->fork || ss->exit)
3639 return -EINVAL;
3640
3641 /*
3642 * an optionally modular subsystem is built-in: we want to do nothing,
3643 * since cgroup_init_subsys will have already taken care of it.
3644 */
3645 if (ss->module == NULL) {
3646 /* a few sanity checks */
3647 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
3648 BUG_ON(subsys[ss->subsys_id] != ss);
3649 return 0;
3650 }
3651
3652 /*
3653 * need to register a subsys id before anything else - for example,
3654 * init_cgroup_css needs it.
3655 */
3656 mutex_lock(&cgroup_mutex);
3657 /* find the first empty slot in the array */
3658 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
3659 if (subsys[i] == NULL)
3660 break;
3661 }
3662 if (i == CGROUP_SUBSYS_COUNT) {
3663 /* maximum number of subsystems already registered! */
3664 mutex_unlock(&cgroup_mutex);
3665 return -EBUSY;
3666 }
3667 /* assign ourselves the subsys_id */
3668 ss->subsys_id = i;
3669 subsys[i] = ss;
3670
3671 /*
3672 * no ss->create seems to need anything important in the ss struct, so
3673 * this can happen first (i.e. before the rootnode attachment).
3674 */
3675 css = ss->create(ss, dummytop);
3676 if (IS_ERR(css)) {
3677 /* failure case - need to deassign the subsys[] slot. */
3678 subsys[i] = NULL;
3679 mutex_unlock(&cgroup_mutex);
3680 return PTR_ERR(css);
3681 }
3682
3683 list_add(&ss->sibling, &rootnode.subsys_list);
3684 ss->root = &rootnode;
3685
3686 /* our new subsystem will be attached to the dummy hierarchy. */
3687 init_cgroup_css(css, ss, dummytop);
3688 /* init_idr must be after init_cgroup_css because it sets css->id. */
3689 if (ss->use_id) {
3690 int ret = cgroup_init_idr(ss, css);
3691 if (ret) {
3692 dummytop->subsys[ss->subsys_id] = NULL;
3693 ss->destroy(ss, dummytop);
3694 subsys[i] = NULL;
3695 mutex_unlock(&cgroup_mutex);
3696 return ret;
3697 }
3698 }
3699
3700 /*
3701 * Now we need to entangle the css into the existing css_sets. unlike
3702 * in cgroup_init_subsys, there are now multiple css_sets, so each one
3703 * will need a new pointer to it; done by iterating the css_set_table.
3704 * furthermore, modifying the existing css_sets will corrupt the hash
3705 * table state, so each changed css_set will need its hash recomputed.
3706 * this is all done under the css_set_lock.
3707 */
3708 write_lock(&css_set_lock);
3709 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
3710 struct css_set *cg;
3711 struct hlist_node *node, *tmp;
3712 struct hlist_head *bucket = &css_set_table[i], *new_bucket;
3713
3714 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
3715 /* skip entries that we already rehashed */
3716 if (cg->subsys[ss->subsys_id])
3717 continue;
3718 /* remove existing entry */
3719 hlist_del(&cg->hlist);
3720 /* set new value */
3721 cg->subsys[ss->subsys_id] = css;
3722 /* recompute hash and restore entry */
3723 new_bucket = css_set_hash(cg->subsys);
3724 hlist_add_head(&cg->hlist, new_bucket);
3725 }
3726 }
3727 write_unlock(&css_set_lock);
3728
3729 mutex_init(&ss->hierarchy_mutex);
3730 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3731 ss->active = 1;
3732
3733 /* success! */
3734 mutex_unlock(&cgroup_mutex);
3735 return 0;
3736}
3737EXPORT_SYMBOL_GPL(cgroup_load_subsys);
3738
3739/**
3740 * cgroup_unload_subsys: unload a modular subsystem
3741 * @ss: the subsystem to unload
3742 *
3743 * This function should be called in a modular subsystem's exitcall. When this
3744 * function is invoked, the refcount on the subsystem's module will be 0, so
3745 * the subsystem will not be attached to any hierarchy.
3746 */
3747void cgroup_unload_subsys(struct cgroup_subsys *ss)
3748{
3749 struct cg_cgroup_link *link;
3750 struct hlist_head *hhead;
3751
3752 BUG_ON(ss->module == NULL);
3753
3754 /*
3755 * we shouldn't be called if the subsystem is in use, and the use of
3756 * try_module_get in parse_cgroupfs_options should ensure that it
3757 * doesn't start being used while we're killing it off.
3758 */
3759 BUG_ON(ss->root != &rootnode);
3760
3761 mutex_lock(&cgroup_mutex);
3762 /* deassign the subsys_id */
3763 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
3764 subsys[ss->subsys_id] = NULL;
3765
3766 /* remove subsystem from rootnode's list of subsystems */
3767 list_del(&ss->sibling);
3768
3769 /*
3770 * disentangle the css from all css_sets attached to the dummytop. as
3771 * in loading, we need to pay our respects to the hashtable gods.
3772 */
3773 write_lock(&css_set_lock);
3774 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
3775 struct css_set *cg = link->cg;
3776
3777 hlist_del(&cg->hlist);
3778 BUG_ON(!cg->subsys[ss->subsys_id]);
3779 cg->subsys[ss->subsys_id] = NULL;
3780 hhead = css_set_hash(cg->subsys);
3781 hlist_add_head(&cg->hlist, hhead);
3782 }
3783 write_unlock(&css_set_lock);
3784
3785 /*
3786 * remove subsystem's css from the dummytop and free it - need to free
3787 * before marking as null because ss->destroy needs the cgrp->subsys
3788 * pointer to find their state. note that this also takes care of
3789 * freeing the css_id.
3790 */
3791 ss->destroy(ss, dummytop);
3792 dummytop->subsys[ss->subsys_id] = NULL;
3793
3794 mutex_unlock(&cgroup_mutex);
3795}
3796EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
3797
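A controller built as a module would use this pair from its init/exit hooks. The sketch below is illustrative only: example_subsys, example_css and their minimal create/destroy pair are hypothetical, but their shapes follow the ss->create()/ss->destroy() calls and the ss->module checks visible above.

/* Sketch of a modular cgroup subsystem (hypothetical names, 2.6.34-era API). */
#include <linux/module.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/err.h>

struct example_css {
	struct cgroup_subsys_state css;
};

static struct cgroup_subsys_state *
example_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	struct example_css *ex = kzalloc(sizeof(*ex), GFP_KERNEL);

	return ex ? &ex->css : ERR_PTR(-ENOMEM);
}

static void example_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	/* ss->subsys_id was assigned by cgroup_load_subsys() */
	kfree(container_of(cgrp->subsys[ss->subsys_id],
			   struct example_css, css));
}

struct cgroup_subsys example_subsys = {
	.name		= "example",
	.create		= example_create,
	.destroy	= example_destroy,
	.module		= THIS_MODULE,	/* non-NULL, so a dynamic slot is used */
};

static int __init example_init(void)
{
	return cgroup_load_subsys(&example_subsys);
}
module_init(example_init);

static void __exit example_exit(void)
{
	cgroup_unload_subsys(&example_subsys);
}
module_exit(example_exit);

MODULE_LICENSE("GPL");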
3798/**
3211 * cgroup_init_early - cgroup initialization at system boot 3799 * cgroup_init_early - cgroup initialization at system boot
3212 * 3800 *
3213 * Initialize cgroups at system boot, and initialize any 3801 * Initialize cgroups at system boot, and initialize any
@@ -3235,7 +3823,8 @@ int __init cgroup_init_early(void)
3235 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 3823 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
3236 INIT_HLIST_HEAD(&css_set_table[i]); 3824 INIT_HLIST_HEAD(&css_set_table[i]);
3237 3825
3238 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3826 /* at bootup time, we don't worry about modular subsystems */
3827 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3239 struct cgroup_subsys *ss = subsys[i]; 3828 struct cgroup_subsys *ss = subsys[i];
3240 3829
3241 BUG_ON(!ss->name); 3830 BUG_ON(!ss->name);
@@ -3270,12 +3859,13 @@ int __init cgroup_init(void)
3270 if (err) 3859 if (err)
3271 return err; 3860 return err;
3272 3861
3273 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3862 /* at bootup time, we don't worry about modular subsystems */
3863 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3274 struct cgroup_subsys *ss = subsys[i]; 3864 struct cgroup_subsys *ss = subsys[i];
3275 if (!ss->early_init) 3865 if (!ss->early_init)
3276 cgroup_init_subsys(ss); 3866 cgroup_init_subsys(ss);
3277 if (ss->use_id) 3867 if (ss->use_id)
3278 cgroup_subsys_init_idr(ss); 3868 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
3279 } 3869 }
3280 3870
3281 /* Add init_css_set to the hash table */ 3871 /* Add init_css_set to the hash table */
@@ -3379,9 +3969,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3379 int i; 3969 int i;
3380 3970
3381 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 3971 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
3972 /*
3973 * ideally we don't want subsystems moving around while we do this.
3974 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
3975 * subsys/hierarchy state.
3976 */
3382 mutex_lock(&cgroup_mutex); 3977 mutex_lock(&cgroup_mutex);
3383 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3978 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3384 struct cgroup_subsys *ss = subsys[i]; 3979 struct cgroup_subsys *ss = subsys[i];
3980 if (ss == NULL)
3981 continue;
3385 seq_printf(m, "%s\t%d\t%d\t%d\n", 3982 seq_printf(m, "%s\t%d\t%d\t%d\n",
3386 ss->name, ss->root->hierarchy_id, 3983 ss->name, ss->root->hierarchy_id,
3387 ss->root->number_of_cgroups, !ss->disabled); 3984 ss->root->number_of_cgroups, !ss->disabled);
@@ -3439,7 +4036,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
3439{ 4036{
3440 if (need_forkexit_callback) { 4037 if (need_forkexit_callback) {
3441 int i; 4038 int i;
3442 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4039 /*
4040 * forkexit callbacks are only supported for builtin
4041 * subsystems, and the builtin section of the subsys array is
4042 * immutable, so we don't need to lock the subsys array here.
4043 */
4044 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3443 struct cgroup_subsys *ss = subsys[i]; 4045 struct cgroup_subsys *ss = subsys[i];
3444 if (ss->fork) 4046 if (ss->fork)
3445 ss->fork(ss, child); 4047 ss->fork(ss, child);
@@ -3508,7 +4110,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
3508 struct css_set *cg; 4110 struct css_set *cg;
3509 4111
3510 if (run_callbacks && need_forkexit_callback) { 4112 if (run_callbacks && need_forkexit_callback) {
3511 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4113 /*
4114 * modular subsystems can't use callbacks, so no need to lock
4115 * the subsys array
4116 */
4117 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3512 struct cgroup_subsys *ss = subsys[i]; 4118 struct cgroup_subsys *ss = subsys[i];
3513 if (ss->exit) 4119 if (ss->exit)
3514 ss->exit(ss, tsk); 4120 ss->exit(ss, tsk);
@@ -3702,12 +4308,13 @@ static void check_for_release(struct cgroup *cgrp)
3702 } 4308 }
3703} 4309}
3704 4310
3705void __css_put(struct cgroup_subsys_state *css) 4311/* Caller must verify that the css is not for root cgroup */
4312void __css_put(struct cgroup_subsys_state *css, int count)
3706{ 4313{
3707 struct cgroup *cgrp = css->cgroup; 4314 struct cgroup *cgrp = css->cgroup;
3708 int val; 4315 int val;
3709 rcu_read_lock(); 4316 rcu_read_lock();
3710 val = atomic_dec_return(&css->refcnt); 4317 val = atomic_sub_return(count, &css->refcnt);
3711 if (val == 1) { 4318 if (val == 1) {
3712 if (notify_on_release(cgrp)) { 4319 if (notify_on_release(cgrp)) {
3713 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4320 set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3718,6 +4325,7 @@ void __css_put(struct cgroup_subsys_state *css)
3718 rcu_read_unlock(); 4325 rcu_read_unlock();
3719 WARN_ON_ONCE(val < 1); 4326 WARN_ON_ONCE(val < 1);
3720} 4327}
4328EXPORT_SYMBOL_GPL(__css_put);
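Adding the count parameter lets callers drop several references with a single atomic_sub_return(). The single-reference helper in cgroup.h presumably stays a thin wrapper along these lines (a sketch; the CSS_ROOT test is how the "not for root cgroup" rule is assumed to be enforced and is not shown in this hunk):

/* Assumed inline caller, not part of this hunk: the root cgroup's css is
 * never released, so filter it out before dropping one reference. */
static inline void css_put(struct cgroup_subsys_state *css)
{
	if (!test_bit(CSS_ROOT, &css->flags))
		__css_put(css, 1);
}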
3721 4329
3722/* 4330/*
3723 * Notify userspace when a cgroup is released, by running the 4331 * Notify userspace when a cgroup is released, by running the
@@ -3799,8 +4407,11 @@ static int __init cgroup_disable(char *str)
3799 while ((token = strsep(&str, ",")) != NULL) { 4407 while ((token = strsep(&str, ",")) != NULL) {
3800 if (!*token) 4408 if (!*token)
3801 continue; 4409 continue;
3802 4410 /*
3803 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4411 * cgroup_disable, being at boot time, can't know about module
4412 * subsystems, so we don't worry about them.
4413 */
4414 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3804 struct cgroup_subsys *ss = subsys[i]; 4415 struct cgroup_subsys *ss = subsys[i];
3805 4416
3806 if (!strcmp(token, ss->name)) { 4417 if (!strcmp(token, ss->name)) {
@@ -3830,6 +4441,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
3830 return cssid->id; 4441 return cssid->id;
3831 return 0; 4442 return 0;
3832} 4443}
4444EXPORT_SYMBOL_GPL(css_id);
3833 4445
3834unsigned short css_depth(struct cgroup_subsys_state *css) 4446unsigned short css_depth(struct cgroup_subsys_state *css)
3835{ 4447{
@@ -3839,6 +4451,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
3839 return cssid->depth; 4451 return cssid->depth;
3840 return 0; 4452 return 0;
3841} 4453}
4454EXPORT_SYMBOL_GPL(css_depth);
3842 4455
3843bool css_is_ancestor(struct cgroup_subsys_state *child, 4456bool css_is_ancestor(struct cgroup_subsys_state *child,
3844 const struct cgroup_subsys_state *root) 4457 const struct cgroup_subsys_state *root)
@@ -3875,6 +4488,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
3875 spin_unlock(&ss->id_lock); 4488 spin_unlock(&ss->id_lock);
3876 call_rcu(&id->rcu_head, __free_css_id_cb); 4489 call_rcu(&id->rcu_head, __free_css_id_cb);
3877} 4490}
4491EXPORT_SYMBOL_GPL(free_css_id);
3878 4492
3879/* 4493/*
3880 * This is called by init or create(). Then, calls to this function are 4494 * This is called by init or create(). Then, calls to this function are
@@ -3924,15 +4538,14 @@ err_out:
3924 4538
3925} 4539}
3926 4540
3927static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) 4541static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4542 struct cgroup_subsys_state *rootcss)
3928{ 4543{
3929 struct css_id *newid; 4544 struct css_id *newid;
3930 struct cgroup_subsys_state *rootcss;
3931 4545
3932 spin_lock_init(&ss->id_lock); 4546 spin_lock_init(&ss->id_lock);
3933 idr_init(&ss->idr); 4547 idr_init(&ss->idr);
3934 4548
3935 rootcss = init_css_set.subsys[ss->subsys_id];
3936 newid = get_new_cssid(ss, 0); 4549 newid = get_new_cssid(ss, 0);
3937 if (IS_ERR(newid)) 4550 if (IS_ERR(newid))
3938 return PTR_ERR(newid); 4551 return PTR_ERR(newid);
@@ -3948,13 +4561,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
3948{ 4561{
3949 int subsys_id, i, depth = 0; 4562 int subsys_id, i, depth = 0;
3950 struct cgroup_subsys_state *parent_css, *child_css; 4563 struct cgroup_subsys_state *parent_css, *child_css;
3951 struct css_id *child_id, *parent_id = NULL; 4564 struct css_id *child_id, *parent_id;
3952 4565
3953 subsys_id = ss->subsys_id; 4566 subsys_id = ss->subsys_id;
3954 parent_css = parent->subsys[subsys_id]; 4567 parent_css = parent->subsys[subsys_id];
3955 child_css = child->subsys[subsys_id]; 4568 child_css = child->subsys[subsys_id];
3956 depth = css_depth(parent_css) + 1;
3957 parent_id = parent_css->id; 4569 parent_id = parent_css->id;
4570 depth = parent_id->depth;
3958 4571
3959 child_id = get_new_cssid(ss, depth); 4572 child_id = get_new_cssid(ss, depth);
3960 if (IS_ERR(child_id)) 4573 if (IS_ERR(child_id))
@@ -3992,6 +4605,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
3992 4605
3993 return rcu_dereference(cssid->css); 4606 return rcu_dereference(cssid->css);
3994} 4607}
4608EXPORT_SYMBOL_GPL(css_lookup);
3995 4609
3996/** 4610/**
3997 * css_get_next - lookup next cgroup under specified hierarchy. 4611 * css_get_next - lookup next cgroup under specified hierarchy.
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 59e9ef6aab40..e5c0244962b0 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -15,6 +15,7 @@
15 */ 15 */
16 16
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/slab.h>
18#include <linux/cgroup.h> 19#include <linux/cgroup.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
20#include <linux/uaccess.h> 21#include <linux/uaccess.h>
@@ -47,17 +48,20 @@ static inline struct freezer *task_freezer(struct task_struct *task)
47 struct freezer, css); 48 struct freezer, css);
48} 49}
49 50
50int cgroup_frozen(struct task_struct *task) 51int cgroup_freezing_or_frozen(struct task_struct *task)
51{ 52{
52 struct freezer *freezer; 53 struct freezer *freezer;
53 enum freezer_state state; 54 enum freezer_state state;
54 55
55 task_lock(task); 56 task_lock(task);
56 freezer = task_freezer(task); 57 freezer = task_freezer(task);
57 state = freezer->state; 58 if (!freezer->css.cgroup->parent)
59 state = CGROUP_THAWED; /* root cgroup can't be frozen */
60 else
61 state = freezer->state;
58 task_unlock(task); 62 task_unlock(task);
59 63
60 return state == CGROUP_FROZEN; 64 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
61} 65}
62 66
63/* 67/*
@@ -201,9 +205,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
201 * No lock is needed, since the task isn't on tasklist yet, 205 * No lock is needed, since the task isn't on tasklist yet,
202 * so it can't be moved to another cgroup, which means the 206 * so it can't be moved to another cgroup, which means the
203 * freezer won't be removed and will be valid during this 207 * freezer won't be removed and will be valid during this
204 * function call. 208 * function call. Nevertheless, apply RCU read-side critical
209 * section to suppress RCU lockdep false positives.
205 */ 210 */
211 rcu_read_lock();
206 freezer = task_freezer(task); 212 freezer = task_freezer(task);
213 rcu_read_unlock();
207 214
208 /* 215 /*
209 * The root cgroup is non-freezable, so we can skip the 216 * The root cgroup is non-freezable, so we can skip the
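Renaming cgroup_frozen() to cgroup_freezing_or_frozen() widens the check so the suspend/resume path can also leave alone tasks whose cgroup is still mid-freeze. A caller on the thaw side would look roughly like this sketch (assumed caller, not part of this diff):

/* Sketch: skip tasks whose cgroup freezer is FREEZING or FROZEN when the
 * system freezer thaws everything else (illustrative, not in this diff). */
#include <linux/sched.h>
#include <linux/freezer.h>

static void thaw_unmanaged_tasks(void)
{
	struct task_struct *g, *p;

	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		if (cgroup_freezing_or_frozen(p))
			continue;	/* its cgroup still wants it frozen */
		thaw_process(p);
	} while_each_thread(g, p);
	read_unlock(&tasklist_lock);
}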
diff --git a/kernel/compat.c b/kernel/compat.c
index f6c204f07ea6..7f40e9275fd9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -25,6 +25,7 @@
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/times.h> 26#include <linux/times.h>
27#include <linux/ptrace.h> 27#include <linux/ptrace.h>
28#include <linux/gfp.h>
28 29
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30 31
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1c8ddd6ee940..25bba73b1be3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -14,6 +14,7 @@
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/gfp.h>
17 18
18#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
19/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 20/* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -151,13 +152,13 @@ static inline void check_for_tasks(int cpu)
151 152
152 write_lock_irq(&tasklist_lock); 153 write_lock_irq(&tasklist_lock);
153 for_each_process(p) { 154 for_each_process(p) {
154 if (task_cpu(p) == cpu && 155 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
155 (!cputime_eq(p->utime, cputime_zero) || 156 (!cputime_eq(p->utime, cputime_zero) ||
156 !cputime_eq(p->stime, cputime_zero))) 157 !cputime_eq(p->stime, cputime_zero)))
157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 158 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
158 (state = %ld, flags = %x) \n", 159 "(state = %ld, flags = %x)\n",
159 p->comm, task_pid_nr(p), cpu, 160 p->comm, task_pid_nr(p), cpu,
160 p->state, p->flags); 161 p->state, p->flags);
161 } 162 }
162 write_unlock_irq(&tasklist_lock); 163 write_unlock_irq(&tasklist_lock);
163} 164}
@@ -338,7 +339,7 @@ int __cpuinit cpu_up(unsigned int cpu)
338 if (!cpu_possible(cpu)) { 339 if (!cpu_possible(cpu)) {
339 printk(KERN_ERR "can't online cpu %d because it is not " 340 printk(KERN_ERR "can't online cpu %d because it is not "
340 "configured as may-hotadd at boot time\n", cpu); 341 "configured as may-hotadd at boot time\n", cpu);
341#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 342#if defined(CONFIG_IA64)
342 printk(KERN_ERR "please check additional_cpus= boot " 343 printk(KERN_ERR "please check additional_cpus= boot "
343 "parameter\n"); 344 "parameter\n");
344#endif 345#endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ba401fab459f..d10946748ec2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 * call to guarantee_online_mems(), as we know no one is changing 920 * call to guarantee_online_mems(), as we know no one is changing
921 * our task's cpuset. 921 * our task's cpuset.
922 * 922 *
923 * Hold callback_mutex around the two modifications of our tasks
924 * mems_allowed to synchronize with cpuset_mems_allowed().
925 *
926 * While the mm_struct we are migrating is typically from some 923 * While the mm_struct we are migrating is typically from some
927 * other task, the task_struct mems_allowed that we are hacking 924 * other task, the task_struct mems_allowed that we are hacking
928 * is for our current task, which must allocate new pages for that 925 * is for our current task, which must allocate new pages for that
@@ -973,15 +970,20 @@ static void cpuset_change_nodemask(struct task_struct *p,
973 struct cpuset *cs; 970 struct cpuset *cs;
974 int migrate; 971 int migrate;
975 const nodemask_t *oldmem = scan->data; 972 const nodemask_t *oldmem = scan->data;
976 nodemask_t newmems; 973 NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
974
975 if (!newmems)
976 return;
977 977
978 cs = cgroup_cs(scan->cg); 978 cs = cgroup_cs(scan->cg);
979 guarantee_online_mems(cs, &newmems); 979 guarantee_online_mems(cs, newmems);
980 980
981 task_lock(p); 981 task_lock(p);
982 cpuset_change_task_nodemask(p, &newmems); 982 cpuset_change_task_nodemask(p, newmems);
983 task_unlock(p); 983 task_unlock(p);
984 984
985 NODEMASK_FREE(newmems);
986
985 mm = get_task_mm(p); 987 mm = get_task_mm(p);
986 if (!mm) 988 if (!mm)
987 return; 989 return;
@@ -1051,16 +1053,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1051static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 1053static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1052 const char *buf) 1054 const char *buf)
1053{ 1055{
1054 nodemask_t oldmem; 1056 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1055 int retval; 1057 int retval;
1056 struct ptr_heap heap; 1058 struct ptr_heap heap;
1057 1059
1060 if (!oldmem)
1061 return -ENOMEM;
1062
1058 /* 1063 /*
1059 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1064 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
1060 * it's read-only 1065 * it's read-only
1061 */ 1066 */
1062 if (cs == &top_cpuset) 1067 if (cs == &top_cpuset) {
1063 return -EACCES; 1068 retval = -EACCES;
1069 goto done;
1070 }
1064 1071
1065 /* 1072 /*
1066 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 1073 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
@@ -1076,11 +1083,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1076 goto done; 1083 goto done;
1077 1084
1078 if (!nodes_subset(trialcs->mems_allowed, 1085 if (!nodes_subset(trialcs->mems_allowed,
1079 node_states[N_HIGH_MEMORY])) 1086 node_states[N_HIGH_MEMORY])) {
1080 return -EINVAL; 1087 retval = -EINVAL;
1088 goto done;
1089 }
1081 } 1090 }
1082 oldmem = cs->mems_allowed; 1091 *oldmem = cs->mems_allowed;
1083 if (nodes_equal(oldmem, trialcs->mems_allowed)) { 1092 if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
1084 retval = 0; /* Too easy - nothing to do */ 1093 retval = 0; /* Too easy - nothing to do */
1085 goto done; 1094 goto done;
1086 } 1095 }
@@ -1096,10 +1105,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1096 cs->mems_allowed = trialcs->mems_allowed; 1105 cs->mems_allowed = trialcs->mems_allowed;
1097 mutex_unlock(&callback_mutex); 1106 mutex_unlock(&callback_mutex);
1098 1107
1099 update_tasks_nodemask(cs, &oldmem, &heap); 1108 update_tasks_nodemask(cs, oldmem, &heap);
1100 1109
1101 heap_free(&heap); 1110 heap_free(&heap);
1102done: 1111done:
1112 NODEMASK_FREE(oldmem);
1103 return retval; 1113 return retval;
1104} 1114}
1105 1115
@@ -1384,40 +1394,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1384 struct cgroup *oldcont, struct task_struct *tsk, 1394 struct cgroup *oldcont, struct task_struct *tsk,
1385 bool threadgroup) 1395 bool threadgroup)
1386{ 1396{
1387 nodemask_t from, to;
1388 struct mm_struct *mm; 1397 struct mm_struct *mm;
1389 struct cpuset *cs = cgroup_cs(cont); 1398 struct cpuset *cs = cgroup_cs(cont);
1390 struct cpuset *oldcs = cgroup_cs(oldcont); 1399 struct cpuset *oldcs = cgroup_cs(oldcont);
1400 NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
1401 NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
1402
1403 if (from == NULL || to == NULL)
1404 goto alloc_fail;
1391 1405
1392 if (cs == &top_cpuset) { 1406 if (cs == &top_cpuset) {
1393 cpumask_copy(cpus_attach, cpu_possible_mask); 1407 cpumask_copy(cpus_attach, cpu_possible_mask);
1394 to = node_possible_map;
1395 } else { 1408 } else {
1396 guarantee_online_cpus(cs, cpus_attach); 1409 guarantee_online_cpus(cs, cpus_attach);
1397 guarantee_online_mems(cs, &to);
1398 } 1410 }
1411 guarantee_online_mems(cs, to);
1399 1412
1400 /* do per-task migration stuff possibly for each in the threadgroup */ 1413 /* do per-task migration stuff possibly for each in the threadgroup */
1401 cpuset_attach_task(tsk, &to, cs); 1414 cpuset_attach_task(tsk, to, cs);
1402 if (threadgroup) { 1415 if (threadgroup) {
1403 struct task_struct *c; 1416 struct task_struct *c;
1404 rcu_read_lock(); 1417 rcu_read_lock();
1405 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1418 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1406 cpuset_attach_task(c, &to, cs); 1419 cpuset_attach_task(c, to, cs);
1407 } 1420 }
1408 rcu_read_unlock(); 1421 rcu_read_unlock();
1409 } 1422 }
1410 1423
1411 /* change mm; only needs to be done once even if threadgroup */ 1424 /* change mm; only needs to be done once even if threadgroup */
1412 from = oldcs->mems_allowed; 1425 *from = oldcs->mems_allowed;
1413 to = cs->mems_allowed; 1426 *to = cs->mems_allowed;
1414 mm = get_task_mm(tsk); 1427 mm = get_task_mm(tsk);
1415 if (mm) { 1428 if (mm) {
1416 mpol_rebind_mm(mm, &to); 1429 mpol_rebind_mm(mm, to);
1417 if (is_memory_migrate(cs)) 1430 if (is_memory_migrate(cs))
1418 cpuset_migrate_mm(mm, &from, &to); 1431 cpuset_migrate_mm(mm, from, to);
1419 mmput(mm); 1432 mmput(mm);
1420 } 1433 }
1434
1435alloc_fail:
1436 NODEMASK_FREE(from);
1437 NODEMASK_FREE(to);
1421} 1438}
1422 1439
1423/* The various types of files and directories in a cpuset file system */ 1440/* The various types of files and directories in a cpuset file system */
@@ -1562,13 +1579,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1562 1579
1563static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1580static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1564{ 1581{
1565 nodemask_t mask; 1582 NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
1583 int retval;
1584
1585 if (mask == NULL)
1586 return -ENOMEM;
1566 1587
1567 mutex_lock(&callback_mutex); 1588 mutex_lock(&callback_mutex);
1568 mask = cs->mems_allowed; 1589 *mask = cs->mems_allowed;
1569 mutex_unlock(&callback_mutex); 1590 mutex_unlock(&callback_mutex);
1570 1591
1571 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1592 retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
1593
1594 NODEMASK_FREE(mask);
1595
1596 return retval;
1572} 1597}
1573 1598
1574static ssize_t cpuset_common_file_read(struct cgroup *cont, 1599static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1997,7 +2022,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1997 struct cpuset *cp; /* scans cpusets being updated */ 2022 struct cpuset *cp; /* scans cpusets being updated */
1998 struct cpuset *child; /* scans child cpusets of cp */ 2023 struct cpuset *child; /* scans child cpusets of cp */
1999 struct cgroup *cont; 2024 struct cgroup *cont;
2000 nodemask_t oldmems; 2025 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2026
2027 if (oldmems == NULL)
2028 return;
2001 2029
2002 list_add_tail((struct list_head *)&root->stack_list, &queue); 2030 list_add_tail((struct list_head *)&root->stack_list, &queue);
2003 2031
@@ -2014,7 +2042,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2042 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2015 continue; 2043 continue;
2016 2044
2017 oldmems = cp->mems_allowed; 2045 *oldmems = cp->mems_allowed;
2018 2046
2019 /* Remove offline cpus and mems from this cpuset. */ 2047 /* Remove offline cpus and mems from this cpuset. */
2020 mutex_lock(&callback_mutex); 2048 mutex_lock(&callback_mutex);
@@ -2030,9 +2058,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2030 remove_tasks_in_empty_cpuset(cp); 2058 remove_tasks_in_empty_cpuset(cp);
2031 else { 2059 else {
2032 update_tasks_cpumask(cp, NULL); 2060 update_tasks_cpumask(cp, NULL);
2033 update_tasks_nodemask(cp, &oldmems, NULL); 2061 update_tasks_nodemask(cp, oldmems, NULL);
2034 } 2062 }
2035 } 2063 }
2064 NODEMASK_FREE(oldmems);
2036} 2065}
2037 2066
2038/* 2067/*
@@ -2090,20 +2119,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2090static int cpuset_track_online_nodes(struct notifier_block *self, 2119static int cpuset_track_online_nodes(struct notifier_block *self,
2091 unsigned long action, void *arg) 2120 unsigned long action, void *arg)
2092{ 2121{
2122 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2123
2124 if (oldmems == NULL)
2125 return NOTIFY_DONE;
2126
2093 cgroup_lock(); 2127 cgroup_lock();
2094 switch (action) { 2128 switch (action) {
2095 case MEM_ONLINE: 2129 case MEM_ONLINE:
2096 case MEM_OFFLINE: 2130 *oldmems = top_cpuset.mems_allowed;
2097 mutex_lock(&callback_mutex); 2131 mutex_lock(&callback_mutex);
2098 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2132 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2099 mutex_unlock(&callback_mutex); 2133 mutex_unlock(&callback_mutex);
2100 if (action == MEM_OFFLINE) 2134 update_tasks_nodemask(&top_cpuset, oldmems, NULL);
2101 scan_for_empty_cpusets(&top_cpuset); 2135 break;
2136 case MEM_OFFLINE:
2137 /*
2138 * needn't update top_cpuset.mems_allowed explicitly because
2139 * scan_for_empty_cpusets() will update it.
2140 */
2141 scan_for_empty_cpusets(&top_cpuset);
2102 break; 2142 break;
2103 default: 2143 default:
2104 break; 2144 break;
2105 } 2145 }
2106 cgroup_unlock(); 2146 cgroup_unlock();
2147
2148 NODEMASK_FREE(oldmems);
2107 return NOTIFY_OK; 2149 return NOTIFY_OK;
2108} 2150}
2109#endif 2151#endif
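All of the cpuset hunks above are instances of one conversion: nodemask_t grows with MAX_NUMNODES, so on-stack copies are replaced by NODEMASK_ALLOC()/NODEMASK_FREE(), which may kmalloc the mask on large configurations. The shape of the pattern, shown on a made-up helper that mirrors cpuset_sprintf_memlist():

/* Illustrative helper (hypothetical name) showing the NODEMASK_ALLOC
 * pattern used throughout the hunks above: allocate, NULL-check, copy
 * under callback_mutex, free. */
static int example_copy_mems(struct cpuset *cs, nodemask_t *out)
{
	NODEMASK_ALLOC(nodemask_t, tmp, GFP_KERNEL);

	if (tmp == NULL)
		return -ENOMEM;

	mutex_lock(&callback_mutex);
	*tmp = cs->mems_allowed;
	mutex_unlock(&callback_mutex);

	*out = *tmp;
	NODEMASK_FREE(tmp);
	return 0;
}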
diff --git a/kernel/cred.c b/kernel/cred.c
index dd76cfe5f5b0..62af1816c235 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -10,6 +10,7 @@
10 */ 10 */
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/cred.h> 12#include <linux/cred.h>
13#include <linux/slab.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/key.h> 15#include <linux/key.h>
15#include <linux/keyctl.h> 16#include <linux/keyctl.h>
@@ -224,7 +225,7 @@ struct cred *cred_alloc_blank(void)
224#ifdef CONFIG_KEYS 225#ifdef CONFIG_KEYS
225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); 226 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
226 if (!new->tgcred) { 227 if (!new->tgcred) {
227 kfree(new); 228 kmem_cache_free(cred_jar, new);
228 return NULL; 229 return NULL;
229 } 230 }
230 atomic_set(&new->tgcred->usage, 1); 231 atomic_set(&new->tgcred->usage, 1);
@@ -364,7 +365,7 @@ struct cred *prepare_usermodehelper_creds(void)
364 365
365 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC); 366 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
366 if (!new) 367 if (!new)
367 return NULL; 368 goto free_tgcred;
368 369
369 kdebug("prepare_usermodehelper_creds() alloc %p", new); 370 kdebug("prepare_usermodehelper_creds() alloc %p", new);
370 371
@@ -398,6 +399,12 @@ struct cred *prepare_usermodehelper_creds(void)
398error: 399error:
399 put_cred(new); 400 put_cred(new);
400 return NULL; 401 return NULL;
402
403free_tgcred:
404#ifdef CONFIG_KEYS
405 kfree(tgcred);
406#endif
407 return NULL;
401} 408}
402 409
403/* 410/*
@@ -786,8 +793,6 @@ bool creds_are_invalid(const struct cred *cred)
786{ 793{
787 if (cred->magic != CRED_MAGIC) 794 if (cred->magic != CRED_MAGIC)
788 return true; 795 return true;
789 if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
790 return true;
791#ifdef CONFIG_SECURITY_SELINUX 796#ifdef CONFIG_SECURITY_SELINUX
792 if (selinux_is_enabled()) { 797 if (selinux_is_enabled()) {
793 if ((unsigned long) cred->security < PAGE_SIZE) 798 if ((unsigned long) cred->security < PAGE_SIZE)
diff --git a/kernel/early_res.c b/kernel/early_res.c
new file mode 100644
index 000000000000..31aa9332ef3f
--- /dev/null
+++ b/kernel/early_res.c
@@ -0,0 +1,584 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10
11/*
12 * Early reserved memory areas.
13 */
14/*
15 * need to make sure this one is big enough before
16 * find_fw_memmap_area could be used
17 */
18#define MAX_EARLY_RES_X 32
19
20struct early_res {
21 u64 start, end;
22 char name[15];
23 char overlap_ok;
24};
25static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
26
27static int max_early_res __initdata = MAX_EARLY_RES_X;
28static struct early_res *early_res __initdata = &early_res_x[0];
29static int early_res_count __initdata;
30
31static int __init find_overlapped_early(u64 start, u64 end)
32{
33 int i;
34 struct early_res *r;
35
36 for (i = 0; i < max_early_res && early_res[i].end; i++) {
37 r = &early_res[i];
38 if (end > r->start && start < r->end)
39 break;
40 }
41
42 return i;
43}
44
45/*
46 * Drop the i-th range from the early reservation map,
47 * by copying any higher ranges down one over it, and
48 * clearing what had been the last slot.
49 */
50static void __init drop_range(int i)
51{
52 int j;
53
54 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
55 ;
56
57 memmove(&early_res[i], &early_res[i + 1],
58 (j - 1 - i) * sizeof(struct early_res));
59
60 early_res[j - 1].end = 0;
61 early_res_count--;
62}
63
64static void __init drop_range_partial(int i, u64 start, u64 end)
65{
66 u64 common_start, common_end;
67 u64 old_start, old_end;
68
69 old_start = early_res[i].start;
70 old_end = early_res[i].end;
71 common_start = max(old_start, start);
72 common_end = min(old_end, end);
73
74 /* no overlap ? */
75 if (common_start >= common_end)
76 return;
77
78 if (old_start < common_start) {
79 /* make head segment */
80 early_res[i].end = common_start;
81 if (old_end > common_end) {
82 char name[15];
83
84 /*
85 * Save a local copy of the name, since the
86 * early_res array could get resized inside
87 * reserve_early_without_check() ->
88 * __check_and_double_early_res(), which would
89 * make the current name pointer invalid.
90 */
91 strncpy(name, early_res[i].name,
92 sizeof(early_res[i].name) - 1);
93 /* add another for left over on tail */
94 reserve_early_without_check(common_end, old_end, name);
95 }
96 return;
97 } else {
98 if (old_end > common_end) {
99 /* reuse the entry for tail left */
100 early_res[i].start = common_end;
101 return;
102 }
103 /* all covered */
104 drop_range(i);
105 }
106}
107
108/*
109 * Split any existing ranges that:
110 * 1) are marked 'overlap_ok', and
111 * 2) overlap with the stated range [start, end)
112 * into whatever portion (if any) of the existing range is entirely
113 * below or entirely above the stated range. Drop the portion
114 * of the existing range that overlaps with the stated range,
115 * which will allow the caller of this routine to then add that
116 * stated range without conflicting with any existing range.
117 */
118static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
119{
120 int i;
121 struct early_res *r;
122 u64 lower_start, lower_end;
123 u64 upper_start, upper_end;
124 char name[15];
125
126 for (i = 0; i < max_early_res && early_res[i].end; i++) {
127 r = &early_res[i];
128
129 /* Continue past non-overlapping ranges */
130 if (end <= r->start || start >= r->end)
131 continue;
132
133 /*
134 * Leave non-ok overlaps as is; let caller
135 * panic "Overlapping early reservations"
136 * when it hits this overlap.
137 */
138 if (!r->overlap_ok)
139 return;
140
141 /*
142 * We have an ok overlap. We will drop it from the early
143 * reservation map, and add back in any non-overlapping
144 * portions (lower or upper) as separate, overlap_ok,
145 * non-overlapping ranges.
146 */
147
148 /* 1. Note any non-overlapping (lower or upper) ranges. */
149 strncpy(name, r->name, sizeof(name) - 1);
150
151 lower_start = lower_end = 0;
152 upper_start = upper_end = 0;
153 if (r->start < start) {
154 lower_start = r->start;
155 lower_end = start;
156 }
157 if (r->end > end) {
158 upper_start = end;
159 upper_end = r->end;
160 }
161
162 /* 2. Drop the original ok overlapping range */
163 drop_range(i);
164
165 i--; /* resume for-loop on copied down entry */
166
167 /* 3. Add back in any non-overlapping ranges. */
168 if (lower_end)
169 reserve_early_overlap_ok(lower_start, lower_end, name);
170 if (upper_end)
171 reserve_early_overlap_ok(upper_start, upper_end, name);
172 }
173}
174
175static void __init __reserve_early(u64 start, u64 end, char *name,
176 int overlap_ok)
177{
178 int i;
179 struct early_res *r;
180
181 i = find_overlapped_early(start, end);
182 if (i >= max_early_res)
183 panic("Too many early reservations");
184 r = &early_res[i];
185 if (r->end)
186 panic("Overlapping early reservations "
187 "%llx-%llx %s to %llx-%llx %s\n",
188 start, end - 1, name ? name : "", r->start,
189 r->end - 1, r->name);
190 r->start = start;
191 r->end = end;
192 r->overlap_ok = overlap_ok;
193 if (name)
194 strncpy(r->name, name, sizeof(r->name) - 1);
195 early_res_count++;
196}
197
198/*
199 * A few early reservations come here.
200 *
201 * The 'overlap_ok' in the name of this routine does -not- mean it
202 * is ok for these reservations to overlap an earlier reservation.
203 * Rather it means that it is ok for subsequent reservations to
204 * overlap this one.
205 *
206 * Use this entry point to reserve early ranges when you are doing
207 * so out of "Paranoia", reserving perhaps more memory than you need,
208 * just in case, and don't mind a subsequent overlapping reservation
209 * that is known to be needed.
210 *
211 * The drop_overlaps_that_are_ok() call here isn't really needed.
212 * It would be needed if we had two colliding 'overlap_ok'
213 * reservations, so that the second such would not panic on the
214 * overlap with the first. We don't have any such as of this
215 * writing, but might as well tolerate such if it happens in
216 * the future.
217 */
218void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
219{
220 drop_overlaps_that_are_ok(start, end);
221 __reserve_early(start, end, name, 1);
222}
223
224static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
225{
226 u64 start, end, size, mem;
227 struct early_res *new;
228
229 /* do we have enough slots left ? */
230 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
231 return;
232
233 /* double it */
234 mem = -1ULL;
235 size = sizeof(struct early_res) * max_early_res * 2;
236 if (early_res == early_res_x)
237 start = 0;
238 else
239 start = early_res[0].end;
240 end = ex_start;
241 if (start + size < end)
242 mem = find_fw_memmap_area(start, end, size,
243 sizeof(struct early_res));
244 if (mem == -1ULL) {
245 start = ex_end;
246 end = get_max_mapped();
247 if (start + size < end)
248 mem = find_fw_memmap_area(start, end, size,
249 sizeof(struct early_res));
250 }
251 if (mem == -1ULL)
252 panic("can not find more space for early_res array");
253
254 new = __va(mem);
255 /* save the first one for own */
256 new[0].start = mem;
257 new[0].end = mem + size;
258 new[0].overlap_ok = 0;
259 /* copy old to new */
260 if (early_res == early_res_x) {
261 memcpy(&new[1], &early_res[0],
262 sizeof(struct early_res) * max_early_res);
263 memset(&new[max_early_res+1], 0,
264 sizeof(struct early_res) * (max_early_res - 1));
265 early_res_count++;
266 } else {
267 memcpy(&new[1], &early_res[1],
268 sizeof(struct early_res) * (max_early_res - 1));
269 memset(&new[max_early_res], 0,
270 sizeof(struct early_res) * max_early_res);
271 }
272 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
273 early_res = new;
274 max_early_res *= 2;
275 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
276 max_early_res, mem, mem + size - 1);
277}
278
279/*
280 * Most early reservations come here.
281 *
282 * We first have drop_overlaps_that_are_ok() drop any pre-existing
283 * 'overlap_ok' ranges, so that we can then reserve this memory
284 * range without risk of panic'ing on an overlapping overlap_ok
285 * early reservation.
286 */
287void __init reserve_early(u64 start, u64 end, char *name)
288{
289 if (start >= end)
290 return;
291
292 __check_and_double_early_res(start, end);
293
294 drop_overlaps_that_are_ok(start, end);
295 __reserve_early(start, end, name, 0);
296}
297
298void __init reserve_early_without_check(u64 start, u64 end, char *name)
299{
300 struct early_res *r;
301
302 if (start >= end)
303 return;
304
305 __check_and_double_early_res(start, end);
306
307 r = &early_res[early_res_count];
308
309 r->start = start;
310 r->end = end;
311 r->overlap_ok = 0;
312 if (name)
313 strncpy(r->name, name, sizeof(r->name) - 1);
314 early_res_count++;
315}
316
317void __init free_early(u64 start, u64 end)
318{
319 struct early_res *r;
320 int i;
321
322 i = find_overlapped_early(start, end);
323 r = &early_res[i];
324 if (i >= max_early_res || r->end != end || r->start != start)
325 panic("free_early on not reserved area: %llx-%llx!",
326 start, end - 1);
327
328 drop_range(i);
329}
330
331void __init free_early_partial(u64 start, u64 end)
332{
333 struct early_res *r;
334 int i;
335
336 if (start == end)
337 return;
338
339 if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end))
340 return;
341
342try_next:
343 i = find_overlapped_early(start, end);
344 if (i >= max_early_res)
345 return;
346
347 r = &early_res[i];
348 /* hole ? */
349 if (r->end >= end && r->start <= start) {
350 drop_range_partial(i, start, end);
351 return;
352 }
353
354 drop_range_partial(i, start, end);
355 goto try_next;
356}
357
358#ifdef CONFIG_NO_BOOTMEM
359static void __init subtract_early_res(struct range *range, int az)
360{
361 int i, count;
362 u64 final_start, final_end;
363 int idx = 0;
364
365 count = 0;
366 for (i = 0; i < max_early_res && early_res[i].end; i++)
367 count++;
368
369 /* need to skip first one ?*/
370 if (early_res != early_res_x)
371 idx = 1;
372
373#define DEBUG_PRINT_EARLY_RES 1
374
375#if DEBUG_PRINT_EARLY_RES
376 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
377#endif
378 for (i = idx; i < count; i++) {
379 struct early_res *r = &early_res[i];
380#if DEBUG_PRINT_EARLY_RES
381 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
382 r->start, r->end, r->name);
383#endif
384 final_start = PFN_DOWN(r->start);
385 final_end = PFN_UP(r->end);
386 if (final_start >= final_end)
387 continue;
388 subtract_range(range, az, final_start, final_end);
389 }
390
391}
392
393int __init get_free_all_memory_range(struct range **rangep, int nodeid)
394{
395 int i, count;
396 u64 start = 0, end;
397 u64 size;
398 u64 mem;
399 struct range *range;
400 int nr_range;
401
402 count = 0;
403 for (i = 0; i < max_early_res && early_res[i].end; i++)
404 count++;
405
406 count *= 2;
407
408 size = sizeof(struct range) * count;
409 end = get_max_mapped();
410#ifdef MAX_DMA32_PFN
411 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
412 start = MAX_DMA32_PFN << PAGE_SHIFT;
413#endif
414 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
415 if (mem == -1ULL)
416 panic("can not find more space for range free");
417
418 range = __va(mem);
419 /* use early_node_map[] and early_res to get range array at first */
420 memset(range, 0, size);
421 nr_range = 0;
422
423 /* need to go over early_node_map to find out good range for node */
424 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
425#ifdef CONFIG_X86_32
426 subtract_range(range, count, max_low_pfn, -1ULL);
427#endif
428 subtract_early_res(range, count);
429 nr_range = clean_sort_range(range, count);
430
431 /* need to clear it ? */
432 if (nodeid == MAX_NUMNODES) {
433 memset(&early_res[0], 0,
434 sizeof(struct early_res) * max_early_res);
435 early_res = NULL;
436 max_early_res = 0;
437 }
438
439 *rangep = range;
440 return nr_range;
441}
442#else
443void __init early_res_to_bootmem(u64 start, u64 end)
444{
445 int i, count;
446 u64 final_start, final_end;
447 int idx = 0;
448
449 count = 0;
450 for (i = 0; i < max_early_res && early_res[i].end; i++)
451 count++;
452
453 /* need to skip first one ?*/
454 if (early_res != early_res_x)
455 idx = 1;
456
457 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
458 count - idx, max_early_res, start, end);
459 for (i = idx; i < count; i++) {
460 struct early_res *r = &early_res[i];
461 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
462 r->start, r->end, r->name);
463 final_start = max(start, r->start);
464 final_end = min(end, r->end);
465 if (final_start >= final_end) {
466 printk(KERN_CONT "\n");
467 continue;
468 }
469 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
470 final_start, final_end);
471 reserve_bootmem_generic(final_start, final_end - final_start,
472 BOOTMEM_DEFAULT);
473 }
474 /* clear them */
475 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
476 early_res = NULL;
477 max_early_res = 0;
478 early_res_count = 0;
479}
480#endif
481
482/* Check for already reserved areas */
483static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
484{
485 int i;
486 u64 addr = *addrp;
487 int changed = 0;
488 struct early_res *r;
489again:
490 i = find_overlapped_early(addr, addr + size);
491 r = &early_res[i];
492 if (i < max_early_res && r->end) {
493 *addrp = addr = round_up(r->end, align);
494 changed = 1;
495 goto again;
496 }
497 return changed;
498}
499
500/* Check for already reserved areas */
501static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
502{
503 int i;
504 u64 addr = *addrp, last;
505 u64 size = *sizep;
506 int changed = 0;
507again:
508 last = addr + size;
509 for (i = 0; i < max_early_res && early_res[i].end; i++) {
510 struct early_res *r = &early_res[i];
511 if (last > r->start && addr < r->start) {
512 size = r->start - addr;
513 changed = 1;
514 goto again;
515 }
516 if (last > r->end && addr < r->end) {
517 addr = round_up(r->end, align);
518 size = last - addr;
519 changed = 1;
520 goto again;
521 }
522 if (last <= r->end && addr >= r->start) {
523 (*sizep)++;
524 return 0;
525 }
526 }
527 if (changed) {
528 *addrp = addr;
529 *sizep = size;
530 }
531 return changed;
532}
533
534/*
535 * Find a free area with specified alignment in a specific range.
536 * only the area between start and end that is an active range from
537 * early_node_map is used, so it is known to be usable RAM
538 */
539u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
540 u64 size, u64 align)
541{
542 u64 addr, last;
543
544 addr = round_up(ei_start, align);
545 if (addr < start)
546 addr = round_up(start, align);
547 if (addr >= ei_last)
548 goto out;
549 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
550 ;
551 last = addr + size;
552 if (last > ei_last)
553 goto out;
554 if (last > end)
555 goto out;
556
557 return addr;
558
559out:
560 return -1ULL;
561}
562
563u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
564 u64 *sizep, u64 align)
565{
566 u64 addr, last;
567
568 addr = round_up(ei_start, align);
569 if (addr < start)
570 addr = round_up(start, align);
571 if (addr >= ei_last)
572 goto out;
573 *sizep = ei_last - addr;
574 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
575 ;
576 last = addr + *sizep;
577 if (last > ei_last)
578 goto out;
579
580 return addr;
581
582out:
583 return -1ULL;
584}
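Putting the new file's API together: arch setup code typically searches for a free block with find_early_area(), claims it with reserve_early(), and may later hand it back with free_early() or let early_res_to_bootmem()/get_free_all_memory_range() dissolve the table. A sketch follows; the size, range and name string are made-up examples.

/* Sketch of typical early_res usage from early arch setup code
 * (range, size and name string are made-up examples). */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/early_res.h>

static u64 __init demo_grab_early_block(u64 limit)
{
	u64 size = 16 * 1024;
	/* look for 16KB, page aligned, anywhere in [1MB, limit) */
	u64 addr = find_early_area(1ULL << 20, limit, 1ULL << 20, limit,
				   size, PAGE_SIZE);

	if (addr == -1ULL)
		return addr;
	/* record it so later reservations and the bootmem handoff skip it */
	reserve_early(addr, addr + size, "DEMO_BLOCK");
	return addr;
}

static void __init demo_release_early_block(u64 addr)
{
	if (addr != -1ULL)
		free_early(addr, addr + 16 * 1024);
}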
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
new file mode 100644
index 000000000000..ff915efef66d
--- /dev/null
+++ b/kernel/elfcore.c
@@ -0,0 +1,28 @@
1#include <linux/elf.h>
2#include <linux/fs.h>
3#include <linux/mm.h>
4
5#include <asm/elf.h>
6
7
8Elf_Half __weak elf_core_extra_phdrs(void)
9{
10 return 0;
11}
12
13int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
14 unsigned long limit)
15{
16 return 1;
17}
18
19int __weak elf_core_write_extra_data(struct file *file, size_t *size,
20 unsigned long limit)
21{
22 return 1;
23}
24
25size_t __weak elf_core_extra_data_size(void)
26{
27 return 0;
28}
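These __weak definitions are the do-nothing defaults; note that the two write hooks return 1 (success) so the generic ELF dumper keeps going when an architecture has nothing extra to emit. An architecture that does want extra core-dump data provides strong definitions of the same symbols, e.g. (a sketch; the one-page payload is a made-up example):

/* Sketch of an arch-side override (hypothetical): advertise one extra
 * program header and one page of extra dump data. The write hooks would
 * be overridden the same way, returning nonzero on success. */
#include <linux/elf.h>
#include <linux/mm.h>

#include <asm/elf.h>

Elf_Half elf_core_extra_phdrs(void)
{
	return 1;		/* one additional program header */
}

size_t elf_core_extra_data_size(void)
{
	return PAGE_SIZE;	/* payload described by that extra header */
}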
diff --git a/kernel/exit.c b/kernel/exit.c
index 546774a31a66..7f2683a10ac4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -85,7 +85,9 @@ static void __exit_signal(struct task_struct *tsk)
85 BUG_ON(!sig); 85 BUG_ON(!sig);
86 BUG_ON(!atomic_read(&sig->count)); 86 BUG_ON(!atomic_read(&sig->count));
87 87
88 sighand = rcu_dereference(tsk->sighand); 88 sighand = rcu_dereference_check(tsk->sighand,
89 rcu_read_lock_held() ||
90 lockdep_tasklist_lock_is_held());
89 spin_lock(&sighand->siglock); 91 spin_lock(&sighand->siglock);
90 92
91 posix_cpu_timers_exit(tsk); 93 posix_cpu_timers_exit(tsk);
@@ -170,8 +172,10 @@ void release_task(struct task_struct * p)
170repeat: 172repeat:
171 tracehook_prepare_release_task(p); 173 tracehook_prepare_release_task(p);
172 /* don't need to get the RCU readlock here - the process is dead and 174 /* don't need to get the RCU readlock here - the process is dead and
173 * can't be modifying its own credentials */ 175 * can't be modifying its own credentials. But shut RCU-lockdep up */
176 rcu_read_lock();
174 atomic_dec(&__task_cred(p)->user->processes); 177 atomic_dec(&__task_cred(p)->user->processes);
178 rcu_read_unlock();
175 179
176 proc_flush_task(p); 180 proc_flush_task(p);
177 181
@@ -473,9 +477,11 @@ static void close_files(struct files_struct * files)
473 /* 477 /*
474 * It is safe to dereference the fd table without RCU or 478 * It is safe to dereference the fd table without RCU or
475 * ->file_lock because this is the last reference to the 479 * ->file_lock because this is the last reference to the
476 * files structure. 480 * files structure. But use RCU to shut RCU-lockdep up.
477 */ 481 */
482 rcu_read_lock();
478 fdt = files_fdtable(files); 483 fdt = files_fdtable(files);
484 rcu_read_unlock();
479 for (;;) { 485 for (;;) {
480 unsigned long set; 486 unsigned long set;
481 i = j * __NFDBITS; 487 i = j * __NFDBITS;
@@ -521,10 +527,12 @@ void put_files_struct(struct files_struct *files)
521 * at the end of the RCU grace period. Otherwise, 527 * at the end of the RCU grace period. Otherwise,
522 * you can free files immediately. 528 * you can free files immediately.
523 */ 529 */
530 rcu_read_lock();
524 fdt = files_fdtable(files); 531 fdt = files_fdtable(files);
525 if (fdt != &files->fdtab) 532 if (fdt != &files->fdtab)
526 kmem_cache_free(files_cachep, files); 533 kmem_cache_free(files_cachep, files);
527 free_fdtable(fdt); 534 free_fdtable(fdt);
535 rcu_read_unlock();
528 } 536 }
529} 537}
530 538
@@ -944,7 +952,9 @@ NORET_TYPE void do_exit(long code)
944 preempt_count()); 952 preempt_count());
945 953
946 acct_update_integrals(tsk); 954 acct_update_integrals(tsk);
947 955 /* sync mm's RSS info before statistics gathering */
956 if (tsk->mm)
957 sync_mm_rss(tsk, tsk->mm);
948 group_dead = atomic_dec_and_test(&tsk->signal->live); 958 group_dead = atomic_dec_and_test(&tsk->signal->live);
949 if (group_dead) { 959 if (group_dead) {
950 hrtimer_cancel(&tsk->signal->real_timer); 960 hrtimer_cancel(&tsk->signal->real_timer);
@@ -1180,7 +1190,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1180 1190
1181 if (unlikely(wo->wo_flags & WNOWAIT)) { 1191 if (unlikely(wo->wo_flags & WNOWAIT)) {
1182 int exit_code = p->exit_code; 1192 int exit_code = p->exit_code;
1183 int why, status; 1193 int why;
1184 1194
1185 get_task_struct(p); 1195 get_task_struct(p);
1186 read_unlock(&tasklist_lock); 1196 read_unlock(&tasklist_lock);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5b2959b3ffc2..44b0791b0a2e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -87,6 +87,14 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
87 87
88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
89 89
90#ifdef CONFIG_PROVE_RCU
91int lockdep_tasklist_lock_is_held(void)
92{
93 return lockdep_is_held(&tasklist_lock);
94}
95EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
96#endif /* #ifdef CONFIG_PROVE_RCU */
97
90int nr_processes(void) 98int nr_processes(void)
91{ 99{
92 int cpu; 100 int cpu;
@@ -328,15 +336,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
328 if (!tmp) 336 if (!tmp)
329 goto fail_nomem; 337 goto fail_nomem;
330 *tmp = *mpnt; 338 *tmp = *mpnt;
339 INIT_LIST_HEAD(&tmp->anon_vma_chain);
331 pol = mpol_dup(vma_policy(mpnt)); 340 pol = mpol_dup(vma_policy(mpnt));
332 retval = PTR_ERR(pol); 341 retval = PTR_ERR(pol);
333 if (IS_ERR(pol)) 342 if (IS_ERR(pol))
334 goto fail_nomem_policy; 343 goto fail_nomem_policy;
335 vma_set_policy(tmp, pol); 344 vma_set_policy(tmp, pol);
345 if (anon_vma_fork(tmp, mpnt))
346 goto fail_nomem_anon_vma_fork;
336 tmp->vm_flags &= ~VM_LOCKED; 347 tmp->vm_flags &= ~VM_LOCKED;
337 tmp->vm_mm = mm; 348 tmp->vm_mm = mm;
338 tmp->vm_next = NULL; 349 tmp->vm_next = NULL;
339 anon_vma_link(tmp);
340 file = tmp->vm_file; 350 file = tmp->vm_file;
341 if (file) { 351 if (file) {
342 struct inode *inode = file->f_path.dentry->d_inode; 352 struct inode *inode = file->f_path.dentry->d_inode;
@@ -391,6 +401,8 @@ out:
391 flush_tlb_mm(oldmm); 401 flush_tlb_mm(oldmm);
392 up_write(&oldmm->mmap_sem); 402 up_write(&oldmm->mmap_sem);
393 return retval; 403 return retval;
404fail_nomem_anon_vma_fork:
405 mpol_put(pol);
394fail_nomem_policy: 406fail_nomem_policy:
395 kmem_cache_free(vm_area_cachep, tmp); 407 kmem_cache_free(vm_area_cachep, tmp);
396fail_nomem: 408fail_nomem:
@@ -454,8 +466,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
454 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; 466 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
455 mm->core_state = NULL; 467 mm->core_state = NULL;
456 mm->nr_ptes = 0; 468 mm->nr_ptes = 0;
457 set_mm_counter(mm, file_rss, 0); 469 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
458 set_mm_counter(mm, anon_rss, 0);
459 spin_lock_init(&mm->page_table_lock); 470 spin_lock_init(&mm->page_table_lock);
460 mm->free_area_cache = TASK_UNMAPPED_BASE; 471 mm->free_area_cache = TASK_UNMAPPED_BASE;
461 mm->cached_hole_size = ~0UL; 472 mm->cached_hole_size = ~0UL;
@@ -824,23 +835,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
824 */ 835 */
825static void posix_cpu_timers_init_group(struct signal_struct *sig) 836static void posix_cpu_timers_init_group(struct signal_struct *sig)
826{ 837{
838 unsigned long cpu_limit;
839
827 /* Thread group counters. */ 840 /* Thread group counters. */
828 thread_group_cputime_init(sig); 841 thread_group_cputime_init(sig);
829 842
830 /* Expiration times and increments. */ 843 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
831 sig->it[CPUCLOCK_PROF].expires = cputime_zero; 844 if (cpu_limit != RLIM_INFINITY) {
832 sig->it[CPUCLOCK_PROF].incr = cputime_zero; 845 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
833 sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
834 sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
835
836 /* Cached expiration times. */
837 sig->cputime_expires.prof_exp = cputime_zero;
838 sig->cputime_expires.virt_exp = cputime_zero;
839 sig->cputime_expires.sched_exp = 0;
840
841 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
842 sig->cputime_expires.prof_exp =
843 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
844 sig->cputimer.running = 1; 846 sig->cputimer.running = 1;
845 } 847 }
846 848
@@ -857,7 +859,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
857 if (clone_flags & CLONE_THREAD) 859 if (clone_flags & CLONE_THREAD)
858 return 0; 860 return 0;
859 861
860 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 862 sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
861 tsk->signal = sig; 863 tsk->signal = sig;
862 if (!sig) 864 if (!sig)
863 return -ENOMEM; 865 return -ENOMEM;
@@ -865,46 +867,21 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
865 atomic_set(&sig->count, 1); 867 atomic_set(&sig->count, 1);
866 atomic_set(&sig->live, 1); 868 atomic_set(&sig->live, 1);
867 init_waitqueue_head(&sig->wait_chldexit); 869 init_waitqueue_head(&sig->wait_chldexit);
868 sig->flags = 0;
869 if (clone_flags & CLONE_NEWPID) 870 if (clone_flags & CLONE_NEWPID)
870 sig->flags |= SIGNAL_UNKILLABLE; 871 sig->flags |= SIGNAL_UNKILLABLE;
871 sig->group_exit_code = 0;
872 sig->group_exit_task = NULL;
873 sig->group_stop_count = 0;
874 sig->curr_target = tsk; 872 sig->curr_target = tsk;
875 init_sigpending(&sig->shared_pending); 873 init_sigpending(&sig->shared_pending);
876 INIT_LIST_HEAD(&sig->posix_timers); 874 INIT_LIST_HEAD(&sig->posix_timers);
877 875
878 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 876 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
879 sig->it_real_incr.tv64 = 0;
880 sig->real_timer.function = it_real_fn; 877 sig->real_timer.function = it_real_fn;
881 878
882 sig->leader = 0; /* session leadership doesn't inherit */
883 sig->tty_old_pgrp = NULL;
884 sig->tty = NULL;
885
886 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
887 sig->gtime = cputime_zero;
888 sig->cgtime = cputime_zero;
889#ifndef CONFIG_VIRT_CPU_ACCOUNTING
890 sig->prev_utime = sig->prev_stime = cputime_zero;
891#endif
892 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
893 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
894 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
895 sig->maxrss = sig->cmaxrss = 0;
896 task_io_accounting_init(&sig->ioac);
897 sig->sum_sched_runtime = 0;
898 taskstats_tgid_init(sig);
899
900 task_lock(current->group_leader); 879 task_lock(current->group_leader);
901 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 880 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
902 task_unlock(current->group_leader); 881 task_unlock(current->group_leader);
903 882
904 posix_cpu_timers_init_group(sig); 883 posix_cpu_timers_init_group(sig);
905 884
906 acct_init_pacct(&sig->pacct);
907
908 tty_audit_fork(sig); 885 tty_audit_fork(sig);
909 886
910 sig->oom_adj = current->signal->oom_adj; 887 sig->oom_adj = current->signal->oom_adj;
@@ -1033,7 +1010,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1033#endif 1010#endif
1034 retval = -EAGAIN; 1011 retval = -EAGAIN;
1035 if (atomic_read(&p->real_cred->user->processes) >= 1012 if (atomic_read(&p->real_cred->user->processes) >=
1036 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 1013 task_rlimit(p, RLIMIT_NPROC)) {
1037 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1014 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1038 p->real_cred->user != INIT_USER) 1015 p->real_cred->user != INIT_USER)
1039 goto bad_fork_free; 1016 goto bad_fork_free;
@@ -1075,6 +1052,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1075 p->prev_utime = cputime_zero; 1052 p->prev_utime = cputime_zero;
1076 p->prev_stime = cputime_zero; 1053 p->prev_stime = cputime_zero;
1077#endif 1054#endif
1055#if defined(SPLIT_RSS_COUNTING)
1056 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
1057#endif
1078 1058
1079 p->default_timer_slack_ns = current->timer_slack_ns; 1059 p->default_timer_slack_ns = current->timer_slack_ns;
1080 1060
@@ -1241,21 +1221,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1241 /* Need tasklist lock for parent etc handling! */ 1221 /* Need tasklist lock for parent etc handling! */
1242 write_lock_irq(&tasklist_lock); 1222 write_lock_irq(&tasklist_lock);
1243 1223
1244 /*
1245 * The task hasn't been attached yet, so its cpus_allowed mask will
1246 * not be changed, nor will its assigned CPU.
1247 *
1248 * The cpus_allowed mask of the parent may have changed after it was
1249 * copied first time - so re-copy it here, then check the child's CPU
1250 * to ensure it is on a valid CPU (and if not, just force it back to
1251 * parent's CPU). This avoids alot of nasty races.
1252 */
1253 p->cpus_allowed = current->cpus_allowed;
1254 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1255 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1256 !cpu_online(task_cpu(p))))
1257 set_task_cpu(p, smp_processor_id());
1258
1259 /* CLONE_PARENT re-uses the old parent */ 1224 /* CLONE_PARENT re-uses the old parent */
1260 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { 1225 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1261 p->real_parent = current->real_parent; 1226 p->real_parent = current->real_parent;
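
The dup_mmap() change adds a new unwind label, fail_nomem_anon_vma_fork, placed just above fail_nomem_policy so a failed anon_vma_fork() releases the mempolicy taken before it. Below is a minimal, self-contained sketch of that stacked-label pattern in plain userspace C; the struct and resource names are made up, only the shape of the error handling mirrors the hunk.

/* Each later failure jumps to a label that releases everything acquired
 * before it, in reverse order. */
#include <stdlib.h>

struct policy  { int weight; };
struct vma_copy { struct policy *pol; char *chain; };

struct vma_copy *dup_one(void)
{
        struct vma_copy *tmp;

        tmp = malloc(sizeof(*tmp));
        if (!tmp)
                goto fail_nomem;

        tmp->pol = malloc(sizeof(*tmp->pol));   /* stands in for mpol_dup()      */
        if (!tmp->pol)
                goto fail_nomem_policy;

        tmp->chain = malloc(32);                /* stands in for anon_vma_fork() */
        if (!tmp->chain)
                goto fail_nomem_chain;

        return tmp;

fail_nomem_chain:                               /* undo the policy allocation    */
        free(tmp->pol);
fail_nomem_policy:                              /* undo the vma allocation       */
        free(tmp);
fail_nomem:
        return NULL;
}
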
diff --git a/kernel/futex.c b/kernel/futex.c
index d9b3a2228f9d..e7a35f1039e7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -530,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
530 return -EINVAL; 530 return -EINVAL;
531 531
532 WARN_ON(!atomic_read(&pi_state->refcount)); 532 WARN_ON(!atomic_read(&pi_state->refcount));
533 WARN_ON(pid && pi_state->owner && 533
534 pi_state->owner->pid != pid); 534 /*
535 * When pi_state->owner is NULL then the owner died
536 * and another waiter is on the fly. pi_state->owner
537 * is fixed up by the task which acquires
538 * pi_state->rt_mutex.
539 *
540 * We do not check for pid == 0 which can happen when
541 * the owner died and robust_list_exit() cleared the
542 * TID.
543 */
544 if (pid && pi_state->owner) {
545 /*
546 * Bail out if user space manipulated the
547 * futex value.
548 */
549 if (pid != task_pid_vnr(pi_state->owner))
550 return -EINVAL;
551 }
535 552
536 atomic_inc(&pi_state->refcount); 553 atomic_inc(&pi_state->refcount);
537 *ps = pi_state; 554 *ps = pi_state;
@@ -758,6 +775,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
758 if (!pi_state) 775 if (!pi_state)
759 return -EINVAL; 776 return -EINVAL;
760 777
778 /*
779 * If current does not own the pi_state then the futex is
780 * inconsistent and user space fiddled with the futex value.
781 */
782 if (pi_state->owner != current)
783 return -EINVAL;
784
761 raw_spin_lock(&pi_state->pi_mutex.wait_lock); 785 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
762 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 786 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
763 787
@@ -1971,7 +1995,7 @@ retry_private:
1971 /* Unqueue and drop the lock */ 1995 /* Unqueue and drop the lock */
1972 unqueue_me_pi(&q); 1996 unqueue_me_pi(&q);
1973 1997
1974 goto out; 1998 goto out_put_key;
1975 1999
1976out_unlock_put_key: 2000out_unlock_put_key:
1977 queue_unlock(&q, hb); 2001 queue_unlock(&q, hb);
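
The lookup_pi_state() and wake_futex_pi() hunks share one idea: a TID that user space can corrupt is validated and rejected with -EINVAL instead of tripping a WARN_ON. The fragment below only illustrates that shape in userspace C; struct pi_record and validate_owner() are invented stand-ins, not futex code.

#include <errno.h>

struct pi_record { int owner_tid; };    /* stands in for pi_state->owner */

int validate_owner(const struct pi_record *rec, int tid_from_user)
{
        /* owner may legitimately be unset while ownership is handed over */
        if (!rec->owner_tid || !tid_from_user)
                return 0;

        /* user space manipulated the value: refuse, don't warn */
        if (tid_from_user != rec->owner_tid)
                return -EINVAL;

        return 0;
}
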
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 235716556bf1..d49afb2395e5 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -146,7 +146,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
146 struct task_struct *p; 146 struct task_struct *p;
147 147
148 ret = -ESRCH; 148 ret = -ESRCH;
149 read_lock(&tasklist_lock); 149 rcu_read_lock();
150 p = find_task_by_vpid(pid); 150 p = find_task_by_vpid(pid);
151 if (!p) 151 if (!p)
152 goto err_unlock; 152 goto err_unlock;
@@ -157,7 +157,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
157 !capable(CAP_SYS_PTRACE)) 157 !capable(CAP_SYS_PTRACE))
158 goto err_unlock; 158 goto err_unlock;
159 head = p->compat_robust_list; 159 head = p->compat_robust_list;
160 read_unlock(&tasklist_lock); 160 rcu_read_unlock();
161 } 161 }
162 162
163 if (put_user(sizeof(*head), len_ptr)) 163 if (put_user(sizeof(*head), len_ptr))
@@ -165,7 +165,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
165 return put_user(ptr_to_compat(head), head_ptr); 165 return put_user(ptr_to_compat(head), head_ptr);
166 166
167err_unlock: 167err_unlock:
168 read_unlock(&tasklist_lock); 168 rcu_read_unlock();
169 169
170 return ret; 170 return ret;
171} 171}
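
compat_sys_get_robust_list() now brackets the find_task_by_vpid() lookup with rcu_read_lock()/rcu_read_unlock() instead of taking tasklist_lock for reading. The same conversion can be sketched in user space, assuming the userspace RCU library (liburcu) is installed and linked with -lurcu; the task list and its fields here are my own stand-ins, only the locking pattern mirrors the hunk.

/* Read side marks an RCU critical section around the lookup instead of
 * contending on a reader-writer lock.  The calling thread is assumed to
 * have called rcu_register_thread() beforehand. */
#include <urcu.h>
#include <stddef.h>

struct task { int pid; struct task *next; };

static struct task *task_list;          /* published with rcu_assign_pointer() */

int task_exists(int pid)
{
        struct task *t;
        int found = 0;

        rcu_read_lock();                /* was: read_lock(&tasklist_lock)   */
        for (t = rcu_dereference(task_list); t; t = rcu_dereference(t->next))
                if (t->pid == pid) {
                        found = 1;
                        break;
                }
        rcu_read_unlock();              /* was: read_unlock(&tasklist_lock) */

        return found;
}
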
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index c030ae657f20..03808ed342a6 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -243,38 +243,70 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
243 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) 243 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
244 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM 244 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
245 */ 245 */
246int reserve_bp_slot(struct perf_event *bp) 246static int __reserve_bp_slot(struct perf_event *bp)
247{ 247{
248 struct bp_busy_slots slots = {0}; 248 struct bp_busy_slots slots = {0};
249 int ret = 0;
250
251 mutex_lock(&nr_bp_mutex);
252 249
253 fetch_bp_busy_slots(&slots, bp); 250 fetch_bp_busy_slots(&slots, bp);
254 251
255 /* Flexible counters need to keep at least one slot */ 252 /* Flexible counters need to keep at least one slot */
256 if (slots.pinned + (!!slots.flexible) == HBP_NUM) { 253 if (slots.pinned + (!!slots.flexible) == HBP_NUM)
257 ret = -ENOSPC; 254 return -ENOSPC;
258 goto end;
259 }
260 255
261 toggle_bp_slot(bp, true); 256 toggle_bp_slot(bp, true);
262 257
263end: 258 return 0;
259}
260
261int reserve_bp_slot(struct perf_event *bp)
262{
263 int ret;
264
265 mutex_lock(&nr_bp_mutex);
266
267 ret = __reserve_bp_slot(bp);
268
264 mutex_unlock(&nr_bp_mutex); 269 mutex_unlock(&nr_bp_mutex);
265 270
266 return ret; 271 return ret;
267} 272}
268 273
274static void __release_bp_slot(struct perf_event *bp)
275{
276 toggle_bp_slot(bp, false);
277}
278
269void release_bp_slot(struct perf_event *bp) 279void release_bp_slot(struct perf_event *bp)
270{ 280{
271 mutex_lock(&nr_bp_mutex); 281 mutex_lock(&nr_bp_mutex);
272 282
273 toggle_bp_slot(bp, false); 283 __release_bp_slot(bp);
274 284
275 mutex_unlock(&nr_bp_mutex); 285 mutex_unlock(&nr_bp_mutex);
276} 286}
277 287
288/*
289 * Allow the kernel debugger to reserve breakpoint slots without
290 * taking a lock using the dbg_* variant of for the reserve and
291 * release breakpoint slots.
292 */
293int dbg_reserve_bp_slot(struct perf_event *bp)
294{
295 if (mutex_is_locked(&nr_bp_mutex))
296 return -1;
297
298 return __reserve_bp_slot(bp);
299}
300
301int dbg_release_bp_slot(struct perf_event *bp)
302{
303 if (mutex_is_locked(&nr_bp_mutex))
304 return -1;
305
306 __release_bp_slot(bp);
307
308 return 0;
309}
278 310
279int register_perf_hw_breakpoint(struct perf_event *bp) 311int register_perf_hw_breakpoint(struct perf_event *bp)
280{ 312{
@@ -328,8 +360,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
328int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) 360int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
329{ 361{
330 u64 old_addr = bp->attr.bp_addr; 362 u64 old_addr = bp->attr.bp_addr;
363 u64 old_len = bp->attr.bp_len;
331 int old_type = bp->attr.bp_type; 364 int old_type = bp->attr.bp_type;
332 int old_len = bp->attr.bp_len;
333 int err = 0; 365 int err = 0;
334 366
335 perf_event_disable(bp); 367 perf_event_disable(bp);
@@ -381,17 +413,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
381 * 413 *
382 * @return a set of per_cpu pointers to perf events 414 * @return a set of per_cpu pointers to perf events
383 */ 415 */
384struct perf_event ** 416struct perf_event * __percpu *
385register_wide_hw_breakpoint(struct perf_event_attr *attr, 417register_wide_hw_breakpoint(struct perf_event_attr *attr,
386 perf_overflow_handler_t triggered) 418 perf_overflow_handler_t triggered)
387{ 419{
388 struct perf_event **cpu_events, **pevent, *bp; 420 struct perf_event * __percpu *cpu_events, **pevent, *bp;
389 long err; 421 long err;
390 int cpu; 422 int cpu;
391 423
392 cpu_events = alloc_percpu(typeof(*cpu_events)); 424 cpu_events = alloc_percpu(typeof(*cpu_events));
393 if (!cpu_events) 425 if (!cpu_events)
394 return ERR_PTR(-ENOMEM); 426 return (void __percpu __force *)ERR_PTR(-ENOMEM);
395 427
396 get_online_cpus(); 428 get_online_cpus();
397 for_each_online_cpu(cpu) { 429 for_each_online_cpu(cpu) {
@@ -419,7 +451,7 @@ fail:
419 put_online_cpus(); 451 put_online_cpus();
420 452
421 free_percpu(cpu_events); 453 free_percpu(cpu_events);
422 return ERR_PTR(err); 454 return (void __percpu __force *)ERR_PTR(err);
423} 455}
424EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 456EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
425 457
@@ -427,7 +459,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
427 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel 459 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
428 * @cpu_events: the per cpu set of events to unregister 460 * @cpu_events: the per cpu set of events to unregister
429 */ 461 */
430void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) 462void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
431{ 463{
432 int cpu; 464 int cpu;
433 struct perf_event **pevent; 465 struct perf_event **pevent;
@@ -457,5 +489,4 @@ struct pmu perf_ops_bp = {
457 .enable = arch_install_hw_breakpoint, 489 .enable = arch_install_hw_breakpoint,
458 .disable = arch_uninstall_hw_breakpoint, 490 .disable = arch_uninstall_hw_breakpoint,
459 .read = hw_breakpoint_pmu_read, 491 .read = hw_breakpoint_pmu_read,
460 .unthrottle = hw_breakpoint_pmu_unthrottle
461}; 492};
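
reserve_bp_slot()/release_bp_slot() are split into a lock-free core (__reserve_bp_slot/__release_bp_slot) plus locked wrappers, and the new dbg_* entry points refuse to touch the core when nr_bp_mutex is already held, because the debugger cannot sleep. The kernel variant only checks mutex_is_locked() since kgdb has already halted the other CPUs; the portable userspace sketch below uses trylock instead, and all names are illustrative.

#include <pthread.h>

static pthread_mutex_t slot_mutex = PTHREAD_MUTEX_INITIALIZER;
static int slots_used, slots_max = 4;

static int __reserve_slot(void)         /* caller must hold slot_mutex */
{
        if (slots_used >= slots_max)
                return -1;              /* -ENOSPC in the kernel code  */
        slots_used++;
        return 0;
}

int reserve_slot(void)                  /* normal, sleeping context    */
{
        int ret;

        pthread_mutex_lock(&slot_mutex);
        ret = __reserve_slot();
        pthread_mutex_unlock(&slot_mutex);
        return ret;
}

int dbg_reserve_slot(void)              /* non-sleeping debugger path  */
{
        int ret;

        if (pthread_mutex_trylock(&slot_mutex) != 0)
                return -1;              /* someone holds it: bail out  */
        ret = __reserve_slot();
        pthread_mutex_unlock(&slot_mutex);
        return ret;
}
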
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ecc3fa28f666..b7091d5ca2f8 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,11 +18,7 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22 * dynamic_irq_init - initialize a dynamically allocated irq
23 * @irq: irq number to initialize
24 */
25void dynamic_irq_init(unsigned int irq)
26{ 22{
27 struct irq_desc *desc; 23 struct irq_desc *desc;
28 unsigned long flags; 24 unsigned long flags;
@@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq)
41 desc->depth = 1; 37 desc->depth = 1;
42 desc->msi_desc = NULL; 38 desc->msi_desc = NULL;
43 desc->handler_data = NULL; 39 desc->handler_data = NULL;
44 desc->chip_data = NULL; 40 if (!keep_chip_data)
41 desc->chip_data = NULL;
45 desc->action = NULL; 42 desc->action = NULL;
46 desc->irq_count = 0; 43 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 44 desc->irqs_unhandled = 0;
@@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq)
55} 52}
56 53
57/** 54/**
58 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 55 * dynamic_irq_init - initialize a dynamically allocated irq
59 * @irq: irq number to initialize 56 * @irq: irq number to initialize
60 */ 57 */
61void dynamic_irq_cleanup(unsigned int irq) 58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
62{ 75{
63 struct irq_desc *desc = irq_to_desc(irq); 76 struct irq_desc *desc = irq_to_desc(irq);
64 unsigned long flags; 77 unsigned long flags;
@@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq)
77 } 90 }
78 desc->msi_desc = NULL; 91 desc->msi_desc = NULL;
79 desc->handler_data = NULL; 92 desc->handler_data = NULL;
80 desc->chip_data = NULL; 93 if (!keep_chip_data)
94 desc->chip_data = NULL;
81 desc->handle_irq = handle_bad_irq; 95 desc->handle_irq = handle_bad_irq;
82 desc->chip = &no_irq_chip; 96 desc->chip = &no_irq_chip;
83 desc->name = NULL; 97 desc->name = NULL;
@@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq)
85 raw_spin_unlock_irqrestore(&desc->lock, flags); 99 raw_spin_unlock_irqrestore(&desc->lock, flags);
86} 100}
87 101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to initialize
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to initialize
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
88 122
89/** 123/**
90 * set_irq_chip - set the irq chip for an irq 124 * set_irq_chip - set the irq chip for an irq
@@ -325,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
325 if (desc->chip->ack) 359 if (desc->chip->ack)
326 desc->chip->ack(irq); 360 desc->chip->ack(irq);
327 } 361 }
362 desc->status |= IRQ_MASKED;
363}
364
365static inline void mask_irq(struct irq_desc *desc, int irq)
366{
367 if (desc->chip->mask) {
368 desc->chip->mask(irq);
369 desc->status |= IRQ_MASKED;
370 }
371}
372
373static inline void unmask_irq(struct irq_desc *desc, int irq)
374{
375 if (desc->chip->unmask) {
376 desc->chip->unmask(irq);
377 desc->status &= ~IRQ_MASKED;
378 }
328} 379}
329 380
330/* 381/*
@@ -450,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
450 raw_spin_lock(&desc->lock); 501 raw_spin_lock(&desc->lock);
451 desc->status &= ~IRQ_INPROGRESS; 502 desc->status &= ~IRQ_INPROGRESS;
452 503
453 if (unlikely(desc->status & IRQ_ONESHOT)) 504 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
454 desc->status |= IRQ_MASKED; 505 unmask_irq(desc, irq);
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
456 desc->chip->unmask(irq);
457out_unlock: 506out_unlock:
458 raw_spin_unlock(&desc->lock); 507 raw_spin_unlock(&desc->lock);
459} 508}
@@ -490,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
490 action = desc->action; 539 action = desc->action;
491 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 540 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
492 desc->status |= IRQ_PENDING; 541 desc->status |= IRQ_PENDING;
493 if (desc->chip->mask) 542 mask_irq(desc, irq);
494 desc->chip->mask(irq);
495 goto out; 543 goto out;
496 } 544 }
497 545
@@ -520,7 +568,7 @@ out:
520 * signal. The occurence is latched into the irq controller hardware 568 * signal. The occurence is latched into the irq controller hardware
521 * and must be acked in order to be reenabled. After the ack another 569 * and must be acked in order to be reenabled. After the ack another
522 * interrupt can happen on the same source even before the first one 570 * interrupt can happen on the same source even before the first one
523 * is handled by the assosiacted event handler. If this happens it 571 * is handled by the associated event handler. If this happens it
524 * might be necessary to disable (mask) the interrupt depending on the 572 * might be necessary to disable (mask) the interrupt depending on the
525 * controller hardware. This requires to reenable the interrupt inside 573 * controller hardware. This requires to reenable the interrupt inside
526 * of the loop which handles the interrupts which have arrived while 574 * of the loop which handles the interrupts which have arrived while
@@ -559,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
559 irqreturn_t action_ret; 607 irqreturn_t action_ret;
560 608
561 if (unlikely(!action)) { 609 if (unlikely(!action)) {
562 desc->chip->mask(irq); 610 mask_irq(desc, irq);
563 goto out_unlock; 611 goto out_unlock;
564 } 612 }
565 613
@@ -571,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
571 if (unlikely((desc->status & 619 if (unlikely((desc->status &
572 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 620 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
573 (IRQ_PENDING | IRQ_MASKED))) { 621 (IRQ_PENDING | IRQ_MASKED))) {
574 desc->chip->unmask(irq); 622 unmask_irq(desc, irq);
575 desc->status &= ~IRQ_MASKED;
576 } 623 }
577 624
578 desc->status &= ~IRQ_PENDING; 625 desc->status &= ~IRQ_PENDING;
@@ -682,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
682 __set_irq_handler(irq, handle, 0, name); 729 __set_irq_handler(irq, handle, 0, name);
683} 730}
684 731
685void __init set_irq_noprobe(unsigned int irq) 732void set_irq_noprobe(unsigned int irq)
686{ 733{
687 struct irq_desc *desc = irq_to_desc(irq); 734 struct irq_desc *desc = irq_to_desc(irq);
688 unsigned long flags; 735 unsigned long flags;
@@ -697,7 +744,7 @@ void __init set_irq_noprobe(unsigned int irq)
697 raw_spin_unlock_irqrestore(&desc->lock, flags); 744 raw_spin_unlock_irqrestore(&desc->lock, flags);
698} 745}
699 746
700void __init set_irq_probe(unsigned int irq) 747void set_irq_probe(unsigned int irq)
701{ 748{
702 struct irq_desc *desc = irq_to_desc(irq); 749 struct irq_desc *desc = irq_to_desc(irq);
703 unsigned long flags; 750 unsigned long flags;
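
The point of the new mask_irq()/unmask_irq() helpers is that the software IRQ_MASKED bit and the chip callback are updated in one place, so the flow handlers can no longer call the chip and forget the flag (or vice versa). A minimal userspace illustration of that "flag follows hardware" helper pattern, with made-up types and a print standing in for the chip call:

#include <stdio.h>

#define LINE_MASKED 0x1

struct line_ops  { void (*mask)(int line); void (*unmask)(int line); };
struct line_desc { int line; unsigned int status; const struct line_ops *ops; };

static void mask_line(struct line_desc *desc)
{
        if (desc->ops->mask) {
                desc->ops->mask(desc->line);
                desc->status |= LINE_MASKED;    /* flag follows hardware */
        }
}

static void unmask_line(struct line_desc *desc)
{
        if (desc->ops->unmask) {
                desc->ops->unmask(desc->line);
                desc->status &= ~LINE_MASKED;
        }
}

static void hw_mask(int line)   { printf("mask %d\n", line); }
static void hw_unmask(int line) { printf("unmask %d\n", line); }

int main(void)
{
        static const struct line_ops ops = { hw_mask, hw_unmask };
        struct line_desc desc = { .line = 9, .status = 0, .ops = &ops };

        mask_line(&desc);
        unmask_line(&desc);
        return 0;
}
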
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d06df9c41cba..1ef4ffcdfa55 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
42 * automatically freed on driver detach. 42 * automatically freed on driver detach.
43 * 43 *
44 * If an IRQ allocated with this function needs to be freed 44 * If an IRQ allocated with this function needs to be freed
45 * separately, dev_free_irq() must be used. 45 * separately, devm_free_irq() must be used.
46 */ 46 */
47int devm_request_threaded_irq(struct device *dev, unsigned int irq, 47int devm_request_threaded_irq(struct device *dev, unsigned int irq,
48 irq_handler_t handler, irq_handler_t thread_fn, 48 irq_handler_t handler, irq_handler_t thread_fn,
@@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
81 * Except for the extra @dev argument, this function takes the 81 * Except for the extra @dev argument, this function takes the
82 * same arguments and performs the same function as free_irq(). 82 * same arguments and performs the same function as free_irq().
83 * This function instead of free_irq() should be used to manually 83 * This function instead of free_irq() should be used to manually
84 * free IRQs allocated with dev_request_irq(). 84 * free IRQs allocated with devm_request_irq().
85 */ 85 */
86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) 86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
87{ 87{
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 814940e7f485..76d5a671bfe1 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -19,7 +19,7 @@
19#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 20#include <linux/rculist.h>
21#include <linux/hash.h> 21#include <linux/hash.h>
22#include <linux/bootmem.h> 22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 23#include <trace/events/irq.h>
24 24
25#include "internals.h" 25#include "internals.h"
@@ -87,12 +87,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{ 87{
88 void *ptr; 88 void *ptr;
89 89
90 if (slab_is_available()) 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), 91 GFP_ATOMIC, node);
92 GFP_ATOMIC, node);
93 else
94 ptr = alloc_bootmem_node(NODE_DATA(node),
95 nr * sizeof(*desc->kstat_irqs));
96 92
97 /* 93 /*
98 * don't overwite if can not get new one 94 * don't overwite if can not get new one
@@ -132,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
132 */ 128 */
133DEFINE_RAW_SPINLOCK(sparse_irq_lock); 129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
134 130
135struct irq_desc **irq_desc_ptrs __read_mostly; 131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
136 151
137static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { 152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
138 [0 ... NR_IRQS_LEGACY-1] = { 153 [0 ... NR_IRQS_LEGACY-1] = {
@@ -164,9 +179,6 @@ int __init early_irq_init(void)
164 legacy_count = ARRAY_SIZE(irq_desc_legacy); 179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
165 node = first_online_node; 180 node = first_online_node;
166 181
167 /* allocate irq_desc_ptrs array based on nr_irqs */
168 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
169
170 /* allocate based on nr_cpu_ids */ 182 /* allocate based on nr_cpu_ids */
171 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * 183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
172 sizeof(int), GFP_NOWAIT, node); 184 sizeof(int), GFP_NOWAIT, node);
@@ -180,23 +192,12 @@ int __init early_irq_init(void)
180 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
181 alloc_desc_masks(&desc[i], node, true); 193 alloc_desc_masks(&desc[i], node, true);
182 init_desc_masks(&desc[i]); 194 init_desc_masks(&desc[i]);
183 irq_desc_ptrs[i] = desc + i; 195 set_irq_desc(i, &desc[i]);
184 } 196 }
185 197
186 for (i = legacy_count; i < nr_irqs; i++)
187 irq_desc_ptrs[i] = NULL;
188
189 return arch_early_irq_init(); 198 return arch_early_irq_init();
190} 199}
191 200
192struct irq_desc *irq_to_desc(unsigned int irq)
193{
194 if (irq_desc_ptrs && irq < nr_irqs)
195 return irq_desc_ptrs[irq];
196
197 return NULL;
198}
199
200struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) 201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
201{ 202{
202 struct irq_desc *desc; 203 struct irq_desc *desc;
@@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
208 return NULL; 209 return NULL;
209 } 210 }
210 211
211 desc = irq_desc_ptrs[irq]; 212 desc = irq_to_desc(irq);
212 if (desc) 213 if (desc)
213 return desc; 214 return desc;
214 215
215 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
216 217
217 /* We have to check it to avoid races with another CPU */ 218 /* We have to check it to avoid races with another CPU */
218 desc = irq_desc_ptrs[irq]; 219 desc = irq_to_desc(irq);
219 if (desc) 220 if (desc)
220 goto out_unlock; 221 goto out_unlock;
221 222
222 if (slab_is_available()) 223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224 else
225 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
226 224
227 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); 225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
228 if (!desc) { 226 if (!desc) {
@@ -231,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
231 } 229 }
232 init_one_irq_desc(irq, desc, node); 230 init_one_irq_desc(irq, desc, node);
233 231
234 irq_desc_ptrs[irq] = desc; 232 set_irq_desc(irq, desc);
235 233
236out_unlock: 234out_unlock:
237 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
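
The interface this hunk converges on is that every user goes through set_irq_desc()/irq_to_desc()/replace_irq_desc() and never sees the backing store, which lets the kernel swap the dense irq_desc_ptrs pointer array for a radix tree without touching callers. The sketch below keeps a tiny dense table only so it stays self-contained; the comments note which radix-tree call each helper replaces, and all names are mine.

#include <stddef.h>

struct desc { int nr; };

#define MAX_DESC 64
static struct desc *desc_table[MAX_DESC];       /* kernel: RADIX_TREE()          */

void set_desc(unsigned int nr, struct desc *d)
{
        if (nr < MAX_DESC)
                desc_table[nr] = d;             /* kernel: radix_tree_insert()   */
}

struct desc *nr_to_desc(unsigned int nr)
{
        return nr < MAX_DESC ? desc_table[nr] : NULL;  /* radix_tree_lookup()    */
}

void replace_desc(unsigned int nr, struct desc *d)
{
        if (nr < MAX_DESC && desc_table[nr])
                desc_table[nr] = d;             /* radix_tree_replace_slot()     */
}
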
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b2821f070a3d..c63f3bc88f0b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24void replace_irq_desc(unsigned int irq, struct irq_desc *desc);
25extern struct irq_desc **irq_desc_ptrs;
26#else
27/* irq_desc_ptrs is a fixed size array */
28extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
29#endif 25#endif
30 26
31#ifdef CONFIG_PROC_FS 27#ifdef CONFIG_PROC_FS
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index eb6078ca60c7..704e488730a5 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -382,6 +382,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
382{ 382{
383 struct irq_desc *desc = irq_to_desc(irq); 383 struct irq_desc *desc = irq_to_desc(irq);
384 struct irqaction *action; 384 struct irqaction *action;
385 unsigned long flags;
385 386
386 if (!desc) 387 if (!desc)
387 return 0; 388 return 0;
@@ -389,11 +390,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
389 if (desc->status & IRQ_NOREQUEST) 390 if (desc->status & IRQ_NOREQUEST)
390 return 0; 391 return 0;
391 392
393 raw_spin_lock_irqsave(&desc->lock, flags);
392 action = desc->action; 394 action = desc->action;
393 if (action) 395 if (action)
394 if (irqflags & action->flags & IRQF_SHARED) 396 if (irqflags & action->flags & IRQF_SHARED)
395 action = NULL; 397 action = NULL;
396 398
399 raw_spin_unlock_irqrestore(&desc->lock, flags);
400
397 return !action; 401 return !action;
398} 402}
399 403
@@ -483,8 +487,26 @@ static int irq_wait_for_interrupt(struct irqaction *action)
483 */ 487 */
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 488static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{ 489{
490again:
486 chip_bus_lock(irq, desc); 491 chip_bus_lock(irq, desc);
487 raw_spin_lock_irq(&desc->lock); 492 raw_spin_lock_irq(&desc->lock);
493
494 /*
495 * Implausible though it may be we need to protect us against
496 * the following scenario:
497 *
498 * The thread is faster done than the hard interrupt handler
499 * on the other CPU. If we unmask the irq line then the
500 * interrupt can come in again and masks the line, leaves due
501 * to IRQ_INPROGRESS and the irq line is masked forever.
502 */
503 if (unlikely(desc->status & IRQ_INPROGRESS)) {
504 raw_spin_unlock_irq(&desc->lock);
505 chip_bus_sync_unlock(irq, desc);
506 cpu_relax();
507 goto again;
508 }
509
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 510 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED; 511 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq); 512 desc->chip->unmask(irq);
@@ -735,6 +757,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
735 if (new->flags & IRQF_ONESHOT) 757 if (new->flags & IRQF_ONESHOT)
736 desc->status |= IRQ_ONESHOT; 758 desc->status |= IRQ_ONESHOT;
737 759
760 /*
761 * Force MSI interrupts to run with interrupts
762 * disabled. The multi vector cards can cause stack
763 * overflows due to nested interrupts when enough of
764 * them are directed to a core and fire at the same
765 * time.
766 */
767 if (desc->msi_desc)
768 new->flags |= IRQF_DISABLED;
769
738 if (!(desc->status & IRQ_NOAUTOEN)) { 770 if (!(desc->status & IRQ_NOAUTOEN)) {
739 desc->depth = 0; 771 desc->depth = 0;
740 desc->status &= ~IRQ_DISABLED; 772 desc->status &= ~IRQ_DISABLED;
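
The new loop in irq_finalize_oneshot() checks whether the hard handler is still IRQ_INPROGRESS on another CPU before unmasking; if so it drops the locks, relaxes, and retries rather than unmasking underneath it. Below is a compilable userspace rendering of that back-off-and-retry shape with a pthread mutex; the flag names and sched_yield() stand-ins are my own.

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>

static pthread_mutex_t desc_lock = PTHREAD_MUTEX_INITIALIZER;
static bool in_progress;        /* set by the "hard handler" under desc_lock */
static bool masked = true;

void finalize_oneshot(void)
{
again:
        pthread_mutex_lock(&desc_lock);
        if (in_progress) {
                /* handler still running elsewhere: back off and retry */
                pthread_mutex_unlock(&desc_lock);
                sched_yield();                  /* cpu_relax() in the kernel */
                goto again;
        }
        if (masked)
                masked = false;                 /* safe to unmask now */
        pthread_mutex_unlock(&desc_lock);
}
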
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 26bac9d8f860..65d3845665ac 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -6,6 +6,7 @@
6 */ 6 */
7 7
8#include <linux/irq.h> 8#include <linux/irq.h>
9#include <linux/slab.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/random.h> 11#include <linux/random.h>
11#include <linux/interrupt.h> 12#include <linux/interrupt.h>
@@ -70,7 +71,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
70 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 71 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 72
72 /* We have to check it to avoid races with another CPU */ 73 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 74 desc = irq_to_desc(irq);
74 75
75 if (desc && old_desc != desc) 76 if (desc && old_desc != desc)
76 goto out_unlock; 77 goto out_unlock;
@@ -90,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
90 goto out_unlock; 91 goto out_unlock;
91 } 92 }
92 93
93 irq_desc_ptrs[irq] = desc; 94 replace_irq_desc(irq, desc);
94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 95 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 96
96 /* free the old one */ 97 /* free the old one */
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6f50eccc79c0..7a6eb04ef6b5 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/gfp.h>
10#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
11#include <linux/seq_file.h> 12#include <linux/seq_file.h>
12#include <linux/interrupt.h> 13#include <linux/interrupt.h>
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8e5288a8a355..13aff293f4de 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -21,6 +21,7 @@
21#include <linux/sched.h> /* for cond_resched */ 21#include <linux/sched.h> /* for cond_resched */
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/ctype.h> 23#include <linux/ctype.h>
24#include <linux/slab.h>
24 25
25#include <asm/sections.h> 26#include <asm/sections.h>
26 27
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ef077fb73155..87ebe8adc474 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -41,7 +41,7 @@
41#include <asm/sections.h> 41#include <asm/sections.h>
42 42
43/* Per cpu memory for storing cpu states in case of system crash. */ 43/* Per cpu memory for storing cpu states in case of system crash. */
44note_buf_t* crash_notes; 44note_buf_t __percpu *crash_notes;
45 45
46/* vmcoreinfo stuff */ 46/* vmcoreinfo stuff */
47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 32c5c15d750d..35edbe22e9a9 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
80 80
81 buffer = kmalloc(size, gfp_mask); 81 buffer = kmalloc(size, gfp_mask);
82 if (!buffer) { 82 if (!buffer) {
83 _kfifo_init(fifo, 0, 0); 83 _kfifo_init(fifo, NULL, 0);
84 return -ENOMEM; 84 return -ENOMEM;
85 } 85 }
86 86
@@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc);
97void kfifo_free(struct kfifo *fifo) 97void kfifo_free(struct kfifo *fifo)
98{ 98{
99 kfree(fifo->buffer); 99 kfree(fifo->buffer);
100 _kfifo_init(fifo, NULL, 0);
100} 101}
101EXPORT_SYMBOL(kfifo_free); 102EXPORT_SYMBOL(kfifo_free);
102 103
@@ -349,6 +350,7 @@ EXPORT_SYMBOL(__kfifo_from_user_n);
349 * @fifo: the fifo to be used. 350 * @fifo: the fifo to be used.
350 * @from: pointer to the data to be added. 351 * @from: pointer to the data to be added.
351 * @len: the length of the data to be added. 352 * @len: the length of the data to be added.
353 * @total: the actual returned data length.
352 * 354 *
353 * This function copies at most @len bytes from the @from into the 355 * This function copies at most @len bytes from the @from into the
354 * FIFO depending and returns -EFAULT/0. 356 * FIFO depending and returns -EFAULT/0.
@@ -399,7 +401,7 @@ EXPORT_SYMBOL(__kfifo_to_user_n);
399 * @fifo: the fifo to be used. 401 * @fifo: the fifo to be used.
400 * @to: where the data must be copied. 402 * @to: where the data must be copied.
401 * @len: the size of the destination buffer. 403 * @len: the size of the destination buffer.
402 @ @lenout: pointer to output variable with copied data 404 * @lenout: pointer to output variable with copied data
403 * 405 *
404 * This function copies at most @len bytes from the FIFO into the 406 * This function copies at most @len bytes from the FIFO into the
405 * @to buffer and 0 or -EFAULT. 407 * @to buffer and 0 or -EFAULT.
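
What the kfifo_free() change buys: after freeing the backing buffer the fifo is re-initialised to an empty NULL state, so accidental use after free shows up as "fifo empty" rather than a dangling-pointer access, and the allocation failure path leaves the same well-defined state. A minimal stand-alone ring buffer showing the same reset-on-free discipline, with names of my own:

#include <stdlib.h>

struct ring {
        unsigned char *buffer;
        unsigned int size, in, out;
};

static void ring_init(struct ring *r, unsigned char *buf, unsigned int size)
{
        r->buffer = buf;
        r->size = size;
        r->in = r->out = 0;
}

int ring_alloc(struct ring *r, unsigned int size)
{
        unsigned char *buf = malloc(size);

        if (!buf) {
                ring_init(r, NULL, 0);  /* leave a well-defined empty fifo */
                return -1;
        }
        ring_init(r, buf, size);
        return 0;
}

void ring_free(struct ring *r)
{
        free(r->buffer);
        ring_init(r, NULL, 0);          /* was: buffer pointer left dangling */
}
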
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 2eb517e23514..11f3515ca83f 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -69,9 +69,16 @@ struct kgdb_state {
69 struct pt_regs *linux_regs; 69 struct pt_regs *linux_regs;
70}; 70};
71 71
72/* Exception state values */
73#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
74#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
75#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
76#define DCPU_SSTEP 0x8 /* CPU is single stepping */
77
72static struct debuggerinfo_struct { 78static struct debuggerinfo_struct {
73 void *debuggerinfo; 79 void *debuggerinfo;
74 struct task_struct *task; 80 struct task_struct *task;
81 int exception_state;
75} kgdb_info[NR_CPUS]; 82} kgdb_info[NR_CPUS];
76 83
77/** 84/**
@@ -391,27 +398,22 @@ int kgdb_mem2hex(char *mem, char *buf, int count)
391 398
392/* 399/*
393 * Copy the binary array pointed to by buf into mem. Fix $, #, and 400 * Copy the binary array pointed to by buf into mem. Fix $, #, and
394 * 0x7d escaped with 0x7d. Return a pointer to the character after 401 * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
395 * the last byte written. 402 * The input buf is overwitten with the result to write to mem.
396 */ 403 */
397static int kgdb_ebin2mem(char *buf, char *mem, int count) 404static int kgdb_ebin2mem(char *buf, char *mem, int count)
398{ 405{
399 int err = 0; 406 int size = 0;
400 char c; 407 char *c = buf;
401 408
402 while (count-- > 0) { 409 while (count-- > 0) {
403 c = *buf++; 410 c[size] = *buf++;
404 if (c == 0x7d) 411 if (c[size] == 0x7d)
405 c = *buf++ ^ 0x20; 412 c[size] = *buf++ ^ 0x20;
406 413 size++;
407 err = probe_kernel_write(mem, &c, 1);
408 if (err)
409 break;
410
411 mem++;
412 } 414 }
413 415
414 return err; 416 return probe_kernel_write(mem, c, size);
415} 417}
416 418
417/* 419/*
@@ -563,46 +565,6 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
563} 565}
564 566
565/* 567/*
566 * CPU debug state control:
567 */
568
569#ifdef CONFIG_SMP
570static void kgdb_wait(struct pt_regs *regs)
571{
572 unsigned long flags;
573 int cpu;
574
575 local_irq_save(flags);
576 cpu = raw_smp_processor_id();
577 kgdb_info[cpu].debuggerinfo = regs;
578 kgdb_info[cpu].task = current;
579 /*
580 * Make sure the above info reaches the primary CPU before
581 * our cpu_in_kgdb[] flag setting does:
582 */
583 smp_wmb();
584 atomic_set(&cpu_in_kgdb[cpu], 1);
585
586 /* Wait till primary CPU is done with debugging */
587 while (atomic_read(&passive_cpu_wait[cpu]))
588 cpu_relax();
589
590 kgdb_info[cpu].debuggerinfo = NULL;
591 kgdb_info[cpu].task = NULL;
592
593 /* fix up hardware debug registers on local cpu */
594 if (arch_kgdb_ops.correct_hw_break)
595 arch_kgdb_ops.correct_hw_break();
596
597 /* Signal the primary CPU that we are done: */
598 atomic_set(&cpu_in_kgdb[cpu], 0);
599 touch_softlockup_watchdog();
600 clocksource_touch_watchdog();
601 local_irq_restore(flags);
602}
603#endif
604
605/*
606 * Some architectures need cache flushes when we set/clear a 568 * Some architectures need cache flushes when we set/clear a
607 * breakpoint: 569 * breakpoint:
608 */ 570 */
@@ -1397,34 +1359,13 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
1397 return 1; 1359 return 1;
1398} 1360}
1399 1361
1400/* 1362static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
1401 * kgdb_handle_exception() - main entry point from a kernel exception
1402 *
1403 * Locking hierarchy:
1404 * interface locks, if any (begin_session)
1405 * kgdb lock (kgdb_active)
1406 */
1407int
1408kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1409{ 1363{
1410 struct kgdb_state kgdb_var;
1411 struct kgdb_state *ks = &kgdb_var;
1412 unsigned long flags; 1364 unsigned long flags;
1413 int sstep_tries = 100; 1365 int sstep_tries = 100;
1414 int error = 0; 1366 int error = 0;
1415 int i, cpu; 1367 int i, cpu;
1416 1368 int trace_on = 0;
1417 ks->cpu = raw_smp_processor_id();
1418 ks->ex_vector = evector;
1419 ks->signo = signo;
1420 ks->ex_vector = evector;
1421 ks->err_code = ecode;
1422 ks->kgdb_usethreadid = 0;
1423 ks->linux_regs = regs;
1424
1425 if (kgdb_reenter_check(ks))
1426 return 0; /* Ouch, double exception ! */
1427
1428acquirelock: 1369acquirelock:
1429 /* 1370 /*
1430 * Interrupts will be restored by the 'trap return' code, except when 1371 * Interrupts will be restored by the 'trap return' code, except when
@@ -1432,13 +1373,43 @@ acquirelock:
1432 */ 1373 */
1433 local_irq_save(flags); 1374 local_irq_save(flags);
1434 1375
1435 cpu = raw_smp_processor_id(); 1376 cpu = ks->cpu;
1377 kgdb_info[cpu].debuggerinfo = regs;
1378 kgdb_info[cpu].task = current;
1379 /*
1380 * Make sure the above info reaches the primary CPU before
1381 * our cpu_in_kgdb[] flag setting does:
1382 */
1383 atomic_inc(&cpu_in_kgdb[cpu]);
1436 1384
1437 /* 1385 /*
1438 * Acquire the kgdb_active lock: 1386 * CPU will loop if it is a slave or request to become a kgdb
1387 * master cpu and acquire the kgdb_active lock:
1439 */ 1388 */
1440 while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1) 1389 while (1) {
1390 if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
1391 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
1392 break;
1393 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
1394 if (!atomic_read(&passive_cpu_wait[cpu]))
1395 goto return_normal;
1396 } else {
1397return_normal:
1398 /* Return to normal operation by executing any
1399 * hw breakpoint fixup.
1400 */
1401 if (arch_kgdb_ops.correct_hw_break)
1402 arch_kgdb_ops.correct_hw_break();
1403 if (trace_on)
1404 tracing_on();
1405 atomic_dec(&cpu_in_kgdb[cpu]);
1406 touch_softlockup_watchdog_sync();
1407 clocksource_touch_watchdog();
1408 local_irq_restore(flags);
1409 return 0;
1410 }
1441 cpu_relax(); 1411 cpu_relax();
1412 }
1442 1413
1443 /* 1414 /*
1444 * For single stepping, try to only enter on the processor 1415 * For single stepping, try to only enter on the processor
@@ -1450,7 +1421,7 @@ acquirelock:
1450 (kgdb_info[cpu].task && 1421 (kgdb_info[cpu].task &&
1451 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { 1422 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1452 atomic_set(&kgdb_active, -1); 1423 atomic_set(&kgdb_active, -1);
1453 touch_softlockup_watchdog(); 1424 touch_softlockup_watchdog_sync();
1454 clocksource_touch_watchdog(); 1425 clocksource_touch_watchdog();
1455 local_irq_restore(flags); 1426 local_irq_restore(flags);
1456 1427
@@ -1472,9 +1443,6 @@ acquirelock:
1472 if (kgdb_io_ops->pre_exception) 1443 if (kgdb_io_ops->pre_exception)
1473 kgdb_io_ops->pre_exception(); 1444 kgdb_io_ops->pre_exception();
1474 1445
1475 kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs;
1476 kgdb_info[ks->cpu].task = current;
1477
1478 kgdb_disable_hw_debug(ks->linux_regs); 1446 kgdb_disable_hw_debug(ks->linux_regs);
1479 1447
1480 /* 1448 /*
@@ -1483,15 +1451,9 @@ acquirelock:
1483 */ 1451 */
1484 if (!kgdb_single_step) { 1452 if (!kgdb_single_step) {
1485 for (i = 0; i < NR_CPUS; i++) 1453 for (i = 0; i < NR_CPUS; i++)
1486 atomic_set(&passive_cpu_wait[i], 1); 1454 atomic_inc(&passive_cpu_wait[i]);
1487 } 1455 }
1488 1456
1489 /*
1490 * spin_lock code is good enough as a barrier so we don't
1491 * need one here:
1492 */
1493 atomic_set(&cpu_in_kgdb[ks->cpu], 1);
1494
1495#ifdef CONFIG_SMP 1457#ifdef CONFIG_SMP
1496 /* Signal the other CPUs to enter kgdb_wait() */ 1458 /* Signal the other CPUs to enter kgdb_wait() */
1497 if ((!kgdb_single_step) && kgdb_do_roundup) 1459 if ((!kgdb_single_step) && kgdb_do_roundup)
@@ -1515,6 +1477,9 @@ acquirelock:
1515 kgdb_single_step = 0; 1477 kgdb_single_step = 0;
1516 kgdb_contthread = current; 1478 kgdb_contthread = current;
1517 exception_level = 0; 1479 exception_level = 0;
1480 trace_on = tracing_is_on();
1481 if (trace_on)
1482 tracing_off();
1518 1483
1519 /* Talk to debugger with gdbserial protocol */ 1484 /* Talk to debugger with gdbserial protocol */
1520 error = gdb_serial_stub(ks); 1485 error = gdb_serial_stub(ks);
@@ -1523,13 +1488,11 @@ acquirelock:
1523 if (kgdb_io_ops->post_exception) 1488 if (kgdb_io_ops->post_exception)
1524 kgdb_io_ops->post_exception(); 1489 kgdb_io_ops->post_exception();
1525 1490
1526 kgdb_info[ks->cpu].debuggerinfo = NULL; 1491 atomic_dec(&cpu_in_kgdb[ks->cpu]);
1527 kgdb_info[ks->cpu].task = NULL;
1528 atomic_set(&cpu_in_kgdb[ks->cpu], 0);
1529 1492
1530 if (!kgdb_single_step) { 1493 if (!kgdb_single_step) {
1531 for (i = NR_CPUS-1; i >= 0; i--) 1494 for (i = NR_CPUS-1; i >= 0; i--)
1532 atomic_set(&passive_cpu_wait[i], 0); 1495 atomic_dec(&passive_cpu_wait[i]);
1533 /* 1496 /*
1534 * Wait till all the CPUs have quit 1497 * Wait till all the CPUs have quit
1535 * from the debugger. 1498 * from the debugger.
@@ -1548,22 +1511,63 @@ kgdb_restore:
1548 else 1511 else
1549 kgdb_sstep_pid = 0; 1512 kgdb_sstep_pid = 0;
1550 } 1513 }
1514 if (trace_on)
1515 tracing_on();
1551 /* Free kgdb_active */ 1516 /* Free kgdb_active */
1552 atomic_set(&kgdb_active, -1); 1517 atomic_set(&kgdb_active, -1);
1553 touch_softlockup_watchdog(); 1518 touch_softlockup_watchdog_sync();
1554 clocksource_touch_watchdog(); 1519 clocksource_touch_watchdog();
1555 local_irq_restore(flags); 1520 local_irq_restore(flags);
1556 1521
1557 return error; 1522 return error;
1558} 1523}
1559 1524
1525/*
1526 * kgdb_handle_exception() - main entry point from a kernel exception
1527 *
1528 * Locking hierarchy:
1529 * interface locks, if any (begin_session)
1530 * kgdb lock (kgdb_active)
1531 */
1532int
1533kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1534{
1535 struct kgdb_state kgdb_var;
1536 struct kgdb_state *ks = &kgdb_var;
1537 int ret;
1538
1539 ks->cpu = raw_smp_processor_id();
1540 ks->ex_vector = evector;
1541 ks->signo = signo;
1542 ks->ex_vector = evector;
1543 ks->err_code = ecode;
1544 ks->kgdb_usethreadid = 0;
1545 ks->linux_regs = regs;
1546
1547 if (kgdb_reenter_check(ks))
1548 return 0; /* Ouch, double exception ! */
1549 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
1550 ret = kgdb_cpu_enter(ks, regs);
1551 kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER;
1552 return ret;
1553}
1554
1560int kgdb_nmicallback(int cpu, void *regs) 1555int kgdb_nmicallback(int cpu, void *regs)
1561{ 1556{
1562#ifdef CONFIG_SMP 1557#ifdef CONFIG_SMP
1558 struct kgdb_state kgdb_var;
1559 struct kgdb_state *ks = &kgdb_var;
1560
1561 memset(ks, 0, sizeof(struct kgdb_state));
1562 ks->cpu = cpu;
1563 ks->linux_regs = regs;
1564
1563 if (!atomic_read(&cpu_in_kgdb[cpu]) && 1565 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
1564 atomic_read(&kgdb_active) != cpu && 1566 atomic_read(&kgdb_active) != -1 &&
1565 atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) { 1567 atomic_read(&kgdb_active) != cpu) {
1566 kgdb_wait((struct pt_regs *)regs); 1568 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
1569 kgdb_cpu_enter(ks, regs);
1570 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
1567 return 0; 1571 return 0;
1568 } 1572 }
1569#endif 1573#endif
@@ -1739,11 +1743,11 @@ EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
1739 */ 1743 */
1740void kgdb_breakpoint(void) 1744void kgdb_breakpoint(void)
1741{ 1745{
1742 atomic_set(&kgdb_setting_breakpoint, 1); 1746 atomic_inc(&kgdb_setting_breakpoint);
1743 wmb(); /* Sync point before breakpoint */ 1747 wmb(); /* Sync point before breakpoint */
1744 arch_kgdb_breakpoint(); 1748 arch_kgdb_breakpoint();
1745 wmb(); /* Sync point after breakpoint */ 1749 wmb(); /* Sync point after breakpoint */
1746 atomic_set(&kgdb_setting_breakpoint, 0); 1750 atomic_dec(&kgdb_setting_breakpoint);
1747} 1751}
1748EXPORT_SYMBOL_GPL(kgdb_breakpoint); 1752EXPORT_SYMBOL_GPL(kgdb_breakpoint);
1749 1753
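
The core of the kgdb rework is kgdb_cpu_enter(): every CPU that wants to be master (DCPU_WANT_MASTER) tries to swing kgdb_active from -1 to its own id with atomic_cmpxchg, exactly one wins, and slaves spin until they are released. The program below is a simplified, self-contained analogue of that election with C11 atomics and pthreads, not kgdb itself; thread ids stand in for CPUs and the printf stands in for talking to the debugger.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <sched.h>

#define NCPUS 4

static atomic_int active = ATOMIC_VAR_INIT(-1); /* kgdb_active analogue */

static void *enter_debugger(void *arg)
{
        int cpu = (int)(long)arg;
        int expected = -1;

        /* Loop until we swing active from -1 to our id. */
        while (!atomic_compare_exchange_weak(&active, &expected, cpu)) {
                expected = -1;          /* cmpxchg wrote the current owner back */
                sched_yield();          /* cpu_relax() in the kernel            */
        }

        printf("cpu %d is master\n", cpu);      /* gdb_serial_stub() would run here */

        atomic_store(&active, -1);      /* free kgdb_active */
        return NULL;
}

int main(void)
{
        pthread_t t[NCPUS];

        for (long i = 0; i < NCPUS; i++)
                pthread_create(&t[i], NULL, enter_debugger, (void *)i);
        for (int i = 0; i < NCPUS; i++)
                pthread_join(t[i], NULL);
        return 0;
}
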
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ccec774c716d..0ed46f3e51e9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -42,9 +42,11 @@
42#include <linux/freezer.h> 42#include <linux/freezer.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/sysctl.h>
45#include <linux/kdebug.h> 46#include <linux/kdebug.h>
46#include <linux/memory.h> 47#include <linux/memory.h>
47#include <linux/ftrace.h> 48#include <linux/ftrace.h>
49#include <linux/cpu.h>
48 50
49#include <asm-generic/sections.h> 51#include <asm-generic/sections.h>
50#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
@@ -105,57 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
105 * stepping on the instruction on a vmalloced/kmalloced/data page 107 * stepping on the instruction on a vmalloced/kmalloced/data page
106 * is a recipe for disaster 108 * is a recipe for disaster
107 */ 109 */
108#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
109
110struct kprobe_insn_page { 110struct kprobe_insn_page {
111 struct list_head list; 111 struct list_head list;
112 kprobe_opcode_t *insns; /* Page of instruction slots */ 112 kprobe_opcode_t *insns; /* Page of instruction slots */
113 char slot_used[INSNS_PER_PAGE];
114 int nused; 113 int nused;
115 int ngarbage; 114 int ngarbage;
115 char slot_used[];
116};
117
118#define KPROBE_INSN_PAGE_SIZE(slots) \
119 (offsetof(struct kprobe_insn_page, slot_used) + \
120 (sizeof(char) * (slots)))
121
122struct kprobe_insn_cache {
123 struct list_head pages; /* list of kprobe_insn_page */
124 size_t insn_size; /* size of instruction slot */
125 int nr_garbage;
116}; 126};
117 127
128static int slots_per_page(struct kprobe_insn_cache *c)
129{
130 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
131}
132
118enum kprobe_slot_state { 133enum kprobe_slot_state {
119 SLOT_CLEAN = 0, 134 SLOT_CLEAN = 0,
120 SLOT_DIRTY = 1, 135 SLOT_DIRTY = 1,
121 SLOT_USED = 2, 136 SLOT_USED = 2,
122}; 137};
123 138
124static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 139static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */
125static LIST_HEAD(kprobe_insn_pages); 140static struct kprobe_insn_cache kprobe_insn_slots = {
126static int kprobe_garbage_slots; 141 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
127static int collect_garbage_slots(void); 142 .insn_size = MAX_INSN_SIZE,
143 .nr_garbage = 0,
144};
145static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
128 146
129/** 147/**
130 * __get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
131 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
132 */ 150 */
133static kprobe_opcode_t __kprobes *__get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
134{ 152{
135 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
136 154
137 retry: 155 retry:
138 list_for_each_entry(kip, &kprobe_insn_pages, list) { 156 list_for_each_entry(kip, &c->pages, list) {
139 if (kip->nused < INSNS_PER_PAGE) { 157 if (kip->nused < slots_per_page(c)) {
140 int i; 158 int i;
141 for (i = 0; i < INSNS_PER_PAGE; i++) { 159 for (i = 0; i < slots_per_page(c); i++) {
142 if (kip->slot_used[i] == SLOT_CLEAN) { 160 if (kip->slot_used[i] == SLOT_CLEAN) {
143 kip->slot_used[i] = SLOT_USED; 161 kip->slot_used[i] = SLOT_USED;
144 kip->nused++; 162 kip->nused++;
145 return kip->insns + (i * MAX_INSN_SIZE); 163 return kip->insns + (i * c->insn_size);
146 } 164 }
147 } 165 }
148 /* Surprise! No unused slots. Fix kip->nused. */ 166 /* kip->nused is broken. Fix it. */
149 kip->nused = INSNS_PER_PAGE; 167 kip->nused = slots_per_page(c);
168 WARN_ON(1);
150 } 169 }
151 } 170 }
152 171
153 /* If there are any garbage slots, collect it and try again. */ 172 /* If there are any garbage slots, collect it and try again. */
154 if (kprobe_garbage_slots && collect_garbage_slots() == 0) { 173 if (c->nr_garbage && collect_garbage_slots(c) == 0)
155 goto retry; 174 goto retry;
156 } 175
157 /* All out of space. Need to allocate a new page. Use slot 0. */ 176 /* All out of space. Need to allocate a new page. */
158 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 177 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
159 if (!kip) 178 if (!kip)
160 return NULL; 179 return NULL;
161 180
@@ -170,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
170 return NULL; 189 return NULL;
171 } 190 }
172 INIT_LIST_HEAD(&kip->list); 191 INIT_LIST_HEAD(&kip->list);
173 list_add(&kip->list, &kprobe_insn_pages); 192 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
174 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
175 kip->slot_used[0] = SLOT_USED; 193 kip->slot_used[0] = SLOT_USED;
176 kip->nused = 1; 194 kip->nused = 1;
177 kip->ngarbage = 0; 195 kip->ngarbage = 0;
196 list_add(&kip->list, &c->pages);
178 return kip->insns; 197 return kip->insns;
179} 198}
180 199
200
181kprobe_opcode_t __kprobes *get_insn_slot(void) 201kprobe_opcode_t __kprobes *get_insn_slot(void)
182{ 202{
183 kprobe_opcode_t *ret; 203 kprobe_opcode_t *ret = NULL;
204
184 mutex_lock(&kprobe_insn_mutex); 205 mutex_lock(&kprobe_insn_mutex);
185 ret = __get_insn_slot(); 206 ret = __get_insn_slot(&kprobe_insn_slots);
186 mutex_unlock(&kprobe_insn_mutex); 207 mutex_unlock(&kprobe_insn_mutex);
208
187 return ret; 209 return ret;
188} 210}
189 211
@@ -199,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
199 * so as not to have to set it up again the 221 * so as not to have to set it up again the
200 * next time somebody inserts a probe. 222 * next time somebody inserts a probe.
201 */ 223 */
202 if (!list_is_singular(&kprobe_insn_pages)) { 224 if (!list_is_singular(&kip->list)) {
203 list_del(&kip->list); 225 list_del(&kip->list);
204 module_free(NULL, kip->insns); 226 module_free(NULL, kip->insns);
205 kfree(kip); 227 kfree(kip);
@@ -209,51 +231,85 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
209 return 0; 231 return 0;
210} 232}
211 233
212static int __kprobes collect_garbage_slots(void) 234static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
213{ 235{
214 struct kprobe_insn_page *kip, *next; 236 struct kprobe_insn_page *kip, *next;
215 237
216 /* Ensure no-one is interrupted on the garbages */ 238 /* Ensure no-one is interrupted on the garbages */
217 synchronize_sched(); 239 synchronize_sched();
218 240
219 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { 241 list_for_each_entry_safe(kip, next, &c->pages, list) {
220 int i; 242 int i;
221 if (kip->ngarbage == 0) 243 if (kip->ngarbage == 0)
222 continue; 244 continue;
223 kip->ngarbage = 0; /* we will collect all garbages */ 245 kip->ngarbage = 0; /* we will collect all garbages */
224 for (i = 0; i < INSNS_PER_PAGE; i++) { 246 for (i = 0; i < slots_per_page(c); i++) {
225 if (kip->slot_used[i] == SLOT_DIRTY && 247 if (kip->slot_used[i] == SLOT_DIRTY &&
226 collect_one_slot(kip, i)) 248 collect_one_slot(kip, i))
227 break; 249 break;
228 } 250 }
229 } 251 }
230 kprobe_garbage_slots = 0; 252 c->nr_garbage = 0;
231 return 0; 253 return 0;
232} 254}
233 255
234void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 256static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
257 kprobe_opcode_t *slot, int dirty)
235{ 258{
236 struct kprobe_insn_page *kip; 259 struct kprobe_insn_page *kip;
237 260
238 mutex_lock(&kprobe_insn_mutex); 261 list_for_each_entry(kip, &c->pages, list) {
239 list_for_each_entry(kip, &kprobe_insn_pages, list) { 262 long idx = ((long)slot - (long)kip->insns) /
240 if (kip->insns <= slot && 263 (c->insn_size * sizeof(kprobe_opcode_t));
241 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 264 if (idx >= 0 && idx < slots_per_page(c)) {
242 int i = (slot - kip->insns) / MAX_INSN_SIZE; 265 WARN_ON(kip->slot_used[idx] != SLOT_USED);
243 if (dirty) { 266 if (dirty) {
244 kip->slot_used[i] = SLOT_DIRTY; 267 kip->slot_used[idx] = SLOT_DIRTY;
245 kip->ngarbage++; 268 kip->ngarbage++;
269 if (++c->nr_garbage > slots_per_page(c))
270 collect_garbage_slots(c);
246 } else 271 } else
247 collect_one_slot(kip, i); 272 collect_one_slot(kip, idx);
248 break; 273 return;
249 } 274 }
250 } 275 }
276 /* Could not free this slot. */
277 WARN_ON(1);
278}
251 279
252 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 280void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
253 collect_garbage_slots(); 281{
254 282 mutex_lock(&kprobe_insn_mutex);
283 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
255 mutex_unlock(&kprobe_insn_mutex); 284 mutex_unlock(&kprobe_insn_mutex);
256} 285}
286#ifdef CONFIG_OPTPROBES
287/* For optimized_kprobe buffer */
288static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */
289static struct kprobe_insn_cache kprobe_optinsn_slots = {
290 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
291 /* .insn_size is initialized later */
292 .nr_garbage = 0,
293};
294/* Get a slot for optimized_kprobe buffer */
295kprobe_opcode_t __kprobes *get_optinsn_slot(void)
296{
297 kprobe_opcode_t *ret = NULL;
298
299 mutex_lock(&kprobe_optinsn_mutex);
300 ret = __get_insn_slot(&kprobe_optinsn_slots);
301 mutex_unlock(&kprobe_optinsn_mutex);
302
303 return ret;
304}
305
306void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
307{
308 mutex_lock(&kprobe_optinsn_mutex);
309 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
310 mutex_unlock(&kprobe_optinsn_mutex);
311}
312#endif
257#endif 313#endif
258 314
259/* We have preemption disabled.. so it is safe to use __ versions */ 315/* We have preemption disabled.. so it is safe to use __ versions */
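Aside on the refactoring above: the point of struct kprobe_insn_cache is that the page list, slot size and garbage count travel together, so the same __get_insn_slot()/__free_insn_slot() code can back both the regular insn slots and, under CONFIG_OPTPROBES, the larger optimized-probe buffers. Below is a minimal user-space sketch of that slot-cache pattern, not the kernel code: slot_cache, slot_page and SLOTS_PER_PAGE are invented names, plain malloc() stands in for module_alloc(), and garbage collection is left out.

#include <stdio.h>
#include <stdlib.h>

#define SLOTS_PER_PAGE 8

struct slot_page {
    struct slot_page *next;
    int used[SLOTS_PER_PAGE];   /* 0 = clean, 1 = in use */
    unsigned char *mem;         /* backing store for the slots */
};

struct slot_cache {
    struct slot_page *pages;    /* analogous to c->pages */
    size_t slot_size;           /* analogous to c->insn_size */
};

/* Hand out a free slot, growing the cache by one page if needed. */
static void *cache_get_slot(struct slot_cache *c)
{
    struct slot_page *p;
    int i;

    for (p = c->pages; p; p = p->next)
        for (i = 0; i < SLOTS_PER_PAGE; i++)
            if (!p->used[i]) {
                p->used[i] = 1;
                return p->mem + i * c->slot_size;
            }

    /* No free slot anywhere: add a new page and use its first slot. */
    p = calloc(1, sizeof(*p));
    if (!p)
        return NULL;
    p->mem = malloc(SLOTS_PER_PAGE * c->slot_size);
    if (!p->mem) {
        free(p);
        return NULL;
    }
    p->used[0] = 1;
    p->next = c->pages;
    c->pages = p;
    return p->mem;
}

/* Return a slot to whichever page it came from. */
static void cache_put_slot(struct slot_cache *c, void *slot)
{
    struct slot_page *p;
    unsigned char *s = slot;

    for (p = c->pages; p; p = p->next) {
        if (s >= p->mem && s < p->mem + SLOTS_PER_PAGE * c->slot_size) {
            p->used[(s - p->mem) / c->slot_size] = 0;
            return;
        }
    }
}

int main(void)
{
    struct slot_cache insns = { .pages = NULL, .slot_size = 16 };
    void *a = cache_get_slot(&insns);
    void *b = cache_get_slot(&insns);

    printf("slots: %p %p\n", a, b);
    cache_put_slot(&insns, a);
    cache_put_slot(&insns, b);
    return 0;
}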
@@ -284,23 +340,401 @@ struct kprobe __kprobes *get_kprobe(void *addr)
284 if (p->addr == addr) 340 if (p->addr == addr)
285 return p; 341 return p;
286 } 342 }
343
344 return NULL;
345}
346
347static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
348
349/* Return true if the kprobe is an aggregator */
350static inline int kprobe_aggrprobe(struct kprobe *p)
351{
352 return p->pre_handler == aggr_pre_handler;
353}
354
355/*
356 * Keep all fields in the kprobe consistent
357 */
358static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
359{
360 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
361 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
362}
363
364#ifdef CONFIG_OPTPROBES
365/* NOTE: change this value only with kprobe_mutex held */
366static bool kprobes_allow_optimization;
367
368/*
369 * Call all pre_handler on the list, but ignores its return value.
370 * This must be called from arch-dep optimized caller.
371 */
372void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
373{
374 struct kprobe *kp;
375
376 list_for_each_entry_rcu(kp, &p->list, list) {
377 if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
378 set_kprobe_instance(kp);
379 kp->pre_handler(kp, regs);
380 }
381 reset_kprobe_instance();
382 }
383}
384
385/* Return true(!0) if the kprobe is ready for optimization. */
386static inline int kprobe_optready(struct kprobe *p)
387{
388 struct optimized_kprobe *op;
389
390 if (kprobe_aggrprobe(p)) {
391 op = container_of(p, struct optimized_kprobe, kp);
392 return arch_prepared_optinsn(&op->optinsn);
393 }
394
395 return 0;
396}
397
398/*
399 * Return an optimized kprobe whose optimizing code replaces
400 * instructions including addr (exclude breakpoint).
401 */
402struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
403{
404 int i;
405 struct kprobe *p = NULL;
406 struct optimized_kprobe *op;
407
408 /* Don't check i == 0, since that is a breakpoint case. */
409 for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
410 p = get_kprobe((void *)(addr - i));
411
412 if (p && kprobe_optready(p)) {
413 op = container_of(p, struct optimized_kprobe, kp);
414 if (arch_within_optimized_kprobe(op, addr))
415 return p;
416 }
417
287 return NULL; 418 return NULL;
288} 419}
289 420
421/* Optimization staging list, protected by kprobe_mutex */
422static LIST_HEAD(optimizing_list);
423
424static void kprobe_optimizer(struct work_struct *work);
425static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
426#define OPTIMIZE_DELAY 5
427
428/* Kprobe jump optimizer */
429static __kprobes void kprobe_optimizer(struct work_struct *work)
430{
431 struct optimized_kprobe *op, *tmp;
432
433 /* Lock modules while optimizing kprobes */
434 mutex_lock(&module_mutex);
435 mutex_lock(&kprobe_mutex);
436 if (kprobes_all_disarmed || !kprobes_allow_optimization)
437 goto end;
438
439 /*
 440	 * Wait for quiescence period to ensure all running interrupts
441 * are done. Because optprobe may modify multiple instructions
442 * there is a chance that Nth instruction is interrupted. In that
443 * case, running interrupt can return to 2nd-Nth byte of jump
444 * instruction. This wait is for avoiding it.
445 */
446 synchronize_sched();
447
448 /*
449 * The optimization/unoptimization refers online_cpus via
450 * stop_machine() and cpu-hotplug modifies online_cpus.
451 * And same time, text_mutex will be held in cpu-hotplug and here.
452 * This combination can cause a deadlock (cpu-hotplug try to lock
453 * text_mutex but stop_machine can not be done because online_cpus
454 * has been changed)
455 * To avoid this deadlock, we need to call get_online_cpus()
456 * for preventing cpu-hotplug outside of text_mutex locking.
457 */
458 get_online_cpus();
459 mutex_lock(&text_mutex);
460 list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
461 WARN_ON(kprobe_disabled(&op->kp));
462 if (arch_optimize_kprobe(op) < 0)
463 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
464 list_del_init(&op->list);
465 }
466 mutex_unlock(&text_mutex);
467 put_online_cpus();
468end:
469 mutex_unlock(&kprobe_mutex);
470 mutex_unlock(&module_mutex);
471}
472
473/* Optimize kprobe if p is ready to be optimized */
474static __kprobes void optimize_kprobe(struct kprobe *p)
475{
476 struct optimized_kprobe *op;
477
478 /* Check if the kprobe is disabled or not ready for optimization. */
479 if (!kprobe_optready(p) || !kprobes_allow_optimization ||
480 (kprobe_disabled(p) || kprobes_all_disarmed))
481 return;
482
483 /* Both of break_handler and post_handler are not supported. */
484 if (p->break_handler || p->post_handler)
485 return;
486
487 op = container_of(p, struct optimized_kprobe, kp);
488
489 /* Check there is no other kprobes at the optimized instructions */
490 if (arch_check_optimized_kprobe(op) < 0)
491 return;
492
493 /* Check if it is already optimized. */
494 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
495 return;
496
497 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
498 list_add(&op->list, &optimizing_list);
499 if (!delayed_work_pending(&optimizing_work))
500 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
501}
502
503/* Unoptimize a kprobe if p is optimized */
504static __kprobes void unoptimize_kprobe(struct kprobe *p)
505{
506 struct optimized_kprobe *op;
507
508 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
509 op = container_of(p, struct optimized_kprobe, kp);
510 if (!list_empty(&op->list))
511 /* Dequeue from the optimization queue */
512 list_del_init(&op->list);
513 else
514 /* Replace jump with break */
515 arch_unoptimize_kprobe(op);
516 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
517 }
518}
519
520/* Remove optimized instructions */
521static void __kprobes kill_optimized_kprobe(struct kprobe *p)
522{
523 struct optimized_kprobe *op;
524
525 op = container_of(p, struct optimized_kprobe, kp);
526 if (!list_empty(&op->list)) {
527 /* Dequeue from the optimization queue */
528 list_del_init(&op->list);
529 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
530 }
531 /* Don't unoptimize, because the target code will be freed. */
532 arch_remove_optimized_kprobe(op);
533}
534
535/* Try to prepare optimized instructions */
536static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
537{
538 struct optimized_kprobe *op;
539
540 op = container_of(p, struct optimized_kprobe, kp);
541 arch_prepare_optimized_kprobe(op);
542}
543
544/* Free optimized instructions and optimized_kprobe */
545static __kprobes void free_aggr_kprobe(struct kprobe *p)
546{
547 struct optimized_kprobe *op;
548
549 op = container_of(p, struct optimized_kprobe, kp);
550 arch_remove_optimized_kprobe(op);
551 kfree(op);
552}
553
554/* Allocate new optimized_kprobe and try to prepare optimized instructions */
555static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
556{
557 struct optimized_kprobe *op;
558
559 op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
560 if (!op)
561 return NULL;
562
563 INIT_LIST_HEAD(&op->list);
564 op->kp.addr = p->addr;
565 arch_prepare_optimized_kprobe(op);
566
567 return &op->kp;
568}
569
570static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
571
572/*
573 * Prepare an optimized_kprobe and optimize it
574 * NOTE: p must be a normal registered kprobe
575 */
576static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
577{
578 struct kprobe *ap;
579 struct optimized_kprobe *op;
580
581 ap = alloc_aggr_kprobe(p);
582 if (!ap)
583 return;
584
585 op = container_of(ap, struct optimized_kprobe, kp);
586 if (!arch_prepared_optinsn(&op->optinsn)) {
587 /* If failed to setup optimizing, fallback to kprobe */
588 free_aggr_kprobe(ap);
589 return;
590 }
591
592 init_aggr_kprobe(ap, p);
593 optimize_kprobe(ap);
594}
595
596#ifdef CONFIG_SYSCTL
597static void __kprobes optimize_all_kprobes(void)
598{
599 struct hlist_head *head;
600 struct hlist_node *node;
601 struct kprobe *p;
602 unsigned int i;
603
604 /* If optimization is already allowed, just return */
605 if (kprobes_allow_optimization)
606 return;
607
608 kprobes_allow_optimization = true;
609 mutex_lock(&text_mutex);
610 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
611 head = &kprobe_table[i];
612 hlist_for_each_entry_rcu(p, node, head, hlist)
613 if (!kprobe_disabled(p))
614 optimize_kprobe(p);
615 }
616 mutex_unlock(&text_mutex);
617 printk(KERN_INFO "Kprobes globally optimized\n");
618}
619
620static void __kprobes unoptimize_all_kprobes(void)
621{
622 struct hlist_head *head;
623 struct hlist_node *node;
624 struct kprobe *p;
625 unsigned int i;
626
627 /* If optimization is already prohibited, just return */
628 if (!kprobes_allow_optimization)
629 return;
630
631 kprobes_allow_optimization = false;
632 printk(KERN_INFO "Kprobes globally unoptimized\n");
633 get_online_cpus(); /* For avoiding text_mutex deadlock */
634 mutex_lock(&text_mutex);
635 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
636 head = &kprobe_table[i];
637 hlist_for_each_entry_rcu(p, node, head, hlist) {
638 if (!kprobe_disabled(p))
639 unoptimize_kprobe(p);
640 }
641 }
642
643 mutex_unlock(&text_mutex);
644 put_online_cpus();
645 /* Allow all currently running kprobes to complete */
646 synchronize_sched();
647}
648
649int sysctl_kprobes_optimization;
650int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
651 void __user *buffer, size_t *length,
652 loff_t *ppos)
653{
654 int ret;
655
656 mutex_lock(&kprobe_mutex);
657 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
658 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
659
660 if (sysctl_kprobes_optimization)
661 optimize_all_kprobes();
662 else
663 unoptimize_all_kprobes();
664 mutex_unlock(&kprobe_mutex);
665
666 return ret;
667}
668#endif /* CONFIG_SYSCTL */
669
670static void __kprobes __arm_kprobe(struct kprobe *p)
671{
672 struct kprobe *old_p;
673
674 /* Check collision with other optimized kprobes */
675 old_p = get_optimized_kprobe((unsigned long)p->addr);
676 if (unlikely(old_p))
677 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
678
679 arch_arm_kprobe(p);
680 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
681}
682
683static void __kprobes __disarm_kprobe(struct kprobe *p)
684{
685 struct kprobe *old_p;
686
687 unoptimize_kprobe(p); /* Try to unoptimize */
688 arch_disarm_kprobe(p);
689
690 /* If another kprobe was blocked, optimize it. */
691 old_p = get_optimized_kprobe((unsigned long)p->addr);
692 if (unlikely(old_p))
693 optimize_kprobe(old_p);
694}
695
696#else /* !CONFIG_OPTPROBES */
697
698#define optimize_kprobe(p) do {} while (0)
699#define unoptimize_kprobe(p) do {} while (0)
700#define kill_optimized_kprobe(p) do {} while (0)
701#define prepare_optimized_kprobe(p) do {} while (0)
702#define try_to_optimize_kprobe(p) do {} while (0)
703#define __arm_kprobe(p) arch_arm_kprobe(p)
704#define __disarm_kprobe(p) arch_disarm_kprobe(p)
705
706static __kprobes void free_aggr_kprobe(struct kprobe *p)
707{
708 kfree(p);
709}
710
711static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
712{
713 return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
714}
715#endif /* CONFIG_OPTPROBES */
716
290/* Arm a kprobe with text_mutex */ 717/* Arm a kprobe with text_mutex */
291static void __kprobes arm_kprobe(struct kprobe *kp) 718static void __kprobes arm_kprobe(struct kprobe *kp)
292{ 719{
720 /*
721 * Here, since __arm_kprobe() doesn't use stop_machine(),
722 * this doesn't cause deadlock on text_mutex. So, we don't
723 * need get_online_cpus().
724 */
293 mutex_lock(&text_mutex); 725 mutex_lock(&text_mutex);
294 arch_arm_kprobe(kp); 726 __arm_kprobe(kp);
295 mutex_unlock(&text_mutex); 727 mutex_unlock(&text_mutex);
296} 728}
297 729
298/* Disarm a kprobe with text_mutex */ 730/* Disarm a kprobe with text_mutex */
299static void __kprobes disarm_kprobe(struct kprobe *kp) 731static void __kprobes disarm_kprobe(struct kprobe *kp)
300{ 732{
733 get_online_cpus(); /* For avoiding text_mutex deadlock */
301 mutex_lock(&text_mutex); 734 mutex_lock(&text_mutex);
302 arch_disarm_kprobe(kp); 735 __disarm_kprobe(kp);
303 mutex_unlock(&text_mutex); 736 mutex_unlock(&text_mutex);
737 put_online_cpus();
304} 738}
305 739
306/* 740/*
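Stepping back from the hunk above: optimize_kprobe() only puts the probe on optimizing_list and schedules a single delayed work item; kprobe_optimizer() later waits out a quiescent period and patches the whole batch under text_mutex, with CPU hotplug held off. The following is a rough user-space sketch of that queue-then-batched-flush pattern; pending_list, queue_item() and flush_pending() are invented names, and the quiescence wait and locking are reduced to comments.

#include <stdio.h>
#include <stdlib.h>

struct pending {
    struct pending *next;
    int id;                 /* stands in for an optimized_kprobe */
};

static struct pending *pending_list;
static int flush_scheduled; /* stands in for delayed_work_pending() */

/* Queue one item; schedule a single deferred flush for the whole batch. */
static void queue_item(int id)
{
    struct pending *p = malloc(sizeof(*p));

    if (!p)
        return;
    p->id = id;
    p->next = pending_list;
    pending_list = p;

    if (!flush_scheduled) {
        flush_scheduled = 1;
        printf("flush scheduled (delayed)\n");
    }
}

/* The deferred worker: wait for quiescence once, then patch everything. */
static void flush_pending(void)
{
    struct pending *p, *next;

    /* In the kernel this is synchronize_sched() plus text_mutex. */
    printf("quiescent period over, patching batch:\n");

    for (p = pending_list; p; p = next) {
        next = p->next;
        printf("  patching item %d\n", p->id);
        free(p);
    }
    pending_list = NULL;
    flush_scheduled = 0;
}

int main(void)
{
    queue_item(1);
    queue_item(2);
    queue_item(3);
    flush_pending();
    return 0;
}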
@@ -369,7 +803,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
369void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 803void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
370{ 804{
371 struct kprobe *kp; 805 struct kprobe *kp;
372 if (p->pre_handler != aggr_pre_handler) { 806 if (!kprobe_aggrprobe(p)) {
373 p->nmissed++; 807 p->nmissed++;
374 } else { 808 } else {
375 list_for_each_entry_rcu(kp, &p->list, list) 809 list_for_each_entry_rcu(kp, &p->list, list)
@@ -493,21 +927,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
493} 927}
494 928
495/* 929/*
496 * Keep all fields in the kprobe consistent
497 */
498static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
499{
500 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
501 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
502}
503
504/*
505* Add the new probe to ap->list. Fail if this is the 930* Add the new probe to ap->list. Fail if this is the
506* second jprobe at the address - two jprobes can't coexist 931* second jprobe at the address - two jprobes can't coexist
507*/ 932*/
508static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) 933static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
509{ 934{
510 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 935 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
936
937 if (p->break_handler || p->post_handler)
938 unoptimize_kprobe(ap); /* Fall back to normal kprobe */
939
511 if (p->break_handler) { 940 if (p->break_handler) {
512 if (ap->break_handler) 941 if (ap->break_handler)
513 return -EEXIST; 942 return -EEXIST;
@@ -522,7 +951,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
522 ap->flags &= ~KPROBE_FLAG_DISABLED; 951 ap->flags &= ~KPROBE_FLAG_DISABLED;
523 if (!kprobes_all_disarmed) 952 if (!kprobes_all_disarmed)
524 /* Arm the breakpoint again. */ 953 /* Arm the breakpoint again. */
525 arm_kprobe(ap); 954 __arm_kprobe(ap);
526 } 955 }
527 return 0; 956 return 0;
528} 957}
@@ -531,12 +960,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
531 * Fill in the required fields of the "manager kprobe". Replace the 960 * Fill in the required fields of the "manager kprobe". Replace the
532 * earlier kprobe in the hlist with the manager kprobe 961 * earlier kprobe in the hlist with the manager kprobe
533 */ 962 */
534static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 963static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
535{ 964{
965 /* Copy p's insn slot to ap */
536 copy_kprobe(p, ap); 966 copy_kprobe(p, ap);
537 flush_insn_slot(ap); 967 flush_insn_slot(ap);
538 ap->addr = p->addr; 968 ap->addr = p->addr;
539 ap->flags = p->flags; 969 ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
540 ap->pre_handler = aggr_pre_handler; 970 ap->pre_handler = aggr_pre_handler;
541 ap->fault_handler = aggr_fault_handler; 971 ap->fault_handler = aggr_fault_handler;
542 /* We don't care the kprobe which has gone. */ 972 /* We don't care the kprobe which has gone. */
@@ -546,8 +976,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
546 ap->break_handler = aggr_break_handler; 976 ap->break_handler = aggr_break_handler;
547 977
548 INIT_LIST_HEAD(&ap->list); 978 INIT_LIST_HEAD(&ap->list);
549 list_add_rcu(&p->list, &ap->list); 979 INIT_HLIST_NODE(&ap->hlist);
550 980
981 list_add_rcu(&p->list, &ap->list);
551 hlist_replace_rcu(&p->hlist, &ap->hlist); 982 hlist_replace_rcu(&p->hlist, &ap->hlist);
552} 983}
553 984
@@ -561,12 +992,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
561 int ret = 0; 992 int ret = 0;
562 struct kprobe *ap = old_p; 993 struct kprobe *ap = old_p;
563 994
564 if (old_p->pre_handler != aggr_pre_handler) { 995 if (!kprobe_aggrprobe(old_p)) {
565 /* If old_p is not an aggr_probe, create new aggr_kprobe. */ 996 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
566 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 997 ap = alloc_aggr_kprobe(old_p);
567 if (!ap) 998 if (!ap)
568 return -ENOMEM; 999 return -ENOMEM;
569 add_aggr_kprobe(ap, old_p); 1000 init_aggr_kprobe(ap, old_p);
570 } 1001 }
571 1002
572 if (kprobe_gone(ap)) { 1003 if (kprobe_gone(ap)) {
@@ -585,6 +1016,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
585 */ 1016 */
586 return ret; 1017 return ret;
587 1018
1019 /* Prepare optimized instructions if possible. */
1020 prepare_optimized_kprobe(ap);
1021
588 /* 1022 /*
589 * Clear gone flag to prevent allocating new slot again, and 1023 * Clear gone flag to prevent allocating new slot again, and
590 * set disabled flag because it is not armed yet. 1024 * set disabled flag because it is not armed yet.
@@ -593,6 +1027,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
593 | KPROBE_FLAG_DISABLED; 1027 | KPROBE_FLAG_DISABLED;
594 } 1028 }
595 1029
1030 /* Copy ap's insn slot to p */
596 copy_kprobe(ap, p); 1031 copy_kprobe(ap, p);
597 return add_new_kprobe(ap, p); 1032 return add_new_kprobe(ap, p);
598} 1033}
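For context on register_aggr_kprobe()/init_aggr_kprobe(): an aggregator is installed once a second probe is registered at an address that already has one, and its pre-handler simply walks the list of child probes. A compact user-space sketch of that fan-out, with invented names (struct probe, aggr_dispatch):

#include <stdio.h>

struct probe {
    void (*handler)(struct probe *);
    struct probe *next;     /* sibling on the aggregator's list */
};

/* The aggregator's handler: call every child handler in turn. */
static void aggr_dispatch(struct probe *head)
{
    struct probe *p;

    for (p = head; p; p = p->next)
        p->handler(p);
}

static void handler_a(struct probe *p) { (void)p; printf("handler A fired\n"); }
static void handler_b(struct probe *p) { (void)p; printf("handler B fired\n"); }

int main(void)
{
    struct probe b = { handler_b, NULL };
    struct probe a = { handler_a, &b };

    /* One breakpoint hit -> the aggregator fans out to both probes. */
    aggr_dispatch(&a);
    return 0;
}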
@@ -743,27 +1178,34 @@ int __kprobes register_kprobe(struct kprobe *p)
743 p->nmissed = 0; 1178 p->nmissed = 0;
744 INIT_LIST_HEAD(&p->list); 1179 INIT_LIST_HEAD(&p->list);
745 mutex_lock(&kprobe_mutex); 1180 mutex_lock(&kprobe_mutex);
1181
1182 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1183 mutex_lock(&text_mutex);
1184
746 old_p = get_kprobe(p->addr); 1185 old_p = get_kprobe(p->addr);
747 if (old_p) { 1186 if (old_p) {
1187 /* Since this may unoptimize old_p, locking text_mutex. */
748 ret = register_aggr_kprobe(old_p, p); 1188 ret = register_aggr_kprobe(old_p, p);
749 goto out; 1189 goto out;
750 } 1190 }
751 1191
752 mutex_lock(&text_mutex);
753 ret = arch_prepare_kprobe(p); 1192 ret = arch_prepare_kprobe(p);
754 if (ret) 1193 if (ret)
755 goto out_unlock_text; 1194 goto out;
756 1195
757 INIT_HLIST_NODE(&p->hlist); 1196 INIT_HLIST_NODE(&p->hlist);
758 hlist_add_head_rcu(&p->hlist, 1197 hlist_add_head_rcu(&p->hlist,
759 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 1198 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
760 1199
761 if (!kprobes_all_disarmed && !kprobe_disabled(p)) 1200 if (!kprobes_all_disarmed && !kprobe_disabled(p))
762 arch_arm_kprobe(p); 1201 __arm_kprobe(p);
1202
1203 /* Try to optimize kprobe */
1204 try_to_optimize_kprobe(p);
763 1205
764out_unlock_text:
765 mutex_unlock(&text_mutex);
766out: 1206out:
1207 mutex_unlock(&text_mutex);
1208 put_online_cpus();
767 mutex_unlock(&kprobe_mutex); 1209 mutex_unlock(&kprobe_mutex);
768 1210
769 if (probed_mod) 1211 if (probed_mod)
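The reordered locking in register_kprobe() above pins down one lock order -- kprobe_mutex, then CPU hotplug via get_online_cpus(), then text_mutex -- released in reverse, which is what keeps registration, the optimizer and hotplug from deadlocking against each other. A trivial user-space analogue of that acquire-in-order/release-in-reverse discipline (the lock names here are illustrative only; build with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ kprobe_mutex */
static pthread_mutex_t hotplug_lock  = PTHREAD_MUTEX_INITIALIZER; /* ~ cpu hotplug */
static pthread_mutex_t text_lock     = PTHREAD_MUTEX_INITIALIZER; /* ~ text_mutex */

static void register_probe(const char *name)
{
    /* Always acquire in the same order... */
    pthread_mutex_lock(&registry_lock);
    pthread_mutex_lock(&hotplug_lock);
    pthread_mutex_lock(&text_lock);

    printf("patching text for %s\n", name);

    /* ...and release in the reverse order. */
    pthread_mutex_unlock(&text_lock);
    pthread_mutex_unlock(&hotplug_lock);
    pthread_mutex_unlock(&registry_lock);
}

int main(void)
{
    register_probe("my_probe");
    return 0;
}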
@@ -785,7 +1227,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
785 return -EINVAL; 1227 return -EINVAL;
786 1228
787 if (old_p == p || 1229 if (old_p == p ||
788 (old_p->pre_handler == aggr_pre_handler && 1230 (kprobe_aggrprobe(old_p) &&
789 list_is_singular(&old_p->list))) { 1231 list_is_singular(&old_p->list))) {
790 /* 1232 /*
791 * Only probe on the hash list. Disarm only if kprobes are 1233 * Only probe on the hash list. Disarm only if kprobes are
@@ -793,7 +1235,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
793 * already have been removed. We save on flushing icache. 1235 * already have been removed. We save on flushing icache.
794 */ 1236 */
795 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1237 if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
796 disarm_kprobe(p); 1238 disarm_kprobe(old_p);
797 hlist_del_rcu(&old_p->hlist); 1239 hlist_del_rcu(&old_p->hlist);
798 } else { 1240 } else {
799 if (p->break_handler && !kprobe_gone(p)) 1241 if (p->break_handler && !kprobe_gone(p))
@@ -809,8 +1251,13 @@ noclean:
809 list_del_rcu(&p->list); 1251 list_del_rcu(&p->list);
810 if (!kprobe_disabled(old_p)) { 1252 if (!kprobe_disabled(old_p)) {
811 try_to_disable_aggr_kprobe(old_p); 1253 try_to_disable_aggr_kprobe(old_p);
812 if (!kprobes_all_disarmed && kprobe_disabled(old_p)) 1254 if (!kprobes_all_disarmed) {
813 disarm_kprobe(old_p); 1255 if (kprobe_disabled(old_p))
1256 disarm_kprobe(old_p);
1257 else
1258 /* Try to optimize this probe again */
1259 optimize_kprobe(old_p);
1260 }
814 } 1261 }
815 } 1262 }
816 return 0; 1263 return 0;
@@ -827,7 +1274,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
827 old_p = list_entry(p->list.next, struct kprobe, list); 1274 old_p = list_entry(p->list.next, struct kprobe, list);
828 list_del(&p->list); 1275 list_del(&p->list);
829 arch_remove_kprobe(old_p); 1276 arch_remove_kprobe(old_p);
830 kfree(old_p); 1277 free_aggr_kprobe(old_p);
831 } 1278 }
832} 1279}
833 1280
@@ -1123,7 +1570,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1123 struct kprobe *kp; 1570 struct kprobe *kp;
1124 1571
1125 p->flags |= KPROBE_FLAG_GONE; 1572 p->flags |= KPROBE_FLAG_GONE;
1126 if (p->pre_handler == aggr_pre_handler) { 1573 if (kprobe_aggrprobe(p)) {
1127 /* 1574 /*
1128 * If this is an aggr_kprobe, we have to list all the 1575 * If this is an aggr_kprobe, we have to list all the
1129 * chained probes and mark them GONE. 1576 * chained probes and mark them GONE.
@@ -1132,6 +1579,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1132 kp->flags |= KPROBE_FLAG_GONE; 1579 kp->flags |= KPROBE_FLAG_GONE;
1133 p->post_handler = NULL; 1580 p->post_handler = NULL;
1134 p->break_handler = NULL; 1581 p->break_handler = NULL;
1582 kill_optimized_kprobe(p);
1135 } 1583 }
1136 /* 1584 /*
1137 * Here, we can remove insn_slot safely, because no thread calls 1585 * Here, we can remove insn_slot safely, because no thread calls
@@ -1241,6 +1689,15 @@ static int __init init_kprobes(void)
1241 } 1689 }
1242 } 1690 }
1243 1691
1692#if defined(CONFIG_OPTPROBES)
1693#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
1694 /* Init kprobe_optinsn_slots */
1695 kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
1696#endif
1697 /* By default, kprobes can be optimized */
1698 kprobes_allow_optimization = true;
1699#endif
1700
1244 /* By default, kprobes are armed */ 1701 /* By default, kprobes are armed */
1245 kprobes_all_disarmed = false; 1702 kprobes_all_disarmed = false;
1246 1703
@@ -1259,7 +1716,7 @@ static int __init init_kprobes(void)
1259 1716
1260#ifdef CONFIG_DEBUG_FS 1717#ifdef CONFIG_DEBUG_FS
1261static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 1718static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1262 const char *sym, int offset,char *modname) 1719 const char *sym, int offset, char *modname, struct kprobe *pp)
1263{ 1720{
1264 char *kprobe_type; 1721 char *kprobe_type;
1265 1722
@@ -1269,19 +1726,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1269 kprobe_type = "j"; 1726 kprobe_type = "j";
1270 else 1727 else
1271 kprobe_type = "k"; 1728 kprobe_type = "k";
1729
1272 if (sym) 1730 if (sym)
1273 seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", 1731 seq_printf(pi, "%p %s %s+0x%x %s ",
1274 p->addr, kprobe_type, sym, offset, 1732 p->addr, kprobe_type, sym, offset,
1275 (modname ? modname : " "), 1733 (modname ? modname : " "));
1276 (kprobe_gone(p) ? "[GONE]" : ""),
1277 ((kprobe_disabled(p) && !kprobe_gone(p)) ?
1278 "[DISABLED]" : ""));
1279 else 1734 else
1280 seq_printf(pi, "%p %s %p %s%s\n", 1735 seq_printf(pi, "%p %s %p ",
1281 p->addr, kprobe_type, p->addr, 1736 p->addr, kprobe_type, p->addr);
1282 (kprobe_gone(p) ? "[GONE]" : ""), 1737
1283 ((kprobe_disabled(p) && !kprobe_gone(p)) ? 1738 if (!pp)
1284 "[DISABLED]" : "")); 1739 pp = p;
1740 seq_printf(pi, "%s%s%s\n",
1741 (kprobe_gone(p) ? "[GONE]" : ""),
1742 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
1743 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
1285} 1744}
1286 1745
1287static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1746static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
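The report_probe() rework above splits the output into two seq_printf() calls so that a third tag, [OPTIMIZED], can be appended based on the parent aggregator (pp) rather than the child probe itself. A toy version of that flag-to-tag formatting; the flag bits and probe names are invented:

#include <stdio.h>

#define F_GONE      0x1
#define F_DISABLED  0x2
#define F_OPTIMIZED 0x4

static void report(const char *name, unsigned int flags)
{
    printf("%s %s%s%s\n", name,
           (flags & F_GONE) ? "[GONE]" : "",
           ((flags & F_DISABLED) && !(flags & F_GONE)) ? "[DISABLED]" : "",
           (flags & F_OPTIMIZED) ? "[OPTIMIZED]" : "");
}

int main(void)
{
    report("my_probe_site", F_OPTIMIZED);
    report("stale_probe_site", F_GONE);
    return 0;
}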
@@ -1317,11 +1776,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1317 hlist_for_each_entry_rcu(p, node, head, hlist) { 1776 hlist_for_each_entry_rcu(p, node, head, hlist) {
1318 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 1777 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
1319 &offset, &modname, namebuf); 1778 &offset, &modname, namebuf);
1320 if (p->pre_handler == aggr_pre_handler) { 1779 if (kprobe_aggrprobe(p)) {
1321 list_for_each_entry_rcu(kp, &p->list, list) 1780 list_for_each_entry_rcu(kp, &p->list, list)
1322 report_probe(pi, kp, sym, offset, modname); 1781 report_probe(pi, kp, sym, offset, modname, p);
1323 } else 1782 } else
1324 report_probe(pi, p, sym, offset, modname); 1783 report_probe(pi, p, sym, offset, modname, NULL);
1325 } 1784 }
1326 preempt_enable(); 1785 preempt_enable();
1327 return 0; 1786 return 0;
@@ -1399,12 +1858,13 @@ int __kprobes enable_kprobe(struct kprobe *kp)
1399 goto out; 1858 goto out;
1400 } 1859 }
1401 1860
1402 if (!kprobes_all_disarmed && kprobe_disabled(p))
1403 arm_kprobe(p);
1404
1405 p->flags &= ~KPROBE_FLAG_DISABLED;
1406 if (p != kp) 1861 if (p != kp)
1407 kp->flags &= ~KPROBE_FLAG_DISABLED; 1862 kp->flags &= ~KPROBE_FLAG_DISABLED;
1863
1864 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1865 p->flags &= ~KPROBE_FLAG_DISABLED;
1866 arm_kprobe(p);
1867 }
1408out: 1868out:
1409 mutex_unlock(&kprobe_mutex); 1869 mutex_unlock(&kprobe_mutex);
1410 return ret; 1870 return ret;
@@ -1424,12 +1884,13 @@ static void __kprobes arm_all_kprobes(void)
1424 if (!kprobes_all_disarmed) 1884 if (!kprobes_all_disarmed)
1425 goto already_enabled; 1885 goto already_enabled;
1426 1886
1887 /* Arming kprobes doesn't optimize kprobe itself */
1427 mutex_lock(&text_mutex); 1888 mutex_lock(&text_mutex);
1428 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1889 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1429 head = &kprobe_table[i]; 1890 head = &kprobe_table[i];
1430 hlist_for_each_entry_rcu(p, node, head, hlist) 1891 hlist_for_each_entry_rcu(p, node, head, hlist)
1431 if (!kprobe_disabled(p)) 1892 if (!kprobe_disabled(p))
1432 arch_arm_kprobe(p); 1893 __arm_kprobe(p);
1433 } 1894 }
1434 mutex_unlock(&text_mutex); 1895 mutex_unlock(&text_mutex);
1435 1896
@@ -1456,16 +1917,23 @@ static void __kprobes disarm_all_kprobes(void)
1456 1917
1457 kprobes_all_disarmed = true; 1918 kprobes_all_disarmed = true;
1458 printk(KERN_INFO "Kprobes globally disabled\n"); 1919 printk(KERN_INFO "Kprobes globally disabled\n");
1920
1921 /*
1922 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1923 * because disarming may also unoptimize kprobes.
1924 */
1925 get_online_cpus();
1459 mutex_lock(&text_mutex); 1926 mutex_lock(&text_mutex);
1460 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1927 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1461 head = &kprobe_table[i]; 1928 head = &kprobe_table[i];
1462 hlist_for_each_entry_rcu(p, node, head, hlist) { 1929 hlist_for_each_entry_rcu(p, node, head, hlist) {
1463 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 1930 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1464 arch_disarm_kprobe(p); 1931 __disarm_kprobe(p);
1465 } 1932 }
1466 } 1933 }
1467 1934
1468 mutex_unlock(&text_mutex); 1935 mutex_unlock(&text_mutex);
1936 put_online_cpus();
1469 mutex_unlock(&kprobe_mutex); 1937 mutex_unlock(&kprobe_mutex);
1470 /* Allow all currently running kprobes to complete */ 1938 /* Allow all currently running kprobes to complete */
1471 synchronize_sched(); 1939 synchronize_sched();
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3feaf5a74514..21fe3c426948 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
33} 33}
34KERNEL_ATTR_RO(uevent_seqnum); 34KERNEL_ATTR_RO(uevent_seqnum);
35 35
36/* uevent helper program, used during early boo */ 36/* uevent helper program, used during early boot */
37static ssize_t uevent_helper_show(struct kobject *kobj, 37static ssize_t uevent_helper_show(struct kobject *kobj,
38 struct kobj_attribute *attr, char *buf) 38 struct kobj_attribute *attr, char *buf)
39{ 39{
@@ -197,16 +197,8 @@ static int __init ksysfs_init(void)
197 goto group_exit; 197 goto group_exit;
198 } 198 }
199 199
200 /* create the /sys/kernel/uids/ directory */
201 error = uids_sysfs_init();
202 if (error)
203 goto notes_exit;
204
205 return 0; 200 return 0;
206 201
207notes_exit:
208 if (notes_size > 0)
209 sysfs_remove_bin_file(kernel_kobj, &notes_attr);
210group_exit: 202group_exit:
211 sysfs_remove_group(kernel_kobj, &kernel_attr_group); 203 sysfs_remove_group(kernel_kobj, &kernel_attr_group);
212kset_exit: 204kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fbb6222fe7e0..83911c780175 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
101 * 101 *
102 * Description: This helper function creates and names a kernel 102 * Description: This helper function creates and names a kernel
103 * thread. The thread will be stopped: use wake_up_process() to start 103 * thread. The thread will be stopped: use wake_up_process() to start
104 * it. See also kthread_run(), kthread_create_on_cpu(). 104 * it. See also kthread_run().
105 * 105 *
106 * When woken, the thread will run @threadfn() with @data as its 106 * When woken, the thread will run @threadfn() with @data as its
107 * argument. @threadfn() can either call do_exit() directly if it is a 107 * argument. @threadfn() can either call do_exit() directly if it is a
@@ -219,7 +219,7 @@ int kthreadd(void *unused)
219 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
220 ignore_signals(tsk); 220 ignore_signals(tsk);
221 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
222 set_mems_allowed(node_possible_map); 222 set_mems_allowed(node_states[N_HIGH_MEMORY]);
223 223
224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
225 225
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index ca07c5c0c914..877fb306d415 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -56,7 +56,6 @@
56#include <linux/module.h> 56#include <linux/module.h>
57#include <linux/sched.h> 57#include <linux/sched.h>
58#include <linux/list.h> 58#include <linux/list.h>
59#include <linux/slab.h>
60#include <linux/stacktrace.h> 59#include <linux/stacktrace.h>
61 60
62static DEFINE_SPINLOCK(latency_lock); 61static DEFINE_SPINLOCK(latency_lock);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 5feaddcdbe49..2594e1ce41cb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -43,6 +43,7 @@
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h> 45#include <linux/bitops.h>
46#include <linux/gfp.h>
46 47
47#include <asm/sections.h> 48#include <asm/sections.h>
48 49
@@ -582,9 +583,6 @@ static int static_obj(void *obj)
582 unsigned long start = (unsigned long) &_stext, 583 unsigned long start = (unsigned long) &_stext,
583 end = (unsigned long) &_end, 584 end = (unsigned long) &_end,
584 addr = (unsigned long) obj; 585 addr = (unsigned long) obj;
585#ifdef CONFIG_SMP
586 int i;
587#endif
588 586
589 /* 587 /*
590 * static variable? 588 * static variable?
@@ -595,24 +593,16 @@ static int static_obj(void *obj)
595 if (arch_is_kernel_data(addr)) 593 if (arch_is_kernel_data(addr))
596 return 1; 594 return 1;
597 595
598#ifdef CONFIG_SMP
599 /* 596 /*
600 * percpu var? 597 * in-kernel percpu var?
601 */ 598 */
602 for_each_possible_cpu(i) { 599 if (is_kernel_percpu_address(addr))
603 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); 600 return 1;
604 end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
605 + per_cpu_offset(i);
606
607 if ((addr >= start) && (addr < end))
608 return 1;
609 }
610#endif
611 601
612 /* 602 /*
613 * module var? 603 * module static or percpu var?
614 */ 604 */
615 return is_module_address(addr); 605 return is_module_address(addr) || is_module_percpu_address(addr);
616} 606}
617 607
618/* 608/*
@@ -2147,7 +2137,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2147 return ret; 2137 return ret;
2148 2138
2149 return print_irq_inversion_bug(curr, &root, target_entry, 2139 return print_irq_inversion_bug(curr, &root, target_entry,
2150 this, 1, irqclass); 2140 this, 0, irqclass);
2151} 2141}
2152 2142
2153void print_irqtrace_events(struct task_struct *curr) 2143void print_irqtrace_events(struct task_struct *curr)
@@ -3211,8 +3201,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3211{ 3201{
3212 unsigned long flags; 3202 unsigned long flags;
3213 3203
3214 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
3215
3216 if (unlikely(current->lockdep_recursion)) 3204 if (unlikely(current->lockdep_recursion))
3217 return; 3205 return;
3218 3206
@@ -3220,6 +3208,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3220 check_flags(flags); 3208 check_flags(flags);
3221 3209
3222 current->lockdep_recursion = 1; 3210 current->lockdep_recursion = 1;
3211 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
3223 __lock_acquire(lock, subclass, trylock, read, check, 3212 __lock_acquire(lock, subclass, trylock, read, check,
3224 irqs_disabled_flags(flags), nest_lock, ip, 0); 3213 irqs_disabled_flags(flags), nest_lock, ip, 0);
3225 current->lockdep_recursion = 0; 3214 current->lockdep_recursion = 0;
@@ -3232,14 +3221,13 @@ void lock_release(struct lockdep_map *lock, int nested,
3232{ 3221{
3233 unsigned long flags; 3222 unsigned long flags;
3234 3223
3235 trace_lock_release(lock, nested, ip);
3236
3237 if (unlikely(current->lockdep_recursion)) 3224 if (unlikely(current->lockdep_recursion))
3238 return; 3225 return;
3239 3226
3240 raw_local_irq_save(flags); 3227 raw_local_irq_save(flags);
3241 check_flags(flags); 3228 check_flags(flags);
3242 current->lockdep_recursion = 1; 3229 current->lockdep_recursion = 1;
3230 trace_lock_release(lock, nested, ip);
3243 __lock_release(lock, nested, ip); 3231 __lock_release(lock, nested, ip);
3244 current->lockdep_recursion = 0; 3232 current->lockdep_recursion = 0;
3245 raw_local_irq_restore(flags); 3233 raw_local_irq_restore(flags);
@@ -3413,8 +3401,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3413{ 3401{
3414 unsigned long flags; 3402 unsigned long flags;
3415 3403
3416 trace_lock_contended(lock, ip);
3417
3418 if (unlikely(!lock_stat)) 3404 if (unlikely(!lock_stat))
3419 return; 3405 return;
3420 3406
@@ -3424,6 +3410,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3424 raw_local_irq_save(flags); 3410 raw_local_irq_save(flags);
3425 check_flags(flags); 3411 check_flags(flags);
3426 current->lockdep_recursion = 1; 3412 current->lockdep_recursion = 1;
3413 trace_lock_contended(lock, ip);
3427 __lock_contended(lock, ip); 3414 __lock_contended(lock, ip);
3428 current->lockdep_recursion = 0; 3415 current->lockdep_recursion = 0;
3429 raw_local_irq_restore(flags); 3416 raw_local_irq_restore(flags);
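The lockdep hunks above all make the same change: the trace_lock_* tracepoints now fire only after current->lockdep_recursion is set, so any locks taken inside the tracing machinery cannot recurse back into lockdep. A minimal sketch of such a re-entrancy guard using a thread-local flag; instrument() and lock_acquire_annotated() are invented names:

#include <stdio.h>

static _Thread_local int in_lock_code;  /* ~ current->lockdep_recursion */

static void instrument(const char *what)
{
    /* Pretend this path could itself take locks and re-enter us. */
    printf("trace: %s\n", what);
}

static void lock_acquire_annotated(const char *name)
{
    if (in_lock_code)
        return;             /* already inside: re-entry is a no-op */

    in_lock_code = 1;
    instrument(name);       /* safe: any recursion bails out above */
    /* ... real lock bookkeeping would run here ... */
    in_lock_code = 0;
}

int main(void)
{
    lock_acquire_annotated("my_lock");
    return 0;
}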
@@ -3809,3 +3796,22 @@ void lockdep_sys_exit(void)
3809 lockdep_print_held_locks(curr); 3796 lockdep_print_held_locks(curr);
3810 } 3797 }
3811} 3798}
3799
3800void lockdep_rcu_dereference(const char *file, const int line)
3801{
3802 struct task_struct *curr = current;
3803
3804 if (!debug_locks_off())
3805 return;
3806 printk("\n===================================================\n");
3807 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3808 printk( "---------------------------------------------------\n");
3809 printk("%s:%d invoked rcu_dereference_check() without protection!\n",
3810 file, line);
3811 printk("\nother info that might help us debug this:\n\n");
3812 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
3813 lockdep_print_held_locks(curr);
3814 printk("\nstack backtrace:\n");
3815 dump_stack();
3816}
3817EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
diff --git a/kernel/module.c b/kernel/module.c
index f82386bd9ee9..1016b75b026a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -370,27 +370,33 @@ EXPORT_SYMBOL_GPL(find_module);
370 370
371#ifdef CONFIG_SMP 371#ifdef CONFIG_SMP
372 372
373static void *percpu_modalloc(unsigned long size, unsigned long align, 373static inline void __percpu *mod_percpu(struct module *mod)
374 const char *name)
375{ 374{
376 void *ptr; 375 return mod->percpu;
376}
377 377
378static int percpu_modalloc(struct module *mod,
379 unsigned long size, unsigned long align)
380{
378 if (align > PAGE_SIZE) { 381 if (align > PAGE_SIZE) {
379 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 382 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
380 name, align, PAGE_SIZE); 383 mod->name, align, PAGE_SIZE);
381 align = PAGE_SIZE; 384 align = PAGE_SIZE;
382 } 385 }
383 386
384 ptr = __alloc_reserved_percpu(size, align); 387 mod->percpu = __alloc_reserved_percpu(size, align);
385 if (!ptr) 388 if (!mod->percpu) {
386 printk(KERN_WARNING 389 printk(KERN_WARNING
387 "Could not allocate %lu bytes percpu data\n", size); 390 "Could not allocate %lu bytes percpu data\n", size);
388 return ptr; 391 return -ENOMEM;
392 }
393 mod->percpu_size = size;
394 return 0;
389} 395}
390 396
391static void percpu_modfree(void *freeme) 397static void percpu_modfree(struct module *mod)
392{ 398{
393 free_percpu(freeme); 399 free_percpu(mod->percpu);
394} 400}
395 401
396static unsigned int find_pcpusec(Elf_Ehdr *hdr, 402static unsigned int find_pcpusec(Elf_Ehdr *hdr,
@@ -400,24 +406,62 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
400 return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); 406 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
401} 407}
402 408
403static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) 409static void percpu_modcopy(struct module *mod,
410 const void *from, unsigned long size)
404{ 411{
405 int cpu; 412 int cpu;
406 413
407 for_each_possible_cpu(cpu) 414 for_each_possible_cpu(cpu)
408 memcpy(pcpudest + per_cpu_offset(cpu), from, size); 415 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
416}
417
418/**
419 * is_module_percpu_address - test whether address is from module static percpu
420 * @addr: address to test
421 *
422 * Test whether @addr belongs to module static percpu area.
423 *
424 * RETURNS:
425 * %true if @addr is from module static percpu area
426 */
427bool is_module_percpu_address(unsigned long addr)
428{
429 struct module *mod;
430 unsigned int cpu;
431
432 preempt_disable();
433
434 list_for_each_entry_rcu(mod, &modules, list) {
435 if (!mod->percpu_size)
436 continue;
437 for_each_possible_cpu(cpu) {
438 void *start = per_cpu_ptr(mod->percpu, cpu);
439
440 if ((void *)addr >= start &&
441 (void *)addr < start + mod->percpu_size) {
442 preempt_enable();
443 return true;
444 }
445 }
446 }
447
448 preempt_enable();
449 return false;
409} 450}
410 451
411#else /* ... !CONFIG_SMP */ 452#else /* ... !CONFIG_SMP */
412 453
413static inline void *percpu_modalloc(unsigned long size, unsigned long align, 454static inline void __percpu *mod_percpu(struct module *mod)
414 const char *name)
415{ 455{
416 return NULL; 456 return NULL;
417} 457}
418static inline void percpu_modfree(void *pcpuptr) 458static inline int percpu_modalloc(struct module *mod,
459 unsigned long size, unsigned long align)
460{
461 return -ENOMEM;
462}
463static inline void percpu_modfree(struct module *mod)
419{ 464{
420 BUG();
421} 465}
422static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, 466static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
423 Elf_Shdr *sechdrs, 467 Elf_Shdr *sechdrs,
@@ -425,12 +469,16 @@ static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
425{ 469{
426 return 0; 470 return 0;
427} 471}
428static inline void percpu_modcopy(void *pcpudst, const void *src, 472static inline void percpu_modcopy(struct module *mod,
429 unsigned long size) 473 const void *from, unsigned long size)
430{ 474{
431 /* pcpusec should be 0, and size of that section should be 0. */ 475 /* pcpusec should be 0, and size of that section should be 0. */
432 BUG_ON(size != 0); 476 BUG_ON(size != 0);
433} 477}
478bool is_module_percpu_address(unsigned long addr)
479{
480 return false;
481}
434 482
435#endif /* CONFIG_SMP */ 483#endif /* CONFIG_SMP */
436 484
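The new is_module_percpu_address() above is, per possible CPU, a plain range test: does the address fall inside [per_cpu_ptr(mod->percpu, cpu), per_cpu_ptr(mod->percpu, cpu) + mod->percpu_size)? A user-space sketch of that membership test against a fake per-CPU layout; NR_CPUS, bases[] and chunk_size are invented for illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

static void *bases[NR_CPUS];    /* one copy of the chunk per "CPU" */
static size_t chunk_size = 64;  /* ~ mod->percpu_size */

/* True if addr lands inside any CPU's copy of the percpu chunk. */
static bool is_percpu_address(uintptr_t addr)
{
    int cpu;

    for (cpu = 0; cpu < NR_CPUS; cpu++) {
        uintptr_t start = (uintptr_t)bases[cpu];

        if (addr >= start && addr < start + chunk_size)
            return true;
    }
    return false;
}

int main(void)
{
    int cpu;

    for (cpu = 0; cpu < NR_CPUS; cpu++)
        bases[cpu] = malloc(chunk_size);

    printf("inside:  %d\n", is_percpu_address((uintptr_t)bases[2] + 8));
    printf("outside: %d\n", is_percpu_address((uintptr_t)&chunk_size));
    return 0;
}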
@@ -473,10 +521,13 @@ static void module_unload_init(struct module *mod)
473 int cpu; 521 int cpu;
474 522
475 INIT_LIST_HEAD(&mod->modules_which_use_me); 523 INIT_LIST_HEAD(&mod->modules_which_use_me);
476 for_each_possible_cpu(cpu) 524 for_each_possible_cpu(cpu) {
477 local_set(__module_ref_addr(mod, cpu), 0); 525 per_cpu_ptr(mod->refptr, cpu)->incs = 0;
526 per_cpu_ptr(mod->refptr, cpu)->decs = 0;
527 }
528
478 /* Hold reference count during initialization. */ 529 /* Hold reference count during initialization. */
479 local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); 530 __this_cpu_write(mod->refptr->incs, 1);
480 /* Backwards compatibility macros put refcount during init. */ 531 /* Backwards compatibility macros put refcount during init. */
481 mod->waiter = current; 532 mod->waiter = current;
482} 533}
@@ -615,12 +666,28 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
615 666
616unsigned int module_refcount(struct module *mod) 667unsigned int module_refcount(struct module *mod)
617{ 668{
618 unsigned int total = 0; 669 unsigned int incs = 0, decs = 0;
619 int cpu; 670 int cpu;
620 671
621 for_each_possible_cpu(cpu) 672 for_each_possible_cpu(cpu)
622 total += local_read(__module_ref_addr(mod, cpu)); 673 decs += per_cpu_ptr(mod->refptr, cpu)->decs;
623 return total; 674 /*
675 * ensure the incs are added up after the decs.
676 * module_put ensures incs are visible before decs with smp_wmb.
677 *
678 * This 2-count scheme avoids the situation where the refcount
679 * for CPU0 is read, then CPU0 increments the module refcount,
680 * then CPU1 drops that refcount, then the refcount for CPU1 is
681 * read. We would record a decrement but not its corresponding
682 * increment so we would see a low count (disaster).
683 *
684 * Rare situation? But module_refcount can be preempted, and we
685 * might be tallying up 4096+ CPUs. So it is not impossible.
686 */
687 smp_rmb();
688 for_each_possible_cpu(cpu)
689 incs += per_cpu_ptr(mod->refptr, cpu)->incs;
690 return incs - decs;
624} 691}
625EXPORT_SYMBOL(module_refcount); 692EXPORT_SYMBOL(module_refcount);
626 693
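The comment added above is the key to the incs/decs split in module_refcount(): the decrements are summed first and the increments second, with a barrier in between, so an increment on one CPU and its matching decrement on another cannot be split across the two scans and make the total dip below the true count. A condensed user-space sketch of the same two-counter idea using C11 atomics; NCPU, get_ref() and put_ref() are invented names, and the fences only approximate the kernel's smp_wmb()/smp_rmb() pairing.

#include <stdatomic.h>
#include <stdio.h>

#define NCPU 4

static _Atomic unsigned int incs[NCPU];
static _Atomic unsigned int decs[NCPU];

static void get_ref(int cpu)
{
    atomic_fetch_add(&incs[cpu], 1);
}

static void put_ref(int cpu)
{
    /* Make the earlier inc visible before the dec, as module_put() does. */
    atomic_thread_fence(memory_order_release);
    atomic_fetch_add(&decs[cpu], 1);
}

static unsigned int refcount(void)
{
    unsigned int i = 0, d = 0;
    int cpu;

    /* Sum the decs first... */
    for (cpu = 0; cpu < NCPU; cpu++)
        d += atomic_load(&decs[cpu]);

    /* ...then the incs, so no inc/dec pair is split across the scans. */
    atomic_thread_fence(memory_order_acquire);
    for (cpu = 0; cpu < NCPU; cpu++)
        i += atomic_load(&incs[cpu]);

    return i - d;
}

int main(void)
{
    get_ref(0);
    get_ref(1);
    put_ref(3);     /* reference dropped on a different CPU than it was taken */

    printf("refcount = %u\n", refcount());
    return 0;
}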
@@ -796,14 +863,16 @@ static struct module_attribute refcnt = {
796void module_put(struct module *module) 863void module_put(struct module *module)
797{ 864{
798 if (module) { 865 if (module) {
799 unsigned int cpu = get_cpu(); 866 preempt_disable();
800 local_dec(__module_ref_addr(module, cpu)); 867 smp_wmb(); /* see comment in module_refcount */
868 __this_cpu_inc(module->refptr->decs);
869
801 trace_module_put(module, _RET_IP_, 870 trace_module_put(module, _RET_IP_,
802 local_read(__module_ref_addr(module, cpu))); 871 __this_cpu_read(module->refptr->decs));
803 /* Maybe they're waiting for us to drop reference? */ 872 /* Maybe they're waiting for us to drop reference? */
804 if (unlikely(!module_is_live(module))) 873 if (unlikely(!module_is_live(module)))
805 wake_up_process(module->waiter); 874 wake_up_process(module->waiter);
806 put_cpu(); 875 preempt_enable();
807 } 876 }
808} 877}
809EXPORT_SYMBOL(module_put); 878EXPORT_SYMBOL(module_put);
@@ -1083,6 +1152,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1083 if (sattr->name == NULL) 1152 if (sattr->name == NULL)
1084 goto out; 1153 goto out;
1085 sect_attrs->nsections++; 1154 sect_attrs->nsections++;
1155 sysfs_attr_init(&sattr->mattr.attr);
1086 sattr->mattr.show = module_sect_show; 1156 sattr->mattr.show = module_sect_show;
1087 sattr->mattr.store = NULL; 1157 sattr->mattr.store = NULL;
1088 sattr->mattr.attr.name = sattr->name; 1158 sattr->mattr.attr.name = sattr->name;
@@ -1178,6 +1248,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1178 if (sect_empty(&sechdrs[i])) 1248 if (sect_empty(&sechdrs[i]))
1179 continue; 1249 continue;
1180 if (sechdrs[i].sh_type == SHT_NOTE) { 1250 if (sechdrs[i].sh_type == SHT_NOTE) {
1251 sysfs_bin_attr_init(nattr);
1181 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1252 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
1182 nattr->attr.mode = S_IRUGO; 1253 nattr->attr.mode = S_IRUGO;
1183 nattr->size = sechdrs[i].sh_size; 1254 nattr->size = sechdrs[i].sh_size;
@@ -1250,6 +1321,7 @@ int module_add_modinfo_attrs(struct module *mod)
1250 if (!attr->test || 1321 if (!attr->test ||
1251 (attr->test && attr->test(mod))) { 1322 (attr->test && attr->test(mod))) {
1252 memcpy(temp_attr, attr, sizeof(*temp_attr)); 1323 memcpy(temp_attr, attr, sizeof(*temp_attr));
1324 sysfs_attr_init(&temp_attr->attr);
1253 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); 1325 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
1254 ++temp_attr; 1326 ++temp_attr;
1255 } 1327 }
@@ -1395,11 +1467,10 @@ static void free_module(struct module *mod)
1395 /* This may be NULL, but that's OK */ 1467 /* This may be NULL, but that's OK */
1396 module_free(mod, mod->module_init); 1468 module_free(mod, mod->module_init);
1397 kfree(mod->args); 1469 kfree(mod->args);
1398 if (mod->percpu) 1470 percpu_modfree(mod);
1399 percpu_modfree(mod->percpu); 1471#if defined(CONFIG_MODULE_UNLOAD)
1400#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
1401 if (mod->refptr) 1472 if (mod->refptr)
1402 percpu_modfree(mod->refptr); 1473 free_percpu(mod->refptr);
1403#endif 1474#endif
1404 /* Free lock-classes: */ 1475 /* Free lock-classes: */
1405 lockdep_free_key_range(mod->module_core, mod->core_size); 1476 lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -1515,7 +1586,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1515 default: 1586 default:
1516 /* Divert to percpu allocation if a percpu var. */ 1587 /* Divert to percpu allocation if a percpu var. */
1517 if (sym[i].st_shndx == pcpuindex) 1588 if (sym[i].st_shndx == pcpuindex)
1518 secbase = (unsigned long)mod->percpu; 1589 secbase = (unsigned long)mod_percpu(mod);
1519 else 1590 else
1520 secbase = sechdrs[sym[i].st_shndx].sh_addr; 1591 secbase = sechdrs[sym[i].st_shndx].sh_addr;
1521 sym[i].st_value += secbase; 1592 sym[i].st_value += secbase;
@@ -1949,7 +2020,7 @@ static noinline struct module *load_module(void __user *umod,
1949 unsigned int modindex, versindex, infoindex, pcpuindex; 2020 unsigned int modindex, versindex, infoindex, pcpuindex;
1950 struct module *mod; 2021 struct module *mod;
1951 long err = 0; 2022 long err = 0;
1952 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 2023 void *ptr = NULL; /* Stops spurious gcc warning */
1953 unsigned long symoffs, stroffs, *strmap; 2024 unsigned long symoffs, stroffs, *strmap;
1954 2025
1955 mm_segment_t old_fs; 2026 mm_segment_t old_fs;
@@ -2089,15 +2160,11 @@ static noinline struct module *load_module(void __user *umod,
2089 2160
2090 if (pcpuindex) { 2161 if (pcpuindex) {
2091 /* We have a special allocation for this section. */ 2162 /* We have a special allocation for this section. */
2092 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, 2163 err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size,
2093 sechdrs[pcpuindex].sh_addralign, 2164 sechdrs[pcpuindex].sh_addralign);
2094 mod->name); 2165 if (err)
2095 if (!percpu) {
2096 err = -ENOMEM;
2097 goto free_mod; 2166 goto free_mod;
2098 }
2099 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2167 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2100 mod->percpu = percpu;
2101 } 2168 }
2102 2169
2103 /* Determine total sizes, and put offsets in sh_entsize. For now 2170 /* Determine total sizes, and put offsets in sh_entsize. For now
@@ -2162,9 +2229,8 @@ static noinline struct module *load_module(void __user *umod,
2162 mod = (void *)sechdrs[modindex].sh_addr; 2229 mod = (void *)sechdrs[modindex].sh_addr;
2163 kmemleak_load_module(mod, hdr, sechdrs, secstrings); 2230 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2164 2231
2165#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2232#if defined(CONFIG_MODULE_UNLOAD)
2166 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2233 mod->refptr = alloc_percpu(struct module_ref);
2167 mod->name);
2168 if (!mod->refptr) { 2234 if (!mod->refptr) {
2169 err = -ENOMEM; 2235 err = -ENOMEM;
2170 goto free_init; 2236 goto free_init;
@@ -2313,7 +2379,7 @@ static noinline struct module *load_module(void __user *umod,
2313 sort_extable(mod->extable, mod->extable + mod->num_exentries); 2379 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2314 2380
2315 /* Finally, copy percpu area over. */ 2381 /* Finally, copy percpu area over. */
2316 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 2382 percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
2317 sechdrs[pcpuindex].sh_size); 2383 sechdrs[pcpuindex].sh_size);
2318 2384
2319 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, 2385 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
@@ -2396,8 +2462,8 @@ static noinline struct module *load_module(void __user *umod,
2396 kobject_put(&mod->mkobj.kobj); 2462 kobject_put(&mod->mkobj.kobj);
2397 free_unload: 2463 free_unload:
2398 module_unload_free(mod); 2464 module_unload_free(mod);
2399#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2465#if defined(CONFIG_MODULE_UNLOAD)
2400 percpu_modfree(mod->refptr); 2466 free_percpu(mod->refptr);
2401 free_init: 2467 free_init:
2402#endif 2468#endif
2403 module_free(mod, mod->module_init); 2469 module_free(mod, mod->module_init);
@@ -2405,8 +2471,7 @@ static noinline struct module *load_module(void __user *umod,
2405 module_free(mod, mod->module_core); 2471 module_free(mod, mod->module_core);
2406 /* mod will be freed with core. Don't access it beyond this line! */ 2472 /* mod will be freed with core. Don't access it beyond this line! */
2407 free_percpu: 2473 free_percpu:
2408 if (percpu) 2474 percpu_modfree(mod);
2409 percpu_modfree(percpu);
2410 free_mod: 2475 free_mod:
2411 kfree(args); 2476 kfree(args);
2412 kfree(strmap); 2477 kfree(strmap);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index acd24e7643eb..2488ba7eb568 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -78,10 +78,10 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
78 int ret = NOTIFY_DONE; 78 int ret = NOTIFY_DONE;
79 struct notifier_block *nb, *next_nb; 79 struct notifier_block *nb, *next_nb;
80 80
81 nb = rcu_dereference(*nl); 81 nb = rcu_dereference_raw(*nl);
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference_raw(nb->next);
85 85
86#ifdef CONFIG_DEBUG_NOTIFIERS 86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { 87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
309 * racy then it does not matter what the result of the test 309 * racy then it does not matter what the result of the test
310 * is, we re-check the list after having taken the lock anyway: 310 * is, we re-check the list after having taken the lock anyway:
311 */ 311 */
312 if (rcu_dereference(nh->head)) { 312 if (rcu_dereference_raw(nh->head)) {
313 down_read(&nh->rwsem); 313 down_read(&nh->rwsem);
314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, 314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
315 nr_calls); 315 nr_calls);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 09b4ff9711b2..f74e6c00e26d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -13,6 +13,7 @@
13 * Pavel Emelianov <xemul@openvz.org> 13 * Pavel Emelianov <xemul@openvz.org>
14 */ 14 */
15 15
16#include <linux/slab.h>
16#include <linux/module.h> 17#include <linux/module.h>
17#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
18#include <linux/init_task.h> 19#include <linux/init_task.h>
@@ -24,7 +25,18 @@
24 25
25static struct kmem_cache *nsproxy_cachep; 26static struct kmem_cache *nsproxy_cachep;
26 27
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 28struct nsproxy init_nsproxy = {
29 .count = ATOMIC_INIT(1),
30 .uts_ns = &init_uts_ns,
31#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
32 .ipc_ns = &init_ipc_ns,
33#endif
34 .mnt_ns = NULL,
35 .pid_ns = &init_pid_ns,
36#ifdef CONFIG_NET
37 .net_ns = &init_net,
38#endif
39};
28 40
29static inline struct nsproxy *create_nsproxy(void) 41static inline struct nsproxy *create_nsproxy(void)
30{ 42{
diff --git a/kernel/padata.c b/kernel/padata.c
new file mode 100644
index 000000000000..fd03513c7327
--- /dev/null
+++ b/kernel/padata.c
@@ -0,0 +1,697 @@
1/*
2 * padata.c - generic interface to process data streams in parallel
3 *
4 * Copyright (C) 2008, 2009 secunet Security Networks AG
5 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
21#include <linux/module.h>
22#include <linux/cpumask.h>
23#include <linux/err.h>
24#include <linux/cpu.h>
25#include <linux/padata.h>
26#include <linux/mutex.h>
27#include <linux/sched.h>
28#include <linux/slab.h>
29#include <linux/rcupdate.h>
30
31#define MAX_SEQ_NR INT_MAX - NR_CPUS
32#define MAX_OBJ_NUM 10000 * NR_CPUS
33
34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
35{
36 int cpu, target_cpu;
37
38 target_cpu = cpumask_first(pd->cpumask);
39 for (cpu = 0; cpu < cpu_index; cpu++)
40 target_cpu = cpumask_next(target_cpu, pd->cpumask);
41
42 return target_cpu;
43}
44
45static int padata_cpu_hash(struct padata_priv *padata)
46{
47 int cpu_index;
48 struct parallel_data *pd;
49
50 pd = padata->pd;
51
52 /*
53 * Hash the sequence numbers to the cpus by taking
54 * seq_nr mod. number of cpus in use.
55 */
56 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask);
57
58 return padata_index_to_cpu(pd, cpu_index);
59}
60
61static void padata_parallel_worker(struct work_struct *work)
62{
63 struct padata_queue *queue;
64 struct parallel_data *pd;
65 struct padata_instance *pinst;
66 LIST_HEAD(local_list);
67
68 local_bh_disable();
69 queue = container_of(work, struct padata_queue, pwork);
70 pd = queue->pd;
71 pinst = pd->pinst;
72
73 spin_lock(&queue->parallel.lock);
74 list_replace_init(&queue->parallel.list, &local_list);
75 spin_unlock(&queue->parallel.lock);
76
77 while (!list_empty(&local_list)) {
78 struct padata_priv *padata;
79
80 padata = list_entry(local_list.next,
81 struct padata_priv, list);
82
83 list_del_init(&padata->list);
84
85 padata->parallel(padata);
86 }
87
88 local_bh_enable();
89}
90
91/*
92 * padata_do_parallel - padata parallelization function
93 *
94 * @pinst: padata instance
95 * @padata: object to be parallelized
96 * @cb_cpu: cpu the serialization callback function will run on,
97 * must be in the cpumask of padata.
98 *
99 * The parallelization callback function will run with BHs off.
100 * Note: Every object which is parallelized by padata_do_parallel
101 * must be seen by padata_do_serial.
102 */
103int padata_do_parallel(struct padata_instance *pinst,
104 struct padata_priv *padata, int cb_cpu)
105{
106 int target_cpu, err;
107 struct padata_queue *queue;
108 struct parallel_data *pd;
109
110 rcu_read_lock_bh();
111
112 pd = rcu_dereference(pinst->pd);
113
114 err = 0;
115 if (!(pinst->flags & PADATA_INIT))
116 goto out;
117
118 err = -EBUSY;
119 if ((pinst->flags & PADATA_RESET))
120 goto out;
121
122 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
123 goto out;
124
125 err = -EINVAL;
126 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
127 goto out;
128
129 err = -EINPROGRESS;
130 atomic_inc(&pd->refcnt);
131 padata->pd = pd;
132 padata->cb_cpu = cb_cpu;
133
134 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
135 atomic_set(&pd->seq_nr, -1);
136
137 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
138
139 target_cpu = padata_cpu_hash(padata);
140 queue = per_cpu_ptr(pd->queue, target_cpu);
141
142 spin_lock(&queue->parallel.lock);
143 list_add_tail(&padata->list, &queue->parallel.list);
144 spin_unlock(&queue->parallel.lock);
145
146 queue_work_on(target_cpu, pinst->wq, &queue->pwork);
147
148out:
149 rcu_read_unlock_bh();
150
151 return err;
152}
153EXPORT_SYMBOL(padata_do_parallel);
154
155static struct padata_priv *padata_get_next(struct parallel_data *pd)
156{
157 int cpu, num_cpus, empty, calc_seq_nr;
158 int seq_nr, next_nr, overrun, next_overrun;
159 struct padata_queue *queue, *next_queue;
160 struct padata_priv *padata;
161 struct padata_list *reorder;
162
163 empty = 0;
164 next_nr = -1;
165 next_overrun = 0;
166 next_queue = NULL;
167
168 num_cpus = cpumask_weight(pd->cpumask);
169
170 for_each_cpu(cpu, pd->cpumask) {
171 queue = per_cpu_ptr(pd->queue, cpu);
172 reorder = &queue->reorder;
173
174 /*
175 * Calculate the seq_nr of the object that should be
176 * next in this queue.
177 */
178 overrun = 0;
179 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
180 + queue->cpu_index;
181
182 if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
183 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
184 overrun = 1;
185 }
186
187 if (!list_empty(&reorder->list)) {
188 padata = list_entry(reorder->list.next,
189 struct padata_priv, list);
190
191 seq_nr = padata->seq_nr;
192 BUG_ON(calc_seq_nr != seq_nr);
193 } else {
194 seq_nr = calc_seq_nr;
195 empty++;
196 }
197
198 if (next_nr < 0 || seq_nr < next_nr
199 || (next_overrun && !overrun)) {
200 next_nr = seq_nr;
201 next_overrun = overrun;
202 next_queue = queue;
203 }
204 }
205
206 padata = NULL;
207
208 if (empty == num_cpus)
209 goto out;
210
211 reorder = &next_queue->reorder;
212
213 if (!list_empty(&reorder->list)) {
214 padata = list_entry(reorder->list.next,
215 struct padata_priv, list);
216
217 if (unlikely(next_overrun)) {
218 for_each_cpu(cpu, pd->cpumask) {
219 queue = per_cpu_ptr(pd->queue, cpu);
220 atomic_set(&queue->num_obj, 0);
221 }
222 }
223
224 spin_lock(&reorder->lock);
225 list_del_init(&padata->list);
226 atomic_dec(&pd->reorder_objects);
227 spin_unlock(&reorder->lock);
228
229 atomic_inc(&next_queue->num_obj);
230
231 goto out;
232 }
233
234 if (next_nr % num_cpus == next_queue->cpu_index) {
235 padata = ERR_PTR(-ENODATA);
236 goto out;
237 }
238
239 padata = ERR_PTR(-EINPROGRESS);
240out:
241 return padata;
242}
243
244static void padata_reorder(struct parallel_data *pd)
245{
246 struct padata_priv *padata;
247 struct padata_queue *queue;
248 struct padata_instance *pinst = pd->pinst;
249
250try_again:
251 if (!spin_trylock_bh(&pd->lock))
252 goto out;
253
254 while (1) {
255 padata = padata_get_next(pd);
256
257 if (!padata || PTR_ERR(padata) == -EINPROGRESS)
258 break;
259
260 if (PTR_ERR(padata) == -ENODATA) {
261 spin_unlock_bh(&pd->lock);
262 goto out;
263 }
264
265 queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
266
267 spin_lock(&queue->serial.lock);
268 list_add_tail(&padata->list, &queue->serial.list);
269 spin_unlock(&queue->serial.lock);
270
271 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork);
272 }
273
274 spin_unlock_bh(&pd->lock);
275
276 if (atomic_read(&pd->reorder_objects))
277 goto try_again;
278
279out:
280 return;
281}
282
283static void padata_serial_worker(struct work_struct *work)
284{
285 struct padata_queue *queue;
286 struct parallel_data *pd;
287 LIST_HEAD(local_list);
288
289 local_bh_disable();
290 queue = container_of(work, struct padata_queue, swork);
291 pd = queue->pd;
292
293 spin_lock(&queue->serial.lock);
294 list_replace_init(&queue->serial.list, &local_list);
295 spin_unlock(&queue->serial.lock);
296
297 while (!list_empty(&local_list)) {
298 struct padata_priv *padata;
299
300 padata = list_entry(local_list.next,
301 struct padata_priv, list);
302
303 list_del_init(&padata->list);
304
305 padata->serial(padata);
306 atomic_dec(&pd->refcnt);
307 }
308 local_bh_enable();
309}
310
311/*
312 * padata_do_serial - padata serialization function
313 *
314 * @padata: object to be serialized.
315 *
316 * padata_do_serial must be called for every parallelized object.
317 * The serialization callback function will run with BHs off.
318 */
319void padata_do_serial(struct padata_priv *padata)
320{
321 int cpu;
322 struct padata_queue *queue;
323 struct parallel_data *pd;
324
325 pd = padata->pd;
326
327 cpu = get_cpu();
328 queue = per_cpu_ptr(pd->queue, cpu);
329
330 spin_lock(&queue->reorder.lock);
331 atomic_inc(&pd->reorder_objects);
332 list_add_tail(&padata->list, &queue->reorder.list);
333 spin_unlock(&queue->reorder.lock);
334
335 put_cpu();
336
337 padata_reorder(pd);
338}
339EXPORT_SYMBOL(padata_do_serial);
340
341static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
342 const struct cpumask *cpumask)
343{
344 int cpu, cpu_index, num_cpus;
345 struct padata_queue *queue;
346 struct parallel_data *pd;
347
348 cpu_index = 0;
349
350 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
351 if (!pd)
352 goto err;
353
354 pd->queue = alloc_percpu(struct padata_queue);
355 if (!pd->queue)
356 goto err_free_pd;
357
358 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
359 goto err_free_queue;
360
361 for_each_possible_cpu(cpu) {
362 queue = per_cpu_ptr(pd->queue, cpu);
363
364 queue->pd = pd;
365
366 if (cpumask_test_cpu(cpu, cpumask)
367 && cpumask_test_cpu(cpu, cpu_active_mask)) {
368 queue->cpu_index = cpu_index;
369 cpu_index++;
370 } else
371 queue->cpu_index = -1;
372
373 INIT_LIST_HEAD(&queue->reorder.list);
374 INIT_LIST_HEAD(&queue->parallel.list);
375 INIT_LIST_HEAD(&queue->serial.list);
376 spin_lock_init(&queue->reorder.lock);
377 spin_lock_init(&queue->parallel.lock);
378 spin_lock_init(&queue->serial.lock);
379
380 INIT_WORK(&queue->pwork, padata_parallel_worker);
381 INIT_WORK(&queue->swork, padata_serial_worker);
382 atomic_set(&queue->num_obj, 0);
383 }
384
385 cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
386
387 num_cpus = cpumask_weight(pd->cpumask);
388 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
389
390 atomic_set(&pd->seq_nr, -1);
391 atomic_set(&pd->reorder_objects, 0);
392 atomic_set(&pd->refcnt, 0);
393 pd->pinst = pinst;
394 spin_lock_init(&pd->lock);
395
396 return pd;
397
398err_free_queue:
399 free_percpu(pd->queue);
400err_free_pd:
401 kfree(pd);
402err:
403 return NULL;
404}
405
406static void padata_free_pd(struct parallel_data *pd)
407{
408 free_cpumask_var(pd->cpumask);
409 free_percpu(pd->queue);
410 kfree(pd);
411}
412
413static void padata_replace(struct padata_instance *pinst,
414 struct parallel_data *pd_new)
415{
416 struct parallel_data *pd_old = pinst->pd;
417
418 pinst->flags |= PADATA_RESET;
419
420 rcu_assign_pointer(pinst->pd, pd_new);
421
422 synchronize_rcu();
423
424 while (atomic_read(&pd_old->refcnt) != 0)
425 yield();
426
427 flush_workqueue(pinst->wq);
428
429 padata_free_pd(pd_old);
430
431 pinst->flags &= ~PADATA_RESET;
432}
433
434/*
435 * padata_set_cpumask - set the cpumask that padata should use
436 *
437 * @pinst: padata instance
438 * @cpumask: the cpumask to use
439 */
440int padata_set_cpumask(struct padata_instance *pinst,
441 cpumask_var_t cpumask)
442{
443 struct parallel_data *pd;
444 int err = 0;
445
446 might_sleep();
447
448 mutex_lock(&pinst->lock);
449
450 pd = padata_alloc_pd(pinst, cpumask);
451 if (!pd) {
452 err = -ENOMEM;
453 goto out;
454 }
455
456 cpumask_copy(pinst->cpumask, cpumask);
457
458 padata_replace(pinst, pd);
459
460out:
461 mutex_unlock(&pinst->lock);
462
463 return err;
464}
465EXPORT_SYMBOL(padata_set_cpumask);
466
467static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
468{
469 struct parallel_data *pd;
470
471 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
472 pd = padata_alloc_pd(pinst, pinst->cpumask);
473 if (!pd)
474 return -ENOMEM;
475
476 padata_replace(pinst, pd);
477 }
478
479 return 0;
480}
481
482/*
483 * padata_add_cpu - add a cpu to the padata cpumask
484 *
485 * @pinst: padata instance
486 * @cpu: cpu to add
487 */
488int padata_add_cpu(struct padata_instance *pinst, int cpu)
489{
490 int err;
491
492 might_sleep();
493
494 mutex_lock(&pinst->lock);
495
496 cpumask_set_cpu(cpu, pinst->cpumask);
497 err = __padata_add_cpu(pinst, cpu);
498
499 mutex_unlock(&pinst->lock);
500
501 return err;
502}
503EXPORT_SYMBOL(padata_add_cpu);
504
505static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
506{
507 struct parallel_data *pd;
508
509 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
510 pd = padata_alloc_pd(pinst, pinst->cpumask);
511 if (!pd)
512 return -ENOMEM;
513
514 padata_replace(pinst, pd);
515 }
516
517 return 0;
518}
519
520/*
521 * padata_remove_cpu - remove a cpu from the padata cpumask
522 *
523 * @pinst: padata instance
524 * @cpu: cpu to remove
525 */
526int padata_remove_cpu(struct padata_instance *pinst, int cpu)
527{
528 int err;
529
530 might_sleep();
531
532 mutex_lock(&pinst->lock);
533
534 cpumask_clear_cpu(cpu, pinst->cpumask);
535 err = __padata_remove_cpu(pinst, cpu);
536
537 mutex_unlock(&pinst->lock);
538
539 return err;
540}
541EXPORT_SYMBOL(padata_remove_cpu);
542
543/*
544 * padata_start - start the parallel processing
545 *
546 * @pinst: padata instance to start
547 */
548void padata_start(struct padata_instance *pinst)
549{
550 might_sleep();
551
552 mutex_lock(&pinst->lock);
553 pinst->flags |= PADATA_INIT;
554 mutex_unlock(&pinst->lock);
555}
556EXPORT_SYMBOL(padata_start);
557
558/*
559 * padata_stop - stop the parallel processing
560 *
561 * @pinst: padata instance to stop
562 */
563void padata_stop(struct padata_instance *pinst)
564{
565 might_sleep();
566
567 mutex_lock(&pinst->lock);
568 pinst->flags &= ~PADATA_INIT;
569 mutex_unlock(&pinst->lock);
570}
571EXPORT_SYMBOL(padata_stop);
572
573static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
574 unsigned long action, void *hcpu)
575{
576 int err;
577 struct padata_instance *pinst;
578 int cpu = (unsigned long)hcpu;
579
580 pinst = container_of(nfb, struct padata_instance, cpu_notifier);
581
582 switch (action) {
583 case CPU_ONLINE:
584 case CPU_ONLINE_FROZEN:
585 if (!cpumask_test_cpu(cpu, pinst->cpumask))
586 break;
587 mutex_lock(&pinst->lock);
588 err = __padata_add_cpu(pinst, cpu);
589 mutex_unlock(&pinst->lock);
590 if (err)
591 return NOTIFY_BAD;
592 break;
593
594 case CPU_DOWN_PREPARE:
595 case CPU_DOWN_PREPARE_FROZEN:
596 if (!cpumask_test_cpu(cpu, pinst->cpumask))
597 break;
598 mutex_lock(&pinst->lock);
599 err = __padata_remove_cpu(pinst, cpu);
600 mutex_unlock(&pinst->lock);
601 if (err)
602 return NOTIFY_BAD;
603 break;
604
605 case CPU_UP_CANCELED:
606 case CPU_UP_CANCELED_FROZEN:
607 if (!cpumask_test_cpu(cpu, pinst->cpumask))
608 break;
609 mutex_lock(&pinst->lock);
610 __padata_remove_cpu(pinst, cpu);
611 mutex_unlock(&pinst->lock);
612
613 case CPU_DOWN_FAILED:
614 case CPU_DOWN_FAILED_FROZEN:
615 if (!cpumask_test_cpu(cpu, pinst->cpumask))
616 break;
617 mutex_lock(&pinst->lock);
618 __padata_add_cpu(pinst, cpu);
619 mutex_unlock(&pinst->lock);
620 }
621
622 return NOTIFY_OK;
623}
624
625/*
626 * padata_alloc - allocate and initialize a padata instance
627 *
628 * @cpumask: cpumask that padata uses for parallelization
629 * @wq: workqueue to use for the allocated padata instance
630 */
631struct padata_instance *padata_alloc(const struct cpumask *cpumask,
632 struct workqueue_struct *wq)
633{
634 int err;
635 struct padata_instance *pinst;
636 struct parallel_data *pd;
637
638 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
639 if (!pinst)
640 goto err;
641
642 pd = padata_alloc_pd(pinst, cpumask);
643 if (!pd)
644 goto err_free_inst;
645
646 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
647 goto err_free_pd;
648
649 rcu_assign_pointer(pinst->pd, pd);
650
651 pinst->wq = wq;
652
653 cpumask_copy(pinst->cpumask, cpumask);
654
655 pinst->flags = 0;
656
657 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
658 pinst->cpu_notifier.priority = 0;
659 err = register_hotcpu_notifier(&pinst->cpu_notifier);
660 if (err)
661 goto err_free_cpumask;
662
663 mutex_init(&pinst->lock);
664
665 return pinst;
666
667err_free_cpumask:
668 free_cpumask_var(pinst->cpumask);
669err_free_pd:
670 padata_free_pd(pd);
671err_free_inst:
672 kfree(pinst);
673err:
674 return NULL;
675}
676EXPORT_SYMBOL(padata_alloc);
677
678/*
679 * padata_free - free a padata instance
680 *
 681 * @pinst: padata instance to free
682 */
683void padata_free(struct padata_instance *pinst)
684{
685 padata_stop(pinst);
686
687 synchronize_rcu();
688
689 while (atomic_read(&pinst->pd->refcnt) != 0)
690 yield();
691
692 unregister_hotcpu_notifier(&pinst->cpu_notifier);
693 padata_free_pd(pinst->pd);
694 free_cpumask_var(pinst->cpumask);
695 kfree(pinst);
696}
697EXPORT_SYMBOL(padata_free);
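
The exported padata entry points above (padata_alloc, padata_start, padata_do_parallel, padata_do_serial) are meant to be driven by a client such as crypto/pcrypt. The sketch below shows the calling pattern implied by the interface; the job structure, workqueue name and callbacks are hypothetical and error handling is kept minimal. Note that -EINPROGRESS from padata_do_parallel() is the success case (the object was queued), and that every queued object must eventually be handed to padata_do_serial(), here from the parallel callback once the work is finished.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/cpumask.h>
#include <linux/workqueue.h>
#include <linux/padata.h>

struct my_job {
	struct padata_priv padata;	/* must be embedded in the object */
	int input;
	int result;
};

static struct padata_instance *pinst;
static struct workqueue_struct *my_wq;

static void my_parallel(struct padata_priv *padata)
{
	struct my_job *job = container_of(padata, struct my_job, padata);

	job->result = job->input * 2;	/* the expensive part, runs with BHs off */
	padata_do_serial(padata);	/* hand back for in-order completion */
}

static void my_serial(struct padata_priv *padata)
{
	struct my_job *job = container_of(padata, struct my_job, padata);

	pr_info("job %d done: %d\n", job->input, job->result);
	kfree(job);
}

static int my_submit(int value)
{
	struct my_job *job;
	int cb_cpu, err;

	job = kzalloc(sizeof(*job), GFP_KERNEL);
	if (!job)
		return -ENOMEM;

	job->input = value;
	job->padata.parallel = my_parallel;
	job->padata.serial = my_serial;

	/* cb_cpu must be in the instance's cpumask. */
	cb_cpu = cpumask_first(cpu_online_mask);
	err = padata_do_parallel(pinst, &job->padata, cb_cpu);
	if (err != -EINPROGRESS) {	/* -EINPROGRESS means "queued" */
		kfree(job);
		return err;
	}
	return 0;
}

static int __init my_padata_init(void)
{
	my_wq = create_workqueue("my_padata");
	if (!my_wq)
		return -ENOMEM;

	pinst = padata_alloc(cpu_possible_mask, my_wq);
	if (!pinst) {
		destroy_workqueue(my_wq);
		return -ENOMEM;
	}

	padata_start(pinst);
	return my_submit(21);
}

static void __exit my_padata_exit(void)
{
	padata_free(pinst);	/* stops, drains and frees the instance */
	destroy_workqueue(my_wq);
}

module_init(my_padata_init);
module_exit(my_padata_exit);
MODULE_LICENSE("GPL");

Because each object is assigned a sequence number in padata_do_parallel() and the reorder logic above replays them in that order, my_serial() sees completions in submission order even though my_parallel() calls run concurrently on different CPUs.
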
diff --git a/kernel/panic.c b/kernel/panic.c
index c787333282b8..13d966b4c14a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -36,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
36 36
37EXPORT_SYMBOL(panic_notifier_list); 37EXPORT_SYMBOL(panic_notifier_list);
38 38
39static long no_blink(long time)
40{
41 return 0;
42}
43
44/* Returns how long it waited in ms */ 39/* Returns how long it waited in ms */
45long (*panic_blink)(long time); 40long (*panic_blink)(long time);
46EXPORT_SYMBOL(panic_blink); 41EXPORT_SYMBOL(panic_blink);
47 42
43static void panic_blink_one_second(void)
44{
45 static long i = 0, end;
46
47 if (panic_blink) {
48 end = i + MSEC_PER_SEC;
49
50 while (i < end) {
51 i += panic_blink(i);
52 mdelay(1);
53 i++;
54 }
55 } else {
56 /*
57 * When running under a hypervisor a small mdelay may get
58 * rounded up to the hypervisor timeslice. For example, with
59 * a 1ms in 10ms hypervisor timeslice we might inflate a
60 * mdelay(1) loop by 10x.
61 *
62 * If we have nothing to blink, spin on 1 second calls to
63 * mdelay to avoid this.
64 */
65 mdelay(MSEC_PER_SEC);
66 }
67}
68
48/** 69/**
49 * panic - halt the system 70 * panic - halt the system
50 * @fmt: The text string to print 71 * @fmt: The text string to print
@@ -95,9 +116,6 @@ NORET_TYPE void panic(const char * fmt, ...)
95 116
96 bust_spinlocks(0); 117 bust_spinlocks(0);
97 118
98 if (!panic_blink)
99 panic_blink = no_blink;
100
101 if (panic_timeout > 0) { 119 if (panic_timeout > 0) {
102 /* 120 /*
103 * Delay timeout seconds before rebooting the machine. 121 * Delay timeout seconds before rebooting the machine.
@@ -105,11 +123,9 @@ NORET_TYPE void panic(const char * fmt, ...)
105 */ 123 */
106 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 124 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
107 125
108 for (i = 0; i < panic_timeout*1000; ) { 126 for (i = 0; i < panic_timeout; i++) {
109 touch_nmi_watchdog(); 127 touch_nmi_watchdog();
110 i += panic_blink(i); 128 panic_blink_one_second();
111 mdelay(1);
112 i++;
113 } 129 }
114 /* 130 /*
115 * This will not be a clean reboot, with everything 131 * This will not be a clean reboot, with everything
@@ -135,11 +151,9 @@ NORET_TYPE void panic(const char * fmt, ...)
135 } 151 }
136#endif 152#endif
137 local_irq_enable(); 153 local_irq_enable();
138 for (i = 0; ; ) { 154 while (1) {
139 touch_softlockup_watchdog(); 155 touch_softlockup_watchdog();
140 i += panic_blink(i); 156 panic_blink_one_second();
141 mdelay(1);
142 i++;
143 } 157 }
144} 158}
145 159
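
The new comment makes the cost concrete: with the old per-millisecond loop a panic_timeout of, say, 30 seconds meant 30 * 1000 calls to mdelay(1); if a hypervisor rounds each of those up to a 10 ms timeslice, the nominal 30 s delay stretches to roughly 300 s. Spinning in whole-second mdelay(MSEC_PER_SEC) chunks when there is no blink callback keeps the wait close to the configured 30 s, while panic_blink_one_second() preserves the old 1 ms blink cadence for machines that do have an LED to drive.
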
diff --git a/kernel/params.c b/kernel/params.c
index cf1b69183127..0b30ecd53a52 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,7 +24,6 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27#include <linux/string.h>
28 27
29#if 0 28#if 0
30#define DEBUGP printk 29#define DEBUGP printk
@@ -402,8 +401,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
402} 401}
403 402
404/* sysfs output in /sys/modules/XYZ/parameters/ */ 403/* sysfs output in /sys/modules/XYZ/parameters/ */
405#define to_module_attr(n) container_of(n, struct module_attribute, attr); 404#define to_module_attr(n) container_of(n, struct module_attribute, attr)
406#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); 405#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
407 406
408extern struct kernel_param __start___param[], __stop___param[]; 407extern struct kernel_param __start___param[], __stop___param[];
409 408
@@ -421,7 +420,7 @@ struct module_param_attrs
421}; 420};
422 421
423#ifdef CONFIG_SYSFS 422#ifdef CONFIG_SYSFS
424#define to_param_attr(n) container_of(n, struct param_attribute, mattr); 423#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
425 424
426static ssize_t param_attr_show(struct module_attribute *mattr, 425static ssize_t param_attr_show(struct module_attribute *mattr,
427 struct module *mod, char *buf) 426 struct module *mod, char *buf)
@@ -517,6 +516,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
517 new->grp.attrs = attrs; 516 new->grp.attrs = attrs;
518 517
519 /* Tack new one on the end. */ 518 /* Tack new one on the end. */
519 sysfs_attr_init(&new->attrs[num].mattr.attr);
520 new->attrs[num].param = kp; 520 new->attrs[num].param = kp;
521 new->attrs[num].mattr.show = param_attr_show; 521 new->attrs[num].mattr.show = param_attr_show;
522 new->attrs[num].mattr.store = param_attr_store; 522 new->attrs[num].mattr.store = param_attr_store;
@@ -723,7 +723,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
723 return ret; 723 return ret;
724} 724}
725 725
726static struct sysfs_ops module_sysfs_ops = { 726static const struct sysfs_ops module_sysfs_ops = {
727 .show = module_attr_show, 727 .show = module_attr_show,
728 .store = module_attr_store, 728 .store = module_attr_store,
729}; 729};
@@ -737,7 +737,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj)
737 return 0; 737 return 0;
738} 738}
739 739
740static struct kset_uevent_ops module_uevent_ops = { 740static const struct kset_uevent_ops module_uevent_ops = {
741 .filter = uevent_filter, 741 .filter = uevent_filter,
742}; 742};
743 743
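
Dropping the trailing semicolons from to_module_attr(), to_module_kobject() and to_param_attr() fixes a classic macro pitfall: the stray ';' is invisible while the macro is only ever the entire right-hand side of an assignment, but it breaks compilation as soon as the macro is used inside a larger expression. A generic illustration in plain C, with hypothetical structures rather than the params.c ones:

#include <stddef.h>

struct attr_demo { int x; };
struct holder    { int id; struct attr_demo attr; };

#define demo_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Broken: the trailing ';' becomes part of every expansion. */
#define to_holder_bad(n)	demo_container_of(n, struct holder, attr);
/* Fixed, as in the params.c change: no trailing ';'. */
#define to_holder_good(n)	demo_container_of(n, struct holder, attr)

int holder_id(struct attr_demo *a)
{
	/* return to_holder_bad(a)->id;  would not compile: "...); ->id" */
	return to_holder_good(a)->id;
}
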
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 087025fe3ba1..3d1552d3c12b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -15,6 +15,7 @@
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/file.h> 16#include <linux/file.h>
17#include <linux/poll.h> 17#include <linux/poll.h>
18#include <linux/slab.h>
18#include <linux/sysfs.h> 19#include <linux/sysfs.h>
19#include <linux/dcache.h> 20#include <linux/dcache.h>
20#include <linux/percpu.h> 21#include <linux/percpu.h>
@@ -56,21 +57,6 @@ static atomic_t nr_task_events __read_mostly;
56 */ 57 */
57int sysctl_perf_event_paranoid __read_mostly = 1; 58int sysctl_perf_event_paranoid __read_mostly = 1;
58 59
59static inline bool perf_paranoid_tracepoint_raw(void)
60{
61 return sysctl_perf_event_paranoid > -1;
62}
63
64static inline bool perf_paranoid_cpu(void)
65{
66 return sysctl_perf_event_paranoid > 0;
67}
68
69static inline bool perf_paranoid_kernel(void)
70{
71 return sysctl_perf_event_paranoid > 1;
72}
73
74int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 60int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
75 61
76/* 62/*
@@ -96,13 +82,10 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
96void __weak hw_perf_disable(void) { barrier(); } 82void __weak hw_perf_disable(void) { barrier(); }
97void __weak hw_perf_enable(void) { barrier(); } 83void __weak hw_perf_enable(void) { barrier(); }
98 84
99void __weak hw_perf_event_setup(int cpu) { barrier(); }
100void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
101
102int __weak 85int __weak
103hw_perf_group_sched_in(struct perf_event *group_leader, 86hw_perf_group_sched_in(struct perf_event *group_leader,
104 struct perf_cpu_context *cpuctx, 87 struct perf_cpu_context *cpuctx,
105 struct perf_event_context *ctx, int cpu) 88 struct perf_event_context *ctx)
106{ 89{
107 return 0; 90 return 0;
108} 91}
@@ -111,25 +94,15 @@ void __weak perf_event_print_debug(void) { }
111 94
112static DEFINE_PER_CPU(int, perf_disable_count); 95static DEFINE_PER_CPU(int, perf_disable_count);
113 96
114void __perf_disable(void)
115{
116 __get_cpu_var(perf_disable_count)++;
117}
118
119bool __perf_enable(void)
120{
121 return !--__get_cpu_var(perf_disable_count);
122}
123
124void perf_disable(void) 97void perf_disable(void)
125{ 98{
126 __perf_disable(); 99 if (!__get_cpu_var(perf_disable_count)++)
127 hw_perf_disable(); 100 hw_perf_disable();
128} 101}
129 102
130void perf_enable(void) 103void perf_enable(void)
131{ 104{
132 if (__perf_enable()) 105 if (!--__get_cpu_var(perf_disable_count))
133 hw_perf_enable(); 106 hw_perf_enable();
134} 107}
135 108
@@ -248,7 +221,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
248 221
249static inline u64 perf_clock(void) 222static inline u64 perf_clock(void)
250{ 223{
251 return cpu_clock(smp_processor_id()); 224 return cpu_clock(raw_smp_processor_id());
252} 225}
253 226
254/* 227/*
@@ -632,14 +605,13 @@ void perf_event_disable(struct perf_event *event)
632static int 605static int
633event_sched_in(struct perf_event *event, 606event_sched_in(struct perf_event *event,
634 struct perf_cpu_context *cpuctx, 607 struct perf_cpu_context *cpuctx,
635 struct perf_event_context *ctx, 608 struct perf_event_context *ctx)
636 int cpu)
637{ 609{
638 if (event->state <= PERF_EVENT_STATE_OFF) 610 if (event->state <= PERF_EVENT_STATE_OFF)
639 return 0; 611 return 0;
640 612
641 event->state = PERF_EVENT_STATE_ACTIVE; 613 event->state = PERF_EVENT_STATE_ACTIVE;
642 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ 614 event->oncpu = smp_processor_id();
643 /* 615 /*
644 * The new state must be visible before we turn it on in the hardware: 616 * The new state must be visible before we turn it on in the hardware:
645 */ 617 */
@@ -666,8 +638,7 @@ event_sched_in(struct perf_event *event,
666static int 638static int
667group_sched_in(struct perf_event *group_event, 639group_sched_in(struct perf_event *group_event,
668 struct perf_cpu_context *cpuctx, 640 struct perf_cpu_context *cpuctx,
669 struct perf_event_context *ctx, 641 struct perf_event_context *ctx)
670 int cpu)
671{ 642{
672 struct perf_event *event, *partial_group; 643 struct perf_event *event, *partial_group;
673 int ret; 644 int ret;
@@ -675,18 +646,18 @@ group_sched_in(struct perf_event *group_event,
675 if (group_event->state == PERF_EVENT_STATE_OFF) 646 if (group_event->state == PERF_EVENT_STATE_OFF)
676 return 0; 647 return 0;
677 648
678 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); 649 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
679 if (ret) 650 if (ret)
680 return ret < 0 ? ret : 0; 651 return ret < 0 ? ret : 0;
681 652
682 if (event_sched_in(group_event, cpuctx, ctx, cpu)) 653 if (event_sched_in(group_event, cpuctx, ctx))
683 return -EAGAIN; 654 return -EAGAIN;
684 655
685 /* 656 /*
686 * Schedule in siblings as one group (if any): 657 * Schedule in siblings as one group (if any):
687 */ 658 */
688 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 659 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
689 if (event_sched_in(event, cpuctx, ctx, cpu)) { 660 if (event_sched_in(event, cpuctx, ctx)) {
690 partial_group = event; 661 partial_group = event;
691 goto group_error; 662 goto group_error;
692 } 663 }
@@ -760,7 +731,6 @@ static void __perf_install_in_context(void *info)
760 struct perf_event *event = info; 731 struct perf_event *event = info;
761 struct perf_event_context *ctx = event->ctx; 732 struct perf_event_context *ctx = event->ctx;
762 struct perf_event *leader = event->group_leader; 733 struct perf_event *leader = event->group_leader;
763 int cpu = smp_processor_id();
764 int err; 734 int err;
765 735
766 /* 736 /*
@@ -807,7 +777,7 @@ static void __perf_install_in_context(void *info)
807 if (!group_can_go_on(event, cpuctx, 1)) 777 if (!group_can_go_on(event, cpuctx, 1))
808 err = -EEXIST; 778 err = -EEXIST;
809 else 779 else
810 err = event_sched_in(event, cpuctx, ctx, cpu); 780 err = event_sched_in(event, cpuctx, ctx);
811 781
812 if (err) { 782 if (err) {
813 /* 783 /*
@@ -949,11 +919,9 @@ static void __perf_event_enable(void *info)
949 } else { 919 } else {
950 perf_disable(); 920 perf_disable();
951 if (event == leader) 921 if (event == leader)
952 err = group_sched_in(event, cpuctx, ctx, 922 err = group_sched_in(event, cpuctx, ctx);
953 smp_processor_id());
954 else 923 else
955 err = event_sched_in(event, cpuctx, ctx, 924 err = event_sched_in(event, cpuctx, ctx);
956 smp_processor_id());
957 perf_enable(); 925 perf_enable();
958 } 926 }
959 927
@@ -1197,11 +1165,9 @@ void perf_event_task_sched_out(struct task_struct *task,
1197 struct perf_event_context *ctx = task->perf_event_ctxp; 1165 struct perf_event_context *ctx = task->perf_event_ctxp;
1198 struct perf_event_context *next_ctx; 1166 struct perf_event_context *next_ctx;
1199 struct perf_event_context *parent; 1167 struct perf_event_context *parent;
1200 struct pt_regs *regs;
1201 int do_switch = 1; 1168 int do_switch = 1;
1202 1169
1203 regs = task_pt_regs(task); 1170 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1204 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1205 1171
1206 if (likely(!ctx || !cpuctx->task_ctx)) 1172 if (likely(!ctx || !cpuctx->task_ctx))
1207 return; 1173 return;
@@ -1280,19 +1246,18 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1280 1246
1281static void 1247static void
1282ctx_pinned_sched_in(struct perf_event_context *ctx, 1248ctx_pinned_sched_in(struct perf_event_context *ctx,
1283 struct perf_cpu_context *cpuctx, 1249 struct perf_cpu_context *cpuctx)
1284 int cpu)
1285{ 1250{
1286 struct perf_event *event; 1251 struct perf_event *event;
1287 1252
1288 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 1253 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1289 if (event->state <= PERF_EVENT_STATE_OFF) 1254 if (event->state <= PERF_EVENT_STATE_OFF)
1290 continue; 1255 continue;
1291 if (event->cpu != -1 && event->cpu != cpu) 1256 if (event->cpu != -1 && event->cpu != smp_processor_id())
1292 continue; 1257 continue;
1293 1258
1294 if (group_can_go_on(event, cpuctx, 1)) 1259 if (group_can_go_on(event, cpuctx, 1))
1295 group_sched_in(event, cpuctx, ctx, cpu); 1260 group_sched_in(event, cpuctx, ctx);
1296 1261
1297 /* 1262 /*
1298 * If this pinned group hasn't been scheduled, 1263 * If this pinned group hasn't been scheduled,
@@ -1307,8 +1272,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1307 1272
1308static void 1273static void
1309ctx_flexible_sched_in(struct perf_event_context *ctx, 1274ctx_flexible_sched_in(struct perf_event_context *ctx,
1310 struct perf_cpu_context *cpuctx, 1275 struct perf_cpu_context *cpuctx)
1311 int cpu)
1312{ 1276{
1313 struct perf_event *event; 1277 struct perf_event *event;
1314 int can_add_hw = 1; 1278 int can_add_hw = 1;
@@ -1321,11 +1285,11 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1321 * Listen to the 'cpu' scheduling filter constraint 1285 * Listen to the 'cpu' scheduling filter constraint
1322 * of events: 1286 * of events:
1323 */ 1287 */
1324 if (event->cpu != -1 && event->cpu != cpu) 1288 if (event->cpu != -1 && event->cpu != smp_processor_id())
1325 continue; 1289 continue;
1326 1290
1327 if (group_can_go_on(event, cpuctx, can_add_hw)) 1291 if (group_can_go_on(event, cpuctx, can_add_hw))
1328 if (group_sched_in(event, cpuctx, ctx, cpu)) 1292 if (group_sched_in(event, cpuctx, ctx))
1329 can_add_hw = 0; 1293 can_add_hw = 0;
1330 } 1294 }
1331} 1295}
@@ -1335,8 +1299,6 @@ ctx_sched_in(struct perf_event_context *ctx,
1335 struct perf_cpu_context *cpuctx, 1299 struct perf_cpu_context *cpuctx,
1336 enum event_type_t event_type) 1300 enum event_type_t event_type)
1337{ 1301{
1338 int cpu = smp_processor_id();
1339
1340 raw_spin_lock(&ctx->lock); 1302 raw_spin_lock(&ctx->lock);
1341 ctx->is_active = 1; 1303 ctx->is_active = 1;
1342 if (likely(!ctx->nr_events)) 1304 if (likely(!ctx->nr_events))
@@ -1351,11 +1313,11 @@ ctx_sched_in(struct perf_event_context *ctx,
1351 * in order to give them the best chance of going on. 1313 * in order to give them the best chance of going on.
1352 */ 1314 */
1353 if (event_type & EVENT_PINNED) 1315 if (event_type & EVENT_PINNED)
1354 ctx_pinned_sched_in(ctx, cpuctx, cpu); 1316 ctx_pinned_sched_in(ctx, cpuctx);
1355 1317
1356 /* Then walk through the lower prio flexible groups */ 1318 /* Then walk through the lower prio flexible groups */
1357 if (event_type & EVENT_FLEXIBLE) 1319 if (event_type & EVENT_FLEXIBLE)
1358 ctx_flexible_sched_in(ctx, cpuctx, cpu); 1320 ctx_flexible_sched_in(ctx, cpuctx);
1359 1321
1360 perf_enable(); 1322 perf_enable();
1361 out: 1323 out:
@@ -1493,6 +1455,22 @@ do { \
1493 return div64_u64(dividend, divisor); 1455 return div64_u64(dividend, divisor);
1494} 1456}
1495 1457
1458static void perf_event_stop(struct perf_event *event)
1459{
1460 if (!event->pmu->stop)
1461 return event->pmu->disable(event);
1462
1463 return event->pmu->stop(event);
1464}
1465
1466static int perf_event_start(struct perf_event *event)
1467{
1468 if (!event->pmu->start)
1469 return event->pmu->enable(event);
1470
1471 return event->pmu->start(event);
1472}
1473
1496static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 1474static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1497{ 1475{
1498 struct hw_perf_event *hwc = &event->hw; 1476 struct hw_perf_event *hwc = &event->hw;
@@ -1513,9 +1491,9 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1513 1491
1514 if (atomic64_read(&hwc->period_left) > 8*sample_period) { 1492 if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1515 perf_disable(); 1493 perf_disable();
1516 event->pmu->disable(event); 1494 perf_event_stop(event);
1517 atomic64_set(&hwc->period_left, 0); 1495 atomic64_set(&hwc->period_left, 0);
1518 event->pmu->enable(event); 1496 perf_event_start(event);
1519 perf_enable(); 1497 perf_enable();
1520 } 1498 }
1521} 1499}
@@ -1545,12 +1523,15 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1545 */ 1523 */
1546 if (interrupts == MAX_INTERRUPTS) { 1524 if (interrupts == MAX_INTERRUPTS) {
1547 perf_log_throttle(event, 1); 1525 perf_log_throttle(event, 1);
1526 perf_disable();
1548 event->pmu->unthrottle(event); 1527 event->pmu->unthrottle(event);
1528 perf_enable();
1549 } 1529 }
1550 1530
1551 if (!event->attr.freq || !event->attr.sample_freq) 1531 if (!event->attr.freq || !event->attr.sample_freq)
1552 continue; 1532 continue;
1553 1533
1534 perf_disable();
1554 event->pmu->read(event); 1535 event->pmu->read(event);
1555 now = atomic64_read(&event->count); 1536 now = atomic64_read(&event->count);
1556 delta = now - hwc->freq_count_stamp; 1537 delta = now - hwc->freq_count_stamp;
@@ -1558,6 +1539,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1558 1539
1559 if (delta > 0) 1540 if (delta > 0)
1560 perf_adjust_period(event, TICK_NSEC, delta); 1541 perf_adjust_period(event, TICK_NSEC, delta);
1542 perf_enable();
1561 } 1543 }
1562 raw_spin_unlock(&ctx->lock); 1544 raw_spin_unlock(&ctx->lock);
1563} 1545}
@@ -1567,9 +1549,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1567 */ 1549 */
1568static void rotate_ctx(struct perf_event_context *ctx) 1550static void rotate_ctx(struct perf_event_context *ctx)
1569{ 1551{
1570 if (!ctx->nr_events)
1571 return;
1572
1573 raw_spin_lock(&ctx->lock); 1552 raw_spin_lock(&ctx->lock);
1574 1553
1575 /* Rotate the first entry last of non-pinned groups */ 1554 /* Rotate the first entry last of non-pinned groups */
@@ -1582,19 +1561,28 @@ void perf_event_task_tick(struct task_struct *curr)
1582{ 1561{
1583 struct perf_cpu_context *cpuctx; 1562 struct perf_cpu_context *cpuctx;
1584 struct perf_event_context *ctx; 1563 struct perf_event_context *ctx;
1564 int rotate = 0;
1585 1565
1586 if (!atomic_read(&nr_events)) 1566 if (!atomic_read(&nr_events))
1587 return; 1567 return;
1588 1568
1589 cpuctx = &__get_cpu_var(perf_cpu_context); 1569 cpuctx = &__get_cpu_var(perf_cpu_context);
1590 ctx = curr->perf_event_ctxp; 1570 if (cpuctx->ctx.nr_events &&
1571 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1572 rotate = 1;
1591 1573
1592 perf_disable(); 1574 ctx = curr->perf_event_ctxp;
1575 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
1576 rotate = 1;
1593 1577
1594 perf_ctx_adjust_freq(&cpuctx->ctx); 1578 perf_ctx_adjust_freq(&cpuctx->ctx);
1595 if (ctx) 1579 if (ctx)
1596 perf_ctx_adjust_freq(ctx); 1580 perf_ctx_adjust_freq(ctx);
1597 1581
1582 if (!rotate)
1583 return;
1584
1585 perf_disable();
1598 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 1586 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1599 if (ctx) 1587 if (ctx)
1600 task_ctx_sched_out(ctx, EVENT_FLEXIBLE); 1588 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1606,7 +1594,6 @@ void perf_event_task_tick(struct task_struct *curr)
1606 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 1594 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1607 if (ctx) 1595 if (ctx)
1608 task_ctx_sched_in(curr, EVENT_FLEXIBLE); 1596 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1609
1610 perf_enable(); 1597 perf_enable();
1611} 1598}
1612 1599
@@ -2602,7 +2589,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2602 if (user_locked > user_lock_limit) 2589 if (user_locked > user_lock_limit)
2603 extra = user_locked - user_lock_limit; 2590 extra = user_locked - user_lock_limit;
2604 2591
2605 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 2592 lock_limit = rlimit(RLIMIT_MEMLOCK);
2606 lock_limit >>= PAGE_SHIFT; 2593 lock_limit >>= PAGE_SHIFT;
2607 locked = vma->vm_mm->locked_vm + extra; 2594 locked = vma->vm_mm->locked_vm + extra;
2608 2595
@@ -2798,6 +2785,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2798 return NULL; 2785 return NULL;
2799} 2786}
2800 2787
2788__weak
2789void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2790{
2791}
2792
2793
2801/* 2794/*
2802 * Output 2795 * Output
2803 */ 2796 */
@@ -3383,15 +3376,23 @@ static void perf_event_task_output(struct perf_event *event,
3383 struct perf_task_event *task_event) 3376 struct perf_task_event *task_event)
3384{ 3377{
3385 struct perf_output_handle handle; 3378 struct perf_output_handle handle;
3386 int size;
3387 struct task_struct *task = task_event->task; 3379 struct task_struct *task = task_event->task;
3388 int ret; 3380 unsigned long flags;
3381 int size, ret;
3382
3383 /*
3384 * If this CPU attempts to acquire an rq lock held by a CPU spinning
3385 * in perf_output_lock() from interrupt context, it's game over.
3386 */
3387 local_irq_save(flags);
3389 3388
3390 size = task_event->event_id.header.size; 3389 size = task_event->event_id.header.size;
3391 ret = perf_output_begin(&handle, event, size, 0, 0); 3390 ret = perf_output_begin(&handle, event, size, 0, 0);
3392 3391
3393 if (ret) 3392 if (ret) {
3393 local_irq_restore(flags);
3394 return; 3394 return;
3395 }
3395 3396
3396 task_event->event_id.pid = perf_event_pid(event, task); 3397 task_event->event_id.pid = perf_event_pid(event, task);
3397 task_event->event_id.ppid = perf_event_pid(event, current); 3398 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3399,16 +3400,15 @@ static void perf_event_task_output(struct perf_event *event,
3399 task_event->event_id.tid = perf_event_tid(event, task); 3400 task_event->event_id.tid = perf_event_tid(event, task);
3400 task_event->event_id.ptid = perf_event_tid(event, current); 3401 task_event->event_id.ptid = perf_event_tid(event, current);
3401 3402
3402 task_event->event_id.time = perf_clock();
3403
3404 perf_output_put(&handle, task_event->event_id); 3403 perf_output_put(&handle, task_event->event_id);
3405 3404
3406 perf_output_end(&handle); 3405 perf_output_end(&handle);
3406 local_irq_restore(flags);
3407} 3407}
3408 3408
3409static int perf_event_task_match(struct perf_event *event) 3409static int perf_event_task_match(struct perf_event *event)
3410{ 3410{
3411 if (event->state != PERF_EVENT_STATE_ACTIVE) 3411 if (event->state < PERF_EVENT_STATE_INACTIVE)
3412 return 0; 3412 return 0;
3413 3413
3414 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3414 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -3440,7 +3440,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3440 cpuctx = &get_cpu_var(perf_cpu_context); 3440 cpuctx = &get_cpu_var(perf_cpu_context);
3441 perf_event_task_ctx(&cpuctx->ctx, task_event); 3441 perf_event_task_ctx(&cpuctx->ctx, task_event);
3442 if (!ctx) 3442 if (!ctx)
3443 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3443 ctx = rcu_dereference(current->perf_event_ctxp);
3444 if (ctx) 3444 if (ctx)
3445 perf_event_task_ctx(ctx, task_event); 3445 perf_event_task_ctx(ctx, task_event);
3446 put_cpu_var(perf_cpu_context); 3446 put_cpu_var(perf_cpu_context);
@@ -3471,6 +3471,7 @@ static void perf_event_task(struct task_struct *task,
3471 /* .ppid */ 3471 /* .ppid */
3472 /* .tid */ 3472 /* .tid */
3473 /* .ptid */ 3473 /* .ptid */
3474 .time = perf_clock(),
3474 }, 3475 },
3475 }; 3476 };
3476 3477
@@ -3520,7 +3521,7 @@ static void perf_event_comm_output(struct perf_event *event,
3520 3521
3521static int perf_event_comm_match(struct perf_event *event) 3522static int perf_event_comm_match(struct perf_event *event)
3522{ 3523{
3523 if (event->state != PERF_EVENT_STATE_ACTIVE) 3524 if (event->state < PERF_EVENT_STATE_INACTIVE)
3524 return 0; 3525 return 0;
3525 3526
3526 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3527 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -3640,7 +3641,7 @@ static void perf_event_mmap_output(struct perf_event *event,
3640static int perf_event_mmap_match(struct perf_event *event, 3641static int perf_event_mmap_match(struct perf_event *event,
3641 struct perf_mmap_event *mmap_event) 3642 struct perf_mmap_event *mmap_event)
3642{ 3643{
3643 if (event->state != PERF_EVENT_STATE_ACTIVE) 3644 if (event->state < PERF_EVENT_STATE_INACTIVE)
3644 return 0; 3645 return 0;
3645 3646
3646 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3647 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -3749,7 +3750,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3749 /* .tid */ 3750 /* .tid */
3750 .start = vma->vm_start, 3751 .start = vma->vm_start,
3751 .len = vma->vm_end - vma->vm_start, 3752 .len = vma->vm_end - vma->vm_start,
3752 .pgoff = vma->vm_pgoff, 3753 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
3753 }, 3754 },
3754 }; 3755 };
3755 3756
@@ -4116,8 +4117,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4116 if (rctx < 0) 4117 if (rctx < 0)
4117 return; 4118 return;
4118 4119
4119 data.addr = addr; 4120 perf_sample_data_init(&data, addr);
4120 data.raw = NULL;
4121 4121
4122 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4122 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4123 4123
@@ -4162,11 +4162,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4162 struct perf_event *event; 4162 struct perf_event *event;
4163 u64 period; 4163 u64 period;
4164 4164
4165 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 4165 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4166 event->pmu->read(event); 4166 event->pmu->read(event);
4167 4167
4168 data.addr = 0; 4168 perf_sample_data_init(&data, 0);
4169 data.raw = NULL;
4170 data.period = event->hw.last_period; 4169 data.period = event->hw.last_period;
4171 regs = get_irq_regs(); 4170 regs = get_irq_regs();
4172 /* 4171 /*
@@ -4328,26 +4327,20 @@ static const struct pmu perf_ops_task_clock = {
4328#ifdef CONFIG_EVENT_TRACING 4327#ifdef CONFIG_EVENT_TRACING
4329 4328
4330void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4329void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4331 int entry_size) 4330 int entry_size, struct pt_regs *regs)
4332{ 4331{
4332 struct perf_sample_data data;
4333 struct perf_raw_record raw = { 4333 struct perf_raw_record raw = {
4334 .size = entry_size, 4334 .size = entry_size,
4335 .data = record, 4335 .data = record,
4336 }; 4336 };
4337 4337
4338 struct perf_sample_data data = { 4338 perf_sample_data_init(&data, addr);
4339 .addr = addr, 4339 data.raw = &raw;
4340 .raw = &raw,
4341 };
4342
4343 struct pt_regs *regs = get_irq_regs();
4344
4345 if (!regs)
4346 regs = task_pt_regs(current);
4347 4340
4348 /* Trace events already protected against recursion */ 4341 /* Trace events already protected against recursion */
4349 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4342 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4350 &data, regs); 4343 &data, regs);
4351} 4344}
4352EXPORT_SYMBOL_GPL(perf_tp_event); 4345EXPORT_SYMBOL_GPL(perf_tp_event);
4353 4346
@@ -4363,7 +4356,7 @@ static int perf_tp_event_match(struct perf_event *event,
4363 4356
4364static void tp_perf_event_destroy(struct perf_event *event) 4357static void tp_perf_event_destroy(struct perf_event *event)
4365{ 4358{
4366 ftrace_profile_disable(event->attr.config); 4359 perf_trace_disable(event->attr.config);
4367} 4360}
4368 4361
4369static const struct pmu *tp_perf_event_init(struct perf_event *event) 4362static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4377,7 +4370,7 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4377 !capable(CAP_SYS_ADMIN)) 4370 !capable(CAP_SYS_ADMIN))
4378 return ERR_PTR(-EPERM); 4371 return ERR_PTR(-EPERM);
4379 4372
4380 if (ftrace_profile_enable(event->attr.config)) 4373 if (perf_trace_enable(event->attr.config))
4381 return NULL; 4374 return NULL;
4382 4375
4383 event->destroy = tp_perf_event_destroy; 4376 event->destroy = tp_perf_event_destroy;
@@ -4456,8 +4449,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
4456 struct perf_sample_data sample; 4449 struct perf_sample_data sample;
4457 struct pt_regs *regs = data; 4450 struct pt_regs *regs = data;
4458 4451
4459 sample.raw = NULL; 4452 perf_sample_data_init(&sample, bp->attr.bp_addr);
4460 sample.addr = bp->attr.bp_addr;
4461 4453
4462 if (!perf_exclude_event(bp, regs)) 4454 if (!perf_exclude_event(bp, regs))
4463 perf_swevent_add(bp, 1, 1, &sample, regs); 4455 perf_swevent_add(bp, 1, 1, &sample, regs);
@@ -4720,7 +4712,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
4720 if (attr->type >= PERF_TYPE_MAX) 4712 if (attr->type >= PERF_TYPE_MAX)
4721 return -EINVAL; 4713 return -EINVAL;
4722 4714
4723 if (attr->__reserved_1 || attr->__reserved_2) 4715 if (attr->__reserved_1)
4724 return -EINVAL; 4716 return -EINVAL;
4725 4717
4726 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 4718 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -4905,7 +4897,7 @@ err_fput_free_put_context:
4905 4897
4906err_free_put_context: 4898err_free_put_context:
4907 if (err < 0) 4899 if (err < 0)
4908 kfree(event); 4900 free_event(event);
4909 4901
4910err_put_context: 4902err_put_context:
4911 if (err < 0) 4903 if (err < 0)
@@ -5385,18 +5377,26 @@ int perf_event_init_task(struct task_struct *child)
5385 return ret; 5377 return ret;
5386} 5378}
5387 5379
5380static void __init perf_event_init_all_cpus(void)
5381{
5382 int cpu;
5383 struct perf_cpu_context *cpuctx;
5384
5385 for_each_possible_cpu(cpu) {
5386 cpuctx = &per_cpu(perf_cpu_context, cpu);
5387 __perf_event_init_context(&cpuctx->ctx, NULL);
5388 }
5389}
5390
5388static void __cpuinit perf_event_init_cpu(int cpu) 5391static void __cpuinit perf_event_init_cpu(int cpu)
5389{ 5392{
5390 struct perf_cpu_context *cpuctx; 5393 struct perf_cpu_context *cpuctx;
5391 5394
5392 cpuctx = &per_cpu(perf_cpu_context, cpu); 5395 cpuctx = &per_cpu(perf_cpu_context, cpu);
5393 __perf_event_init_context(&cpuctx->ctx, NULL);
5394 5396
5395 spin_lock(&perf_resource_lock); 5397 spin_lock(&perf_resource_lock);
5396 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 5398 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5397 spin_unlock(&perf_resource_lock); 5399 spin_unlock(&perf_resource_lock);
5398
5399 hw_perf_event_setup(cpu);
5400} 5400}
5401 5401
5402#ifdef CONFIG_HOTPLUG_CPU 5402#ifdef CONFIG_HOTPLUG_CPU
@@ -5436,11 +5436,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5436 perf_event_init_cpu(cpu); 5436 perf_event_init_cpu(cpu);
5437 break; 5437 break;
5438 5438
5439 case CPU_ONLINE:
5440 case CPU_ONLINE_FROZEN:
5441 hw_perf_event_setup_online(cpu);
5442 break;
5443
5444 case CPU_DOWN_PREPARE: 5439 case CPU_DOWN_PREPARE:
5445 case CPU_DOWN_PREPARE_FROZEN: 5440 case CPU_DOWN_PREPARE_FROZEN:
5446 perf_event_exit_cpu(cpu); 5441 perf_event_exit_cpu(cpu);
@@ -5463,6 +5458,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = {
5463 5458
5464void __init perf_event_init(void) 5459void __init perf_event_init(void)
5465{ 5460{
5461 perf_event_init_all_cpus();
5466 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 5462 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5467 (void *)(long)smp_processor_id()); 5463 (void *)(long)smp_processor_id());
5468 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 5464 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
@@ -5470,13 +5466,16 @@ void __init perf_event_init(void)
5470 register_cpu_notifier(&perf_cpu_nb); 5466 register_cpu_notifier(&perf_cpu_nb);
5471} 5467}
5472 5468
5473static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) 5469static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5470 struct sysdev_class_attribute *attr,
5471 char *buf)
5474{ 5472{
5475 return sprintf(buf, "%d\n", perf_reserved_percpu); 5473 return sprintf(buf, "%d\n", perf_reserved_percpu);
5476} 5474}
5477 5475
5478static ssize_t 5476static ssize_t
5479perf_set_reserve_percpu(struct sysdev_class *class, 5477perf_set_reserve_percpu(struct sysdev_class *class,
5478 struct sysdev_class_attribute *attr,
5480 const char *buf, 5479 const char *buf,
5481 size_t count) 5480 size_t count)
5482{ 5481{
@@ -5505,13 +5504,17 @@ perf_set_reserve_percpu(struct sysdev_class *class,
5505 return count; 5504 return count;
5506} 5505}
5507 5506
5508static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) 5507static ssize_t perf_show_overcommit(struct sysdev_class *class,
5508 struct sysdev_class_attribute *attr,
5509 char *buf)
5509{ 5510{
5510 return sprintf(buf, "%d\n", perf_overcommit); 5511 return sprintf(buf, "%d\n", perf_overcommit);
5511} 5512}
5512 5513
5513static ssize_t 5514static ssize_t
5514perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) 5515perf_set_overcommit(struct sysdev_class *class,
5516 struct sysdev_class_attribute *attr,
5517 const char *buf, size_t count)
5515{ 5518{
5516 unsigned long val; 5519 unsigned long val;
5517 int err; 5520 int err;
diff --git a/kernel/pid.c b/kernel/pid.c
index 2e17c9c92cbe..aebb30d9c233 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
367 struct task_struct *result = NULL; 367 struct task_struct *result = NULL;
368 if (pid) { 368 if (pid) {
369 struct hlist_node *first; 369 struct hlist_node *first;
370 first = rcu_dereference(pid->tasks[type].first); 370 first = rcu_dereference_check(pid->tasks[type].first,
371 rcu_read_lock_held() ||
372 lockdep_tasklist_lock_is_held());
371 if (first) 373 if (first)
372 result = hlist_entry(first, struct task_struct, pids[(type)].node); 374 result = hlist_entry(first, struct task_struct, pids[(type)].node);
373 } 375 }
@@ -376,7 +378,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
376EXPORT_SYMBOL(pid_task); 378EXPORT_SYMBOL(pid_task);
377 379
378/* 380/*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 381 * Must be called under rcu_read_lock().
380 */ 382 */
381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 383struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382{ 384{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 86b3796b0436..a5aff94e1f0b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -13,6 +13,7 @@
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h>
16 17
17#define BITS_PER_PAGE (PAGE_SIZE*8) 18#define BITS_PER_PAGE (PAGE_SIZE*8)
18 19
@@ -161,13 +162,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
161 rcu_read_lock(); 162 rcu_read_lock();
162 163
163 /* 164 /*
164 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring 165 * Any nested-container's init processes won't ignore the
165 * any nested-container's init processes don't ignore the 166 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
166 * signal
167 */ 167 */
168 task = pid_task(find_vpid(nr), PIDTYPE_PID); 168 task = pid_task(find_vpid(nr), PIDTYPE_PID);
169 if (task) 169 if (task)
170 force_sig(SIGKILL, task); 170 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
171 171
172 rcu_read_unlock(); 172 rcu_read_unlock();
173 173
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 438ff4523513..bc7704b3a443 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -982,6 +982,7 @@ static void check_thread_timers(struct task_struct *tsk,
982 int maxfire; 982 int maxfire;
983 struct list_head *timers = tsk->cpu_timers; 983 struct list_head *timers = tsk->cpu_timers;
984 struct signal_struct *const sig = tsk->signal; 984 struct signal_struct *const sig = tsk->signal;
985 unsigned long soft;
985 986
986 maxfire = 20; 987 maxfire = 20;
987 tsk->cputime_expires.prof_exp = cputime_zero; 988 tsk->cputime_expires.prof_exp = cputime_zero;
@@ -1030,9 +1031,10 @@ static void check_thread_timers(struct task_struct *tsk,
1030 /* 1031 /*
1031 * Check for the special case thread timers. 1032 * Check for the special case thread timers.
1032 */ 1033 */
1033 if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { 1034 soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
1034 unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; 1035 if (soft != RLIM_INFINITY) {
1035 unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; 1036 unsigned long hard =
1037 ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
1036 1038
1037 if (hard != RLIM_INFINITY && 1039 if (hard != RLIM_INFINITY &&
1038 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { 1040 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -1043,14 +1045,13 @@ static void check_thread_timers(struct task_struct *tsk,
1043 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1045 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1044 return; 1046 return;
1045 } 1047 }
1046 if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { 1048 if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
1047 /* 1049 /*
1048 * At the soft limit, send a SIGXCPU every second. 1050 * At the soft limit, send a SIGXCPU every second.
1049 */ 1051 */
1050 if (sig->rlim[RLIMIT_RTTIME].rlim_cur 1052 if (soft < hard) {
1051 < sig->rlim[RLIMIT_RTTIME].rlim_max) { 1053 soft += USEC_PER_SEC;
1052 sig->rlim[RLIMIT_RTTIME].rlim_cur += 1054 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
1053 USEC_PER_SEC;
1054 } 1055 }
1055 printk(KERN_INFO 1056 printk(KERN_INFO
1056 "RT Watchdog Timeout: %s[%d]\n", 1057 "RT Watchdog Timeout: %s[%d]\n",
@@ -1060,9 +1061,9 @@ static void check_thread_timers(struct task_struct *tsk,
1060 } 1061 }
1061} 1062}
1062 1063
1063static void stop_process_timers(struct task_struct *tsk) 1064static void stop_process_timers(struct signal_struct *sig)
1064{ 1065{
1065 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 1066 struct thread_group_cputimer *cputimer = &sig->cputimer;
1066 unsigned long flags; 1067 unsigned long flags;
1067 1068
1068 if (!cputimer->running) 1069 if (!cputimer->running)
@@ -1071,6 +1072,10 @@ static void stop_process_timers(struct task_struct *tsk)
1071 spin_lock_irqsave(&cputimer->lock, flags); 1072 spin_lock_irqsave(&cputimer->lock, flags);
1072 cputimer->running = 0; 1073 cputimer->running = 0;
1073 spin_unlock_irqrestore(&cputimer->lock, flags); 1074 spin_unlock_irqrestore(&cputimer->lock, flags);
1075
1076 sig->cputime_expires.prof_exp = cputime_zero;
1077 sig->cputime_expires.virt_exp = cputime_zero;
1078 sig->cputime_expires.sched_exp = 0;
1074} 1079}
1075 1080
1076static u32 onecputick; 1081static u32 onecputick;
@@ -1121,6 +1126,7 @@ static void check_process_timers(struct task_struct *tsk,
1121 unsigned long long sum_sched_runtime, sched_expires; 1126 unsigned long long sum_sched_runtime, sched_expires;
1122 struct list_head *timers = sig->cpu_timers; 1127 struct list_head *timers = sig->cpu_timers;
1123 struct task_cputime cputime; 1128 struct task_cputime cputime;
1129 unsigned long soft;
1124 1130
1125 /* 1131 /*
1126 * Don't sample the current process CPU clocks if there are no timers. 1132 * Don't sample the current process CPU clocks if there are no timers.
@@ -1131,7 +1137,7 @@ static void check_process_timers(struct task_struct *tsk,
1131 list_empty(&timers[CPUCLOCK_VIRT]) && 1137 list_empty(&timers[CPUCLOCK_VIRT]) &&
1132 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && 1138 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1133 list_empty(&timers[CPUCLOCK_SCHED])) { 1139 list_empty(&timers[CPUCLOCK_SCHED])) {
1134 stop_process_timers(tsk); 1140 stop_process_timers(sig);
1135 return; 1141 return;
1136 } 1142 }
1137 1143
@@ -1193,11 +1199,13 @@ static void check_process_timers(struct task_struct *tsk,
1193 SIGPROF); 1199 SIGPROF);
1194 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, 1200 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1195 SIGVTALRM); 1201 SIGVTALRM);
1196 1202 soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1197 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 1203 if (soft != RLIM_INFINITY) {
1198 unsigned long psecs = cputime_to_secs(ptime); 1204 unsigned long psecs = cputime_to_secs(ptime);
1205 unsigned long hard =
1206 ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
1199 cputime_t x; 1207 cputime_t x;
1200 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { 1208 if (psecs >= hard) {
1201 /* 1209 /*
1202 * At the hard limit, we just die. 1210 * At the hard limit, we just die.
1203 * No need to calculate anything else now. 1211 * No need to calculate anything else now.
@@ -1205,17 +1213,17 @@ static void check_process_timers(struct task_struct *tsk,
1205 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1213 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1206 return; 1214 return;
1207 } 1215 }
1208 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { 1216 if (psecs >= soft) {
1209 /* 1217 /*
1210 * At the soft limit, send a SIGXCPU every second. 1218 * At the soft limit, send a SIGXCPU every second.
1211 */ 1219 */
1212 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 1220 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1213 if (sig->rlim[RLIMIT_CPU].rlim_cur 1221 if (soft < hard) {
1214 < sig->rlim[RLIMIT_CPU].rlim_max) { 1222 soft++;
1215 sig->rlim[RLIMIT_CPU].rlim_cur++; 1223 sig->rlim[RLIMIT_CPU].rlim_cur = soft;
1216 } 1224 }
1217 } 1225 }
1218 x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 1226 x = secs_to_cputime(soft);
1219 if (cputime_eq(prof_expires, cputime_zero) || 1227 if (cputime_eq(prof_expires, cputime_zero) ||
1220 cputime_lt(x, prof_expires)) { 1228 cputime_lt(x, prof_expires)) {
1221 prof_expires = x; 1229 prof_expires = x;
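
The posix-cpu-timers hunks above replace repeated reads of sig->rlim[...].rlim_cur and .rlim_max with a single ACCESS_ONCE() snapshot into the new locals soft and hard, so a concurrent setrlimit() cannot change the limits between the hard-limit check and the soft-limit check. A minimal userspace sketch of the same read-once idea, built on getrlimit() rather than the kernel internals (illustrative only, not the patched code):

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;
	rlim_t soft, hard;

	if (getrlimit(RLIMIT_CPU, &rl)) {
		perror("getrlimit");
		return 1;
	}

	/* Snapshot both limits once; every later check uses the locals. */
	soft = rl.rlim_cur;
	hard = rl.rlim_max;

	if (soft == RLIM_INFINITY)
		printf("no CPU-time soft limit\n");
	else if (hard == RLIM_INFINITY)
		printf("soft limit %llu s, no hard limit\n",
		       (unsigned long long)soft);
	else
		printf("soft limit %llu s, hard limit %llu s\n",
		       (unsigned long long)soft, (unsigned long long)hard);
	return 0;
}
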
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 495440779ce3..00d1fda58ab6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -256,7 +256,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
256 return 0; 256 return 0;
257} 257}
258 258
259int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) 259static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
260{ 260{
261 *tp = ktime_to_timespec(KTIME_LOW_RES); 261 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0; 262 return 0;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 91e09d3b2eb2..5c36ea9d55d2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,15 @@ config PM_DEBUG
27 code. This is helpful when debugging and reporting PM bugs, like 27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support. 28 suspend support.
29 29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
30config PM_VERBOSE 39config PM_VERBOSE
31 bool "Verbose Power Management debugging" 40 bool "Verbose Power Management debugging"
32 depends on PM_DEBUG 41 depends on PM_DEBUG
@@ -85,6 +94,11 @@ config PM_SLEEP
85 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE 94 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
86 default y 95 default y
87 96
97config PM_SLEEP_ADVANCED_DEBUG
98 bool
99 depends on PM_ADVANCED_DEBUG
100 default n
101
88config SUSPEND 102config SUSPEND
89 bool "Suspend to RAM and standby" 103 bool "Suspend to RAM and standby"
90 depends on PM && ARCH_SUSPEND_POSSIBLE 104 depends on PM && ARCH_SUSPEND_POSSIBLE
@@ -222,3 +236,8 @@ config PM_RUNTIME
222 and the bus type drivers of the buses the devices are on are 236 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and 237 responsible for the actual handling of the autosuspend requests and
224 wake-up events. 238 wake-up events.
239
240config PM_OPS
241 bool
242 depends on PM_SLEEP || PM_RUNTIME
243 default y
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index bbfe472d7524..aa9e916da4d5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -22,6 +22,7 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/gfp.h>
25#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
26#include <asm/suspend.h> 27#include <asm/suspend.h>
27 28
@@ -323,6 +324,7 @@ static int create_image(int platform_mode)
323int hibernation_snapshot(int platform_mode) 324int hibernation_snapshot(int platform_mode)
324{ 325{
325 int error; 326 int error;
327 gfp_t saved_mask;
326 328
327 error = platform_begin(platform_mode); 329 error = platform_begin(platform_mode);
328 if (error) 330 if (error)
@@ -334,6 +336,7 @@ int hibernation_snapshot(int platform_mode)
334 goto Close; 336 goto Close;
335 337
336 suspend_console(); 338 suspend_console();
339 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
337 error = dpm_suspend_start(PMSG_FREEZE); 340 error = dpm_suspend_start(PMSG_FREEZE);
338 if (error) 341 if (error)
339 goto Recover_platform; 342 goto Recover_platform;
@@ -351,6 +354,7 @@ int hibernation_snapshot(int platform_mode)
351 354
352 dpm_resume_end(in_suspend ? 355 dpm_resume_end(in_suspend ?
353 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 356 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
357 set_gfp_allowed_mask(saved_mask);
354 resume_console(); 358 resume_console();
355 Close: 359 Close:
356 platform_end(platform_mode); 360 platform_end(platform_mode);
@@ -445,14 +449,17 @@ static int resume_target_kernel(bool platform_mode)
445int hibernation_restore(int platform_mode) 449int hibernation_restore(int platform_mode)
446{ 450{
447 int error; 451 int error;
452 gfp_t saved_mask;
448 453
449 pm_prepare_console(); 454 pm_prepare_console();
450 suspend_console(); 455 suspend_console();
456 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
451 error = dpm_suspend_start(PMSG_QUIESCE); 457 error = dpm_suspend_start(PMSG_QUIESCE);
452 if (!error) { 458 if (!error) {
453 error = resume_target_kernel(platform_mode); 459 error = resume_target_kernel(platform_mode);
454 dpm_resume_end(PMSG_RECOVER); 460 dpm_resume_end(PMSG_RECOVER);
455 } 461 }
462 set_gfp_allowed_mask(saved_mask);
456 resume_console(); 463 resume_console();
457 pm_restore_console(); 464 pm_restore_console();
458 return error; 465 return error;
@@ -466,6 +473,7 @@ int hibernation_restore(int platform_mode)
466int hibernation_platform_enter(void) 473int hibernation_platform_enter(void)
467{ 474{
468 int error; 475 int error;
476 gfp_t saved_mask;
469 477
470 if (!hibernation_ops) 478 if (!hibernation_ops)
471 return -ENOSYS; 479 return -ENOSYS;
@@ -481,6 +489,7 @@ int hibernation_platform_enter(void)
481 489
482 entering_platform_hibernation = true; 490 entering_platform_hibernation = true;
483 suspend_console(); 491 suspend_console();
492 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
484 error = dpm_suspend_start(PMSG_HIBERNATE); 493 error = dpm_suspend_start(PMSG_HIBERNATE);
485 if (error) { 494 if (error) {
486 if (hibernation_ops->recover) 495 if (hibernation_ops->recover)
@@ -518,6 +527,7 @@ int hibernation_platform_enter(void)
518 Resume_devices: 527 Resume_devices:
519 entering_platform_hibernation = false; 528 entering_platform_hibernation = false;
520 dpm_resume_end(PMSG_RESTORE); 529 dpm_resume_end(PMSG_RESTORE);
530 set_gfp_allowed_mask(saved_mask);
521 resume_console(); 531 resume_console();
522 532
523 Close: 533 Close:
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c
index 39ac698ef836..fdcad9ed5a7b 100644
--- a/kernel/power/hibernate_nvs.c
+++ b/kernel/power/hibernate_nvs.c
@@ -10,6 +10,7 @@
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/list.h> 11#include <linux/list.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/slab.h>
13#include <linux/suspend.h> 14#include <linux/suspend.h>
14 15
15/* 16/*
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0998c7139053..b58800b21fc0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val)
44 == NOTIFY_BAD) ? -EINVAL : 0; 44 == NOTIFY_BAD) ? -EINVAL : 0;
45} 45}
46 46
47/* If set, devices may be suspended and resumed asynchronously. */
48int pm_async_enabled = 1;
49
50static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
51 char *buf)
52{
53 return sprintf(buf, "%d\n", pm_async_enabled);
54}
55
56static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
57 const char *buf, size_t n)
58{
59 unsigned long val;
60
61 if (strict_strtoul(buf, 10, &val))
62 return -EINVAL;
63
64 if (val > 1)
65 return -EINVAL;
66
67 pm_async_enabled = val;
68 return n;
69}
70
71power_attr(pm_async);
72
47#ifdef CONFIG_PM_DEBUG 73#ifdef CONFIG_PM_DEBUG
48int pm_test_level = TEST_NONE; 74int pm_test_level = TEST_NONE;
49 75
@@ -208,9 +234,12 @@ static struct attribute * g[] = {
208#ifdef CONFIG_PM_TRACE 234#ifdef CONFIG_PM_TRACE
209 &pm_trace_attr.attr, 235 &pm_trace_attr.attr,
210#endif 236#endif
211#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) 237#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr,
239#ifdef CONFIG_PM_DEBUG
212 &pm_test_attr.attr, 240 &pm_test_attr.attr,
213#endif 241#endif
242#endif
214 NULL, 243 NULL,
215}; 244};
216 245
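
power_attr(pm_async) above exposes the new flag as /sys/power/pm_async, readable and writable as 0 or 1 whenever CONFIG_PM_SLEEP is set. A small userspace sketch for inspecting or toggling it (a hypothetical helper, not part of the patch):

#include <stdio.h>

int main(int argc, char **argv)
{
	const char *path = "/sys/power/pm_async";
	char buf[4] = "";
	FILE *f = fopen(path, argc > 1 ? "w" : "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (argc > 1)				/* e.g. "./pm_async 0" */
		fprintf(f, "%s\n", argv[1]);
	else if (fgets(buf, sizeof(buf), f))
		printf("pm_async = %s", buf);
	fclose(f);
	return 0;
}
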
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5ade1bdcf366..71ae29052ab6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -88,12 +88,11 @@ static int try_to_freeze_tasks(bool sig_only)
88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
89 "(%d tasks refusing to freeze):\n", 89 "(%d tasks refusing to freeze):\n",
90 elapsed_csecs / 100, elapsed_csecs % 100, todo); 90 elapsed_csecs / 100, elapsed_csecs % 100, todo);
91 show_state();
92 read_lock(&tasklist_lock); 91 read_lock(&tasklist_lock);
93 do_each_thread(g, p) { 92 do_each_thread(g, p) {
94 task_lock(p); 93 task_lock(p);
95 if (freezing(p) && !freezer_should_skip(p)) 94 if (freezing(p) && !freezer_should_skip(p))
96 printk(KERN_ERR " %s\n", p->comm); 95 sched_show_task(p);
97 cancel_freezing(p); 96 cancel_freezing(p);
98 task_unlock(p); 97 task_unlock(p);
99 } while_each_thread(g, p); 98 } while_each_thread(g, p);
@@ -145,7 +144,7 @@ static void thaw_tasks(bool nosig_only)
145 if (nosig_only && should_send_signal(p)) 144 if (nosig_only && should_send_signal(p))
146 continue; 145 continue;
147 146
148 if (cgroup_frozen(p)) 147 if (cgroup_freezing_or_frozen(p))
149 continue; 148 continue;
150 149
151 thaw_process(p); 150 thaw_process(p);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 36cb168e4330..be861c26dda7 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -26,6 +26,7 @@
26#include <linux/console.h> 26#include <linux/console.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/slab.h>
29 30
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/mmu_context.h> 32#include <asm/mmu_context.h>
@@ -1181,7 +1182,7 @@ static void free_unnecessary_pages(void)
1181 1182
1182 memory_bm_position_reset(&copy_bm); 1183 memory_bm_position_reset(&copy_bm);
1183 1184
1184 while (to_free_normal > 0 && to_free_highmem > 0) { 1185 while (to_free_normal > 0 || to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1186 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn); 1187 struct page *page = pfn_to_page(pfn);
1187 1188
@@ -1500,7 +1501,7 @@ asmlinkage int swsusp_save(void)
1500{ 1501{
1501 unsigned int nr_pages, nr_highmem; 1502 unsigned int nr_pages, nr_highmem;
1502 1503
1503 printk(KERN_INFO "PM: Creating hibernation image: \n"); 1504 printk(KERN_INFO "PM: Creating hibernation image:\n");
1504 1505
1505 drain_local_pages(NULL); 1506 drain_local_pages(NULL);
1506 nr_pages = count_data_pages(); 1507 nr_pages = count_data_pages();
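
The one-character fix in free_unnecessary_pages() above matters because the walk frees both highmem and normal pages from the same bitmap: with &&, the loop stopped as soon as either to_free counter hit zero, leaving pages of the other kind allocated. A toy standalone illustration of the loop-bound difference (counters only, no kernel code):

#include <stdio.h>

static unsigned long walk(unsigned long normal, unsigned long high, int use_or)
{
	unsigned long freed = 0;

	/* Mimics only the loop bound: keep walking while work remains. */
	while (use_or ? (normal > 0 || high > 0)
		      : (normal > 0 && high > 0)) {
		if (high > 0)
			high--;		/* pretend this page was highmem */
		else
			normal--;	/* otherwise a normal page */
		freed++;
	}
	return freed;
}

int main(void)
{
	/* 3 normal pages and 5 highmem pages still need to be freed. */
	printf("with &&: %lu pages freed\n", walk(3, 5, 0));	/* 5: stops early */
	printf("with ||: %lu pages freed\n", walk(3, 5, 1));	/* 8: frees all  */
	return 0;
}
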
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6f10dfc2d3e9..56e7dbb8b996 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -15,6 +15,7 @@
15#include <linux/console.h> 15#include <linux/console.h>
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/gfp.h>
18 19
19#include "power.h" 20#include "power.h"
20 21
@@ -189,6 +190,7 @@ static int suspend_enter(suspend_state_t state)
189int suspend_devices_and_enter(suspend_state_t state) 190int suspend_devices_and_enter(suspend_state_t state)
190{ 191{
191 int error; 192 int error;
193 gfp_t saved_mask;
192 194
193 if (!suspend_ops) 195 if (!suspend_ops)
194 return -ENOSYS; 196 return -ENOSYS;
@@ -199,6 +201,7 @@ int suspend_devices_and_enter(suspend_state_t state)
199 goto Close; 201 goto Close;
200 } 202 }
201 suspend_console(); 203 suspend_console();
204 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
202 suspend_test_start(); 205 suspend_test_start();
203 error = dpm_suspend_start(PMSG_SUSPEND); 206 error = dpm_suspend_start(PMSG_SUSPEND);
204 if (error) { 207 if (error) {
@@ -215,6 +218,7 @@ int suspend_devices_and_enter(suspend_state_t state)
215 suspend_test_start(); 218 suspend_test_start();
216 dpm_resume_end(PMSG_RESUME); 219 dpm_resume_end(PMSG_RESUME);
217 suspend_test_finish("resume devices"); 220 suspend_test_finish("resume devices");
221 set_gfp_allowed_mask(saved_mask);
218 resume_console(); 222 resume_console();
219 Close: 223 Close:
220 if (suspend_ops->end) 224 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 09b2b0ae9e9d..66824d71983a 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/swapops.h> 24#include <linux/swapops.h>
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/slab.h>
26 27
27#include "power.h" 28#include "power.h"
28 29
@@ -657,10 +658,6 @@ int swsusp_read(unsigned int *flags_p)
657 struct swsusp_info *header; 658 struct swsusp_info *header;
658 659
659 *flags_p = swsusp_header->flags; 660 *flags_p = swsusp_header->flags;
660 if (IS_ERR(resume_bdev)) {
661 pr_debug("PM: Image device not initialised\n");
662 return PTR_ERR(resume_bdev);
663 }
664 661
665 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 662 memset(&snapshot, 0, sizeof(struct snapshot_handle));
666 error = snapshot_write_next(&snapshot, PAGE_SIZE); 663 error = snapshot_write_next(&snapshot, PAGE_SIZE);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
deleted file mode 100644
index 5b3601bd1893..000000000000
--- a/kernel/power/swsusp.c
+++ /dev/null
@@ -1,58 +0,0 @@
1/*
2 * linux/kernel/power/swsusp.c
3 *
4 * This file provides code to write suspend image to swap and read it back.
5 *
6 * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8 *
9 * This file is released under the GPLv2.
10 *
11 * I'd like to thank the following people for their work:
12 *
13 * Pavel Machek <pavel@ucw.cz>:
14 * Modifications, defectiveness pointing, being with me at the very beginning,
15 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
16 *
17 * Steve Doddi <dirk@loth.demon.co.uk>:
18 * Support the possibility of hardware state restoring.
19 *
20 * Raph <grey.havens@earthling.net>:
21 * Support for preserving states of network devices and virtual console
22 * (including X and svgatextmode)
23 *
24 * Kurt Garloff <garloff@suse.de>:
25 * Straightened the critical function in order to prevent compilers from
26 * playing tricks with local variables.
27 *
28 * Andreas Mohr <a.mohr@mailto.de>
29 *
30 * Alex Badea <vampire@go.ro>:
31 * Fixed runaway init
32 *
33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Reworked the freeing of memory and the handling of swap
35 *
36 * More state savers are welcome. Especially for the scsi layer...
37 *
38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39 */
40
41#include <linux/mm.h>
42#include <linux/suspend.h>
43#include <linux/spinlock.h>
44#include <linux/kernel.h>
45#include <linux/major.h>
46#include <linux/swap.h>
47#include <linux/pm.h>
48#include <linux/swapops.h>
49#include <linux/bootmem.h>
50#include <linux/syscalls.h>
51#include <linux/highmem.h>
52#include <linux/time.h>
53#include <linux/rbtree.h>
54#include <linux/io.h>
55
56#include "power.h"
57
58int in_suspend __nosavedata = 0;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bf0014d6a5f0..a8c96212bc1b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
195 return res; 195 return res;
196} 196}
197 197
198static void snapshot_deprecated_ioctl(unsigned int cmd)
199{
200 if (printk_ratelimit())
201 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
202 "be removed soon, update your suspend-to-disk "
203 "utilities\n",
204 __builtin_return_address(0), cmd);
205}
206
198static long snapshot_ioctl(struct file *filp, unsigned int cmd, 207static long snapshot_ioctl(struct file *filp, unsigned int cmd,
199 unsigned long arg) 208 unsigned long arg)
200{ 209{
@@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
246 data->frozen = 0; 255 data->frozen = 0;
247 break; 256 break;
248 257
249 case SNAPSHOT_CREATE_IMAGE:
250 case SNAPSHOT_ATOMIC_SNAPSHOT: 258 case SNAPSHOT_ATOMIC_SNAPSHOT:
259 snapshot_deprecated_ioctl(cmd);
260 case SNAPSHOT_CREATE_IMAGE:
251 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 261 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
252 error = -EPERM; 262 error = -EPERM;
253 break; 263 break;
@@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 data->ready = 0; 285 data->ready = 0;
276 break; 286 break;
277 287
278 case SNAPSHOT_PREF_IMAGE_SIZE:
279 case SNAPSHOT_SET_IMAGE_SIZE: 288 case SNAPSHOT_SET_IMAGE_SIZE:
289 snapshot_deprecated_ioctl(cmd);
290 case SNAPSHOT_PREF_IMAGE_SIZE:
280 image_size = arg; 291 image_size = arg;
281 break; 292 break;
282 293
@@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
290 error = put_user(size, (loff_t __user *)arg); 301 error = put_user(size, (loff_t __user *)arg);
291 break; 302 break;
292 303
293 case SNAPSHOT_AVAIL_SWAP_SIZE:
294 case SNAPSHOT_AVAIL_SWAP: 304 case SNAPSHOT_AVAIL_SWAP:
305 snapshot_deprecated_ioctl(cmd);
306 case SNAPSHOT_AVAIL_SWAP_SIZE:
295 size = count_swap_pages(data->swap, 1); 307 size = count_swap_pages(data->swap, 1);
296 size <<= PAGE_SHIFT; 308 size <<= PAGE_SHIFT;
297 error = put_user(size, (loff_t __user *)arg); 309 error = put_user(size, (loff_t __user *)arg);
298 break; 310 break;
299 311
300 case SNAPSHOT_ALLOC_SWAP_PAGE:
301 case SNAPSHOT_GET_SWAP_PAGE: 312 case SNAPSHOT_GET_SWAP_PAGE:
313 snapshot_deprecated_ioctl(cmd);
314 case SNAPSHOT_ALLOC_SWAP_PAGE:
302 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 315 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
303 error = -ENODEV; 316 error = -ENODEV;
304 break; 317 break;
@@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
321 break; 334 break;
322 335
323 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ 336 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
337 snapshot_deprecated_ioctl(cmd);
324 if (!swsusp_swap_in_use()) { 338 if (!swsusp_swap_in_use()) {
325 /* 339 /*
326 * User space encodes device types as two-byte values, 340 * User space encodes device types as two-byte values,
@@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
362 break; 376 break;
363 377
364 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ 378 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
379 snapshot_deprecated_ioctl(cmd);
365 error = -EINVAL; 380 error = -EINVAL;
366 381
367 switch (arg) { 382 switch (arg) {
@@ -405,7 +420,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
405 * User space encodes device types as two-byte values, 420 * User space encodes device types as two-byte values,
406 * so we need to recode them 421 * so we need to recode them
407 */ 422 */
408 swdev = old_decode_dev(swap_area.dev); 423 swdev = new_decode_dev(swap_area.dev);
409 if (swdev) { 424 if (swdev) {
410 offset = swap_area.offset; 425 offset = swap_area.offset;
411 data->swap = swap_type_of(swdev, offset, NULL); 426 data->swap = swap_type_of(swdev, offset, NULL);
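
The reordered case labels above let each deprecated ioctl name log a rate-limited notice through snapshot_deprecated_ioctl() and then fall through into the handler of its preferred replacement, while the last hunk switches the swap-area device decode from old_decode_dev() to new_decode_dev(). A userspace sketch issuing the preferred SNAPSHOT_AVAIL_SWAP_SIZE request (assumes <linux/suspend_ioctls.h> and /dev/snapshot are available; illustrative only):

#define _GNU_SOURCE			/* for loff_t in userspace */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <linux/suspend_ioctls.h>

int main(void)
{
	loff_t size;
	int fd = open("/dev/snapshot", O_RDONLY);

	if (fd < 0) {
		perror("/dev/snapshot");
		return 1;
	}
	/* Preferred name; SNAPSHOT_AVAIL_SWAP now warns before doing the same. */
	if (ioctl(fd, SNAPSHOT_AVAIL_SWAP_SIZE, &size) == 0)
		printf("available swap: %lld bytes\n", (long long)size);
	else
		perror("SNAPSHOT_AVAIL_SWAP_SIZE");
	close(fd);
	return 0;
}
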
diff --git a/kernel/printk.c b/kernel/printk.c
index 1751c456b71f..75077ad0b537 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -35,6 +35,7 @@
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h> 36#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h> 37#include <linux/kmsg_dump.h>
38#include <linux/syslog.h>
38 39
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40 41
@@ -69,8 +70,6 @@ int console_printk[4] = {
69 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 70 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
70}; 71};
71 72
72static int saved_console_loglevel = -1;
73
74/* 73/*
75 * Low level drivers may need that to know if they can schedule in 74 * Low level drivers may need that to know if they can schedule in
76 * their unblank() callback or not. So let's export it. 75 * their unblank() callback or not. So let's export it.
@@ -145,6 +144,7 @@ static char __log_buf[__LOG_BUF_LEN];
145static char *log_buf = __log_buf; 144static char *log_buf = __log_buf;
146static int log_buf_len = __LOG_BUF_LEN; 145static int log_buf_len = __LOG_BUF_LEN;
147static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 146static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
147static int saved_console_loglevel = -1;
148 148
149#ifdef CONFIG_KEXEC 149#ifdef CONFIG_KEXEC
150/* 150/*
@@ -258,38 +258,23 @@ static inline void boot_delay_msec(void)
258} 258}
259#endif 259#endif
260 260
261/* 261int do_syslog(int type, char __user *buf, int len, bool from_file)
262 * Commands to do_syslog:
263 *
264 * 0 -- Close the log. Currently a NOP.
265 * 1 -- Open the log. Currently a NOP.
266 * 2 -- Read from the log.
267 * 3 -- Read all messages remaining in the ring buffer.
268 * 4 -- Read and clear all messages remaining in the ring buffer
269 * 5 -- Clear ring buffer.
270 * 6 -- Disable printk's to console
271 * 7 -- Enable printk's to console
272 * 8 -- Set level of messages printed to console
273 * 9 -- Return number of unread characters in the log buffer
274 * 10 -- Return size of the log buffer
275 */
276int do_syslog(int type, char __user *buf, int len)
277{ 262{
278 unsigned i, j, limit, count; 263 unsigned i, j, limit, count;
279 int do_clear = 0; 264 int do_clear = 0;
280 char c; 265 char c;
281 int error = 0; 266 int error = 0;
282 267
283 error = security_syslog(type); 268 error = security_syslog(type, from_file);
284 if (error) 269 if (error)
285 return error; 270 return error;
286 271
287 switch (type) { 272 switch (type) {
288 case 0: /* Close log */ 273 case SYSLOG_ACTION_CLOSE: /* Close log */
289 break; 274 break;
290 case 1: /* Open log */ 275 case SYSLOG_ACTION_OPEN: /* Open log */
291 break; 276 break;
292 case 2: /* Read from log */ 277 case SYSLOG_ACTION_READ: /* Read from log */
293 error = -EINVAL; 278 error = -EINVAL;
294 if (!buf || len < 0) 279 if (!buf || len < 0)
295 goto out; 280 goto out;
@@ -320,10 +305,12 @@ int do_syslog(int type, char __user *buf, int len)
320 if (!error) 305 if (!error)
321 error = i; 306 error = i;
322 break; 307 break;
323 case 4: /* Read/clear last kernel messages */ 308 /* Read/clear last kernel messages */
309 case SYSLOG_ACTION_READ_CLEAR:
324 do_clear = 1; 310 do_clear = 1;
325 /* FALL THRU */ 311 /* FALL THRU */
326 case 3: /* Read last kernel messages */ 312 /* Read last kernel messages */
313 case SYSLOG_ACTION_READ_ALL:
327 error = -EINVAL; 314 error = -EINVAL;
328 if (!buf || len < 0) 315 if (!buf || len < 0)
329 goto out; 316 goto out;
@@ -376,21 +363,25 @@ int do_syslog(int type, char __user *buf, int len)
376 } 363 }
377 } 364 }
378 break; 365 break;
379 case 5: /* Clear ring buffer */ 366 /* Clear ring buffer */
367 case SYSLOG_ACTION_CLEAR:
380 logged_chars = 0; 368 logged_chars = 0;
381 break; 369 break;
382 case 6: /* Disable logging to console */ 370 /* Disable logging to console */
371 case SYSLOG_ACTION_CONSOLE_OFF:
383 if (saved_console_loglevel == -1) 372 if (saved_console_loglevel == -1)
384 saved_console_loglevel = console_loglevel; 373 saved_console_loglevel = console_loglevel;
385 console_loglevel = minimum_console_loglevel; 374 console_loglevel = minimum_console_loglevel;
386 break; 375 break;
387 case 7: /* Enable logging to console */ 376 /* Enable logging to console */
377 case SYSLOG_ACTION_CONSOLE_ON:
388 if (saved_console_loglevel != -1) { 378 if (saved_console_loglevel != -1) {
389 console_loglevel = saved_console_loglevel; 379 console_loglevel = saved_console_loglevel;
390 saved_console_loglevel = -1; 380 saved_console_loglevel = -1;
391 } 381 }
392 break; 382 break;
393 case 8: /* Set level of messages printed to console */ 383 /* Set level of messages printed to console */
384 case SYSLOG_ACTION_CONSOLE_LEVEL:
394 error = -EINVAL; 385 error = -EINVAL;
395 if (len < 1 || len > 8) 386 if (len < 1 || len > 8)
396 goto out; 387 goto out;
@@ -401,10 +392,12 @@ int do_syslog(int type, char __user *buf, int len)
401 saved_console_loglevel = -1; 392 saved_console_loglevel = -1;
402 error = 0; 393 error = 0;
403 break; 394 break;
404 case 9: /* Number of chars in the log buffer */ 395 /* Number of chars in the log buffer */
396 case SYSLOG_ACTION_SIZE_UNREAD:
405 error = log_end - log_start; 397 error = log_end - log_start;
406 break; 398 break;
407 case 10: /* Size of the log buffer */ 399 /* Size of the log buffer */
400 case SYSLOG_ACTION_SIZE_BUFFER:
408 error = log_buf_len; 401 error = log_buf_len;
409 break; 402 break;
410 default: 403 default:
@@ -417,7 +410,7 @@ out:
417 410
418SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 411SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
419{ 412{
420 return do_syslog(type, buf, len); 413 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
421} 414}
422 415
423/* 416/*
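
In do_syslog() above, the bare numeric case labels become the SYSLOG_ACTION_* constants from the new <linux/syslog.h>, and the from_file argument lets security_syslog() tell the syslog(2) syscall apart from reads of /proc/kmsg. The syscall ABI itself is unchanged, so userspace still passes the same action codes, for example through glibc's klogctl() (illustrative sketch; some actions require privilege):

#include <stdio.h>
#include <stdlib.h>
#include <sys/klog.h>

int main(void)
{
	/* 10 == SYSLOG_ACTION_SIZE_BUFFER, 3 == SYSLOG_ACTION_READ_ALL */
	int len = klogctl(10, NULL, 0);
	char *buf;

	if (len <= 0) {
		perror("klogctl(SIZE_BUFFER)");
		return 1;
	}
	buf = malloc(len);
	if (!buf)
		return 1;
	len = klogctl(3, buf, len);
	if (len > 0)
		fwrite(buf, 1, len, stdout);
	free(buf);
	return 0;
}
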
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 23bd09cd042e..42ad8ae729a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/regset.h>
25 26
26 27
27/* 28/*
@@ -511,6 +512,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
511 return 0; 512 return 0;
512} 513}
513 514
515#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
516
517static const struct user_regset *
518find_regset(const struct user_regset_view *view, unsigned int type)
519{
520 const struct user_regset *regset;
521 int n;
522
523 for (n = 0; n < view->n; ++n) {
524 regset = view->regsets + n;
525 if (regset->core_note_type == type)
526 return regset;
527 }
528
529 return NULL;
530}
531
532static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
533 struct iovec *kiov)
534{
535 const struct user_regset_view *view = task_user_regset_view(task);
536 const struct user_regset *regset = find_regset(view, type);
537 int regset_no;
538
539 if (!regset || (kiov->iov_len % regset->size) != 0)
540 return -EINVAL;
541
542 regset_no = regset - view->regsets;
543 kiov->iov_len = min(kiov->iov_len,
544 (__kernel_size_t) (regset->n * regset->size));
545
546 if (req == PTRACE_GETREGSET)
547 return copy_regset_to_user(task, view, regset_no, 0,
548 kiov->iov_len, kiov->iov_base);
549 else
550 return copy_regset_from_user(task, view, regset_no, 0,
551 kiov->iov_len, kiov->iov_base);
552}
553
554#endif
555
514int ptrace_request(struct task_struct *child, long request, 556int ptrace_request(struct task_struct *child, long request,
515 long addr, long data) 557 long addr, long data)
516{ 558{
@@ -573,6 +615,26 @@ int ptrace_request(struct task_struct *child, long request,
573 return 0; 615 return 0;
574 return ptrace_resume(child, request, SIGKILL); 616 return ptrace_resume(child, request, SIGKILL);
575 617
618#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
619 case PTRACE_GETREGSET:
620 case PTRACE_SETREGSET:
621 {
622 struct iovec kiov;
623 struct iovec __user *uiov = (struct iovec __user *) data;
624
625 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
626 return -EFAULT;
627
628 if (__get_user(kiov.iov_base, &uiov->iov_base) ||
629 __get_user(kiov.iov_len, &uiov->iov_len))
630 return -EFAULT;
631
632 ret = ptrace_regset(child, request, addr, &kiov);
633 if (!ret)
634 ret = __put_user(kiov.iov_len, &uiov->iov_len);
635 break;
636 }
637#endif
576 default: 638 default:
577 break; 639 break;
578 } 640 }
@@ -711,6 +773,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
711 else 773 else
712 ret = ptrace_setsiginfo(child, &siginfo); 774 ret = ptrace_setsiginfo(child, &siginfo);
713 break; 775 break;
776#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
777 case PTRACE_GETREGSET:
778 case PTRACE_SETREGSET:
779 {
780 struct iovec kiov;
781 struct compat_iovec __user *uiov =
782 (struct compat_iovec __user *) datap;
783 compat_uptr_t ptr;
784 compat_size_t len;
785
786 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
787 return -EFAULT;
788
789 if (__get_user(ptr, &uiov->iov_base) ||
790 __get_user(len, &uiov->iov_len))
791 return -EFAULT;
792
793 kiov.iov_base = compat_ptr(ptr);
794 kiov.iov_len = len;
795
796 ret = ptrace_regset(child, request, addr, &kiov);
797 if (!ret)
798 ret = __put_user(kiov.iov_len, &uiov->iov_len);
799 break;
800 }
801#endif
714 802
715 default: 803 default:
716 ret = ptrace_request(child, request, addr, data); 804 ret = ptrace_request(child, request, addr, data);
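
PTRACE_GETREGSET and PTRACE_SETREGSET above give architectures with CONFIG_HAVE_ARCH_TRACEHOOK a generic way to transfer any register set in the task's user_regset_view: the regset type is passed in addr as an ELF note number and the data as a struct iovec, whose iov_len is clamped to the regset size and written back on success. A minimal x86-64 userspace sketch fetching the general-purpose registers via NT_PRSTATUS (the 0x4204 fallback define covers libcs that predate the request):

#include <stdio.h>
#include <elf.h>		/* NT_PRSTATUS */
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/uio.h>		/* struct iovec */
#include <sys/user.h>		/* struct user_regs_struct */
#include <sys/wait.h>

#ifndef PTRACE_GETREGSET
#define PTRACE_GETREGSET 0x4204
#endif

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {			/* child: stop and wait to be inspected */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);
		_exit(0);
	}
	waitpid(pid, NULL, 0);

	struct user_regs_struct regs;
	struct iovec iov = { .iov_base = &regs, .iov_len = sizeof(regs) };

	if (ptrace(PTRACE_GETREGSET, pid, (void *)NT_PRSTATUS, &iov) == 0)
		printf("rip = %#llx (%zu bytes copied)\n",
		       (unsigned long long)regs.rip, iov.iov_len);
	else
		perror("PTRACE_GETREGSET");

	kill(pid, SIGKILL);
	return 0;
}
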
diff --git a/kernel/range.c b/kernel/range.c
new file mode 100644
index 000000000000..74e2e6114927
--- /dev/null
+++ b/kernel/range.c
@@ -0,0 +1,163 @@
1/*
2 * Range add and subtract
3 */
4#include <linux/module.h>
5#include <linux/init.h>
6#include <linux/sort.h>
7
8#include <linux/range.h>
9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{
16 if (start >= end)
17 return nr_range;
18
19 /* Out of slots: */
20 if (nr_range >= az)
21 return nr_range;
22
23 range[nr_range].start = start;
24 range[nr_range].end = end;
25
26 nr_range++;
27
28 return nr_range;
29}
30
31int add_range_with_merge(struct range *range, int az, int nr_range,
32 u64 start, u64 end)
33{
34 int i;
35
36 if (start >= end)
37 return nr_range;
38
39 /* Try to merge it with old one: */
40 for (i = 0; i < nr_range; i++) {
41 u64 final_start, final_end;
42 u64 common_start, common_end;
43
44 if (!range[i].end)
45 continue;
46
47 common_start = max(range[i].start, start);
48 common_end = min(range[i].end, end);
49 if (common_start > common_end)
50 continue;
51
52 final_start = min(range[i].start, start);
53 final_end = max(range[i].end, end);
54
55 range[i].start = final_start;
56 range[i].end = final_end;
57 return nr_range;
58 }
59
60 /* Need to add it: */
61 return add_range(range, az, nr_range, start, end);
62}
63
64void subtract_range(struct range *range, int az, u64 start, u64 end)
65{
66 int i, j;
67
68 if (start >= end)
69 return;
70
71 for (j = 0; j < az; j++) {
72 if (!range[j].end)
73 continue;
74
75 if (start <= range[j].start && end >= range[j].end) {
76 range[j].start = 0;
77 range[j].end = 0;
78 continue;
79 }
80
81 if (start <= range[j].start && end < range[j].end &&
82 range[j].start < end) {
83 range[j].start = end;
84 continue;
85 }
86
87
88 if (start > range[j].start && end >= range[j].end &&
89 range[j].end > start) {
90 range[j].end = start;
91 continue;
92 }
93
94 if (start > range[j].start && end < range[j].end) {
95 /* Find the new spare: */
96 for (i = 0; i < az; i++) {
97 if (range[i].end == 0)
98 break;
99 }
100 if (i < az) {
101 range[i].end = range[j].end;
102 range[i].start = end;
103 } else {
104 printk(KERN_ERR "run of slot in ranges\n");
105 }
106 range[j].end = start;
107 continue;
108 }
109 }
110}
111
112static int cmp_range(const void *x1, const void *x2)
113{
114 const struct range *r1 = x1;
115 const struct range *r2 = x2;
116 s64 start1, start2;
117
118 start1 = r1->start;
119 start2 = r2->start;
120
121 return start1 - start2;
122}
123
124int clean_sort_range(struct range *range, int az)
125{
126 int i, j, k = az - 1, nr_range = 0;
127
128 for (i = 0; i < k; i++) {
129 if (range[i].end)
130 continue;
131 for (j = k; j > i; j--) {
132 if (range[j].end) {
133 k = j;
134 break;
135 }
136 }
137 if (j == i)
138 break;
139 range[i].start = range[k].start;
140 range[i].end = range[k].end;
141 range[k].start = 0;
142 range[k].end = 0;
143 k--;
144 }
145 /* count it */
146 for (i = 0; i < az; i++) {
147 if (!range[i].end) {
148 nr_range = i;
149 break;
150 }
151 }
152
153 /* sort them */
154 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
155
156 return nr_range;
157}
158
159void sort_range(struct range *range, int nr_range)
160{
161 /* sort them */
162 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
163}
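
kernel/range.c above is a new file: add_range() appends an entry, add_range_with_merge() first tries to widen an overlapping entry, subtract_range() punches holes (splitting an entry when the hole falls in the middle), and clean_sort_range() compacts zeroed slots before sorting. To make the merge semantics concrete, here is a standalone illustration with the merge logic transcribed from the patch (a demo, not the kernel build):

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start, end; };

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

static int add_range(struct range *range, int az, int nr,
		     uint64_t start, uint64_t end)
{
	if (start >= end || nr >= az)
		return nr;
	range[nr].start = start;
	range[nr].end = end;
	return nr + 1;
}

static int add_range_with_merge(struct range *range, int az, int nr,
				uint64_t start, uint64_t end)
{
	int i;

	if (start >= end)
		return nr;
	for (i = 0; i < nr; i++) {
		if (!range[i].end)
			continue;
		if (MAX(range[i].start, start) > MIN(range[i].end, end))
			continue;	/* no overlap with this entry */
		range[i].start = MIN(range[i].start, start);
		range[i].end = MAX(range[i].end, end);
		return nr;
	}
	return add_range(range, az, nr, start, end);	/* no merge possible */
}

int main(void)
{
	struct range r[4] = { { 0, 0 } };
	int i, nr = 0;

	nr = add_range_with_merge(r, 4, nr, 0x1000, 0x2000);
	nr = add_range_with_merge(r, 4, nr, 0x1800, 0x3000);	/* widens slot 0 */
	nr = add_range_with_merge(r, 4, nr, 0x8000, 0x9000);	/* new slot */

	for (i = 0; i < nr; i++)
		printf("range %d: start=%#llx end=%#llx\n", i,
		       (unsigned long long)r[i].start,
		       (unsigned long long)r[i].end);
	/* expected: 0x1000..0x3000 and 0x8000..0x9000 */
	return 0;
}
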
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 9b7fd4723878..49d808e833b0 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,14 +44,73 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
48#include <linux/hardirq.h>
47 49
48#ifdef CONFIG_DEBUG_LOCK_ALLOC 50#ifdef CONFIG_DEBUG_LOCK_ALLOC
49static struct lock_class_key rcu_lock_key; 51static struct lock_class_key rcu_lock_key;
50struct lockdep_map rcu_lock_map = 52struct lockdep_map rcu_lock_map =
51 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); 53 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
52EXPORT_SYMBOL_GPL(rcu_lock_map); 54EXPORT_SYMBOL_GPL(rcu_lock_map);
55
56static struct lock_class_key rcu_bh_lock_key;
57struct lockdep_map rcu_bh_lock_map =
58 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
59EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
60
61static struct lock_class_key rcu_sched_lock_key;
62struct lockdep_map rcu_sched_lock_map =
63 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
64EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
53#endif 65#endif
54 66
67int rcu_scheduler_active __read_mostly;
68EXPORT_SYMBOL_GPL(rcu_scheduler_active);
69
70#ifdef CONFIG_DEBUG_LOCK_ALLOC
71
72int debug_lockdep_rcu_enabled(void)
73{
74 return rcu_scheduler_active && debug_locks &&
75 current->lockdep_recursion == 0;
76}
77EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
78
79/**
80 * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
81 *
82 * Check for bottom half being disabled, which covers both the
83 * CONFIG_PROVE_RCU and not cases. Note that if someone uses
84 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
85 * will show the situation.
86 *
87 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
88 */
89int rcu_read_lock_bh_held(void)
90{
91 if (!debug_lockdep_rcu_enabled())
92 return 1;
93 return in_softirq();
94}
95EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
96
97#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
98
99/*
100 * This function is invoked towards the end of the scheduler's initialization
101 * process. Before this is called, the idle task might contain
102 * RCU read-side critical sections (during which time, this idle
103 * task is booting the system). After this function is called, the
104 * idle tasks are prohibited from containing RCU read-side critical
105 * sections.
106 */
107void rcu_scheduler_starting(void)
108{
109 WARN_ON(num_online_cpus() != 1);
110 WARN_ON(nr_context_switches() > 0);
111 rcu_scheduler_active = 1;
112}
113
55/* 114/*
56 * Awaken the corresponding synchronize_rcu() instance now that a 115 * Awaken the corresponding synchronize_rcu() instance now that a
57 * grace period has elapsed. 116 * grace period has elapsed.
@@ -63,3 +122,14 @@ void wakeme_after_rcu(struct rcu_head *head)
63 rcu = container_of(head, struct rcu_synchronize, head); 122 rcu = container_of(head, struct rcu_synchronize, head);
64 complete(&rcu->completion); 123 complete(&rcu->completion);
65} 124}
125
126#ifdef CONFIG_PROVE_RCU
127/*
128 * wrapper function to avoid #include problems.
129 */
130int rcu_my_thread_group_empty(void)
131{
132 return thread_group_empty(current);
133}
134EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
135#endif /* #ifdef CONFIG_PROVE_RCU */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9bb52177af02..58df55bf83ed 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */
64static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 67static char *torture_type = "rcu"; /* What RCU implementation to torture. */
65 68
66module_param(nreaders, int, 0444); 69module_param(nreaders, int, 0444);
@@ -79,6 +82,12 @@ module_param(stutter, int, 0444);
79MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); 82MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
80module_param(irqreader, int, 0444); 83module_param(irqreader, int, 0444);
81MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 84MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
85module_param(fqs_duration, int, 0444);
86MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)");
87module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
82module_param(torture_type, charp, 0444); 91module_param(torture_type, charp, 0444);
83MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
84 93
@@ -99,6 +108,7 @@ static struct task_struct **reader_tasks;
99static struct task_struct *stats_task; 108static struct task_struct *stats_task;
100static struct task_struct *shuffler_task; 109static struct task_struct *shuffler_task;
101static struct task_struct *stutter_task; 110static struct task_struct *stutter_task;
111static struct task_struct *fqs_task;
102 112
103#define RCU_TORTURE_PIPE_LEN 10 113#define RCU_TORTURE_PIPE_LEN 10
104 114
@@ -263,6 +273,7 @@ struct rcu_torture_ops {
263 void (*deferred_free)(struct rcu_torture *p); 273 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 274 void (*sync)(void);
265 void (*cb_barrier)(void); 275 void (*cb_barrier)(void);
276 void (*fqs)(void);
266 int (*stats)(char *page); 277 int (*stats)(char *page);
267 int irq_capable; 278 int irq_capable;
268 char *name; 279 char *name;
@@ -347,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = {
347 .deferred_free = rcu_torture_deferred_free, 358 .deferred_free = rcu_torture_deferred_free,
348 .sync = synchronize_rcu, 359 .sync = synchronize_rcu,
349 .cb_barrier = rcu_barrier, 360 .cb_barrier = rcu_barrier,
361 .fqs = rcu_force_quiescent_state,
350 .stats = NULL, 362 .stats = NULL,
351 .irq_capable = 1, 363 .irq_capable = 1,
352 .name = "rcu" 364 .name = "rcu"
@@ -388,6 +400,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
388 .deferred_free = rcu_sync_torture_deferred_free, 400 .deferred_free = rcu_sync_torture_deferred_free,
389 .sync = synchronize_rcu, 401 .sync = synchronize_rcu,
390 .cb_barrier = NULL, 402 .cb_barrier = NULL,
403 .fqs = rcu_force_quiescent_state,
391 .stats = NULL, 404 .stats = NULL,
392 .irq_capable = 1, 405 .irq_capable = 1,
393 .name = "rcu_sync" 406 .name = "rcu_sync"
@@ -403,6 +416,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
403 .deferred_free = rcu_sync_torture_deferred_free, 416 .deferred_free = rcu_sync_torture_deferred_free,
404 .sync = synchronize_rcu_expedited, 417 .sync = synchronize_rcu_expedited,
405 .cb_barrier = NULL, 418 .cb_barrier = NULL,
419 .fqs = rcu_force_quiescent_state,
406 .stats = NULL, 420 .stats = NULL,
407 .irq_capable = 1, 421 .irq_capable = 1,
408 .name = "rcu_expedited" 422 .name = "rcu_expedited"
@@ -465,6 +479,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
465 .deferred_free = rcu_bh_torture_deferred_free, 479 .deferred_free = rcu_bh_torture_deferred_free,
466 .sync = rcu_bh_torture_synchronize, 480 .sync = rcu_bh_torture_synchronize,
467 .cb_barrier = rcu_barrier_bh, 481 .cb_barrier = rcu_barrier_bh,
482 .fqs = rcu_bh_force_quiescent_state,
468 .stats = NULL, 483 .stats = NULL,
469 .irq_capable = 1, 484 .irq_capable = 1,
470 .name = "rcu_bh" 485 .name = "rcu_bh"
@@ -480,6 +495,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
480 .deferred_free = rcu_sync_torture_deferred_free, 495 .deferred_free = rcu_sync_torture_deferred_free,
481 .sync = rcu_bh_torture_synchronize, 496 .sync = rcu_bh_torture_synchronize,
482 .cb_barrier = NULL, 497 .cb_barrier = NULL,
498 .fqs = rcu_bh_force_quiescent_state,
483 .stats = NULL, 499 .stats = NULL,
484 .irq_capable = 1, 500 .irq_capable = 1,
485 .name = "rcu_bh_sync" 501 .name = "rcu_bh_sync"
@@ -621,6 +637,7 @@ static struct rcu_torture_ops sched_ops = {
621 .deferred_free = rcu_sched_torture_deferred_free, 637 .deferred_free = rcu_sched_torture_deferred_free,
622 .sync = sched_torture_synchronize, 638 .sync = sched_torture_synchronize,
623 .cb_barrier = rcu_barrier_sched, 639 .cb_barrier = rcu_barrier_sched,
640 .fqs = rcu_sched_force_quiescent_state,
624 .stats = NULL, 641 .stats = NULL,
625 .irq_capable = 1, 642 .irq_capable = 1,
626 .name = "sched" 643 .name = "sched"
@@ -636,6 +653,7 @@ static struct rcu_torture_ops sched_sync_ops = {
636 .deferred_free = rcu_sync_torture_deferred_free, 653 .deferred_free = rcu_sync_torture_deferred_free,
637 .sync = sched_torture_synchronize, 654 .sync = sched_torture_synchronize,
638 .cb_barrier = NULL, 655 .cb_barrier = NULL,
656 .fqs = rcu_sched_force_quiescent_state,
639 .stats = NULL, 657 .stats = NULL,
640 .name = "sched_sync" 658 .name = "sched_sync"
641}; 659};
@@ -650,12 +668,45 @@ static struct rcu_torture_ops sched_expedited_ops = {
650 .deferred_free = rcu_sync_torture_deferred_free, 668 .deferred_free = rcu_sync_torture_deferred_free,
651 .sync = synchronize_sched_expedited, 669 .sync = synchronize_sched_expedited,
652 .cb_barrier = NULL, 670 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state,
653 .stats = rcu_expedited_torture_stats, 672 .stats = rcu_expedited_torture_stats,
654 .irq_capable = 1, 673 .irq_capable = 1,
655 .name = "sched_expedited" 674 .name = "sched_expedited"
656}; 675};
657 676
658/* 677/*
678 * RCU torture force-quiescent-state kthread. Repeatedly induces
679 * bursts of calls to force_quiescent_state(), increasing the probability
680 * of occurrence of some important types of race conditions.
681 */
682static int
683rcu_torture_fqs(void *arg)
684{
685 unsigned long fqs_resume_time;
686 int fqs_burst_remaining;
687
688 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
689 do {
690 fqs_resume_time = jiffies + fqs_stutter * HZ;
691 while (jiffies - fqs_resume_time > LONG_MAX) {
692 schedule_timeout_interruptible(1);
693 }
694 fqs_burst_remaining = fqs_duration;
695 while (fqs_burst_remaining > 0) {
696 cur_ops->fqs();
697 udelay(fqs_holdoff);
698 fqs_burst_remaining -= fqs_holdoff;
699 }
700 rcu_stutter_wait("rcu_torture_fqs");
701 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
702 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
703 rcutorture_shutdown_absorb("rcu_torture_fqs");
704 while (!kthread_should_stop())
705 schedule_timeout_uninterruptible(1);
706 return 0;
707}
708
709/*
659 * RCU torture writer kthread. Repeatedly substitutes a new structure 710 * RCU torture writer kthread. Repeatedly substitutes a new structure
660 * for that pointed to by rcu_torture_current, freeing the old structure 711 * for that pointed to by rcu_torture_current, freeing the old structure
661 * after a series of grace periods (the "pipeline"). 712 * after a series of grace periods (the "pipeline").
@@ -745,7 +796,11 @@ static void rcu_torture_timer(unsigned long unused)
745 796
746 idx = cur_ops->readlock(); 797 idx = cur_ops->readlock();
747 completed = cur_ops->completed(); 798 completed = cur_ops->completed();
748 p = rcu_dereference(rcu_torture_current); 799 p = rcu_dereference_check(rcu_torture_current,
800 rcu_read_lock_held() ||
801 rcu_read_lock_bh_held() ||
802 rcu_read_lock_sched_held() ||
803 srcu_read_lock_held(&srcu_ctl));
749 if (p == NULL) { 804 if (p == NULL) {
750 /* Leave because rcu_torture_writer is not yet underway */ 805 /* Leave because rcu_torture_writer is not yet underway */
751 cur_ops->readunlock(idx); 806 cur_ops->readunlock(idx);
@@ -763,13 +818,13 @@ static void rcu_torture_timer(unsigned long unused)
763 /* Should not happen, but... */ 818 /* Should not happen, but... */
764 pipe_count = RCU_TORTURE_PIPE_LEN; 819 pipe_count = RCU_TORTURE_PIPE_LEN;
765 } 820 }
766 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 821 __this_cpu_inc(rcu_torture_count[pipe_count]);
767 completed = cur_ops->completed() - completed; 822 completed = cur_ops->completed() - completed;
768 if (completed > RCU_TORTURE_PIPE_LEN) { 823 if (completed > RCU_TORTURE_PIPE_LEN) {
769 /* Should not happen, but... */ 824 /* Should not happen, but... */
770 completed = RCU_TORTURE_PIPE_LEN; 825 completed = RCU_TORTURE_PIPE_LEN;
771 } 826 }
772 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 827 __this_cpu_inc(rcu_torture_batch[completed]);
773 preempt_enable(); 828 preempt_enable();
774 cur_ops->readunlock(idx); 829 cur_ops->readunlock(idx);
775} 830}
@@ -798,11 +853,15 @@ rcu_torture_reader(void *arg)
798 do { 853 do {
799 if (irqreader && cur_ops->irq_capable) { 854 if (irqreader && cur_ops->irq_capable) {
800 if (!timer_pending(&t)) 855 if (!timer_pending(&t))
801 mod_timer(&t, 1); 856 mod_timer(&t, jiffies + 1);
802 } 857 }
803 idx = cur_ops->readlock(); 858 idx = cur_ops->readlock();
804 completed = cur_ops->completed(); 859 completed = cur_ops->completed();
805 p = rcu_dereference(rcu_torture_current); 860 p = rcu_dereference_check(rcu_torture_current,
861 rcu_read_lock_held() ||
862 rcu_read_lock_bh_held() ||
863 rcu_read_lock_sched_held() ||
864 srcu_read_lock_held(&srcu_ctl));
806 if (p == NULL) { 865 if (p == NULL) {
807 /* Wait for rcu_torture_writer to get underway */ 866 /* Wait for rcu_torture_writer to get underway */
808 cur_ops->readunlock(idx); 867 cur_ops->readunlock(idx);
@@ -818,13 +877,13 @@ rcu_torture_reader(void *arg)
818 /* Should not happen, but... */ 877 /* Should not happen, but... */
819 pipe_count = RCU_TORTURE_PIPE_LEN; 878 pipe_count = RCU_TORTURE_PIPE_LEN;
820 } 879 }
821 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 880 __this_cpu_inc(rcu_torture_count[pipe_count]);
822 completed = cur_ops->completed() - completed; 881 completed = cur_ops->completed() - completed;
823 if (completed > RCU_TORTURE_PIPE_LEN) { 882 if (completed > RCU_TORTURE_PIPE_LEN) {
824 /* Should not happen, but... */ 883 /* Should not happen, but... */
825 completed = RCU_TORTURE_PIPE_LEN; 884 completed = RCU_TORTURE_PIPE_LEN;
826 } 885 }
827 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 886 __this_cpu_inc(rcu_torture_batch[completed]);
828 preempt_enable(); 887 preempt_enable();
829 cur_ops->readunlock(idx); 888 cur_ops->readunlock(idx);
830 schedule(); 889 schedule();
@@ -1030,10 +1089,11 @@ rcu_torture_print_module_parms(char *tag)
1030 printk(KERN_ALERT "%s" TORTURE_FLAG 1089 printk(KERN_ALERT "%s" TORTURE_FLAG
1031 "--- %s: nreaders=%d nfakewriters=%d " 1090 "--- %s: nreaders=%d nfakewriters=%d "
1032 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1091 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1033 "shuffle_interval=%d stutter=%d irqreader=%d\n", 1092 "shuffle_interval=%d stutter=%d irqreader=%d "
1093 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
1034 torture_type, tag, nrealreaders, nfakewriters, 1094 torture_type, tag, nrealreaders, nfakewriters,
1035 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1095 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1036 stutter, irqreader); 1096 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
1037} 1097}
1038 1098
1039static struct notifier_block rcutorture_nb = { 1099static struct notifier_block rcutorture_nb = {
@@ -1109,6 +1169,12 @@ rcu_torture_cleanup(void)
1109 } 1169 }
1110 stats_task = NULL; 1170 stats_task = NULL;
1111 1171
1172 if (fqs_task) {
1173 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
1174 kthread_stop(fqs_task);
1175 }
1176 fqs_task = NULL;
1177
1112 /* Wait for all RCU callbacks to fire. */ 1178 /* Wait for all RCU callbacks to fire. */
1113 1179
1114 if (cur_ops->cb_barrier != NULL) 1180 if (cur_ops->cb_barrier != NULL)
@@ -1154,6 +1220,11 @@ rcu_torture_init(void)
1154 mutex_unlock(&fullstop_mutex); 1220 mutex_unlock(&fullstop_mutex);
1155 return -EINVAL; 1221 return -EINVAL;
1156 } 1222 }
1223 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1224 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
1225 "fqs_duration, fqs disabled.\n");
1226 fqs_duration = 0;
1227 }
1157 if (cur_ops->init) 1228 if (cur_ops->init)
1158 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1229 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
1159 1230
@@ -1282,6 +1353,19 @@ rcu_torture_init(void)
1282 goto unwind; 1353 goto unwind;
1283 } 1354 }
1284 } 1355 }
1356 if (fqs_duration < 0)
1357 fqs_duration = 0;
1358 if (fqs_duration) {
1359 /* Create the stutter thread */
1360 fqs_task = kthread_run(rcu_torture_fqs, NULL,
1361 "rcu_torture_fqs");
1362 if (IS_ERR(fqs_task)) {
1363 firsterr = PTR_ERR(fqs_task);
1364 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
1365 fqs_task = NULL;
1366 goto unwind;
1367 }
1368 }
1285 register_reboot_notifier(&rcutorture_nb); 1369 register_reboot_notifier(&rcutorture_nb);
1286 mutex_unlock(&fullstop_mutex); 1370 mutex_unlock(&fullstop_mutex);
1287 return 0; 1371 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 53ae9598f798..3ec8160fc75f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,7 +46,6 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
50 49
51#include "rcutree.h" 50#include "rcutree.h"
52 51
@@ -66,11 +65,11 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
66 .signaled = RCU_GP_IDLE, \ 65 .signaled = RCU_GP_IDLE, \
67 .gpnum = -300, \ 66 .gpnum = -300, \
68 .completed = -300, \ 67 .completed = -300, \
69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ 68 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .orphan_cbs_list = NULL, \ 69 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &name.orphan_cbs_list, \ 70 .orphan_cbs_tail = &name.orphan_cbs_list, \
72 .orphan_qlen = 0, \ 71 .orphan_qlen = 0, \
73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ 72 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \
74 .n_force_qs = 0, \ 73 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 74 .n_force_qs_ngp = 0, \
76} 75}
@@ -81,9 +80,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 80struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 81DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
83 82
84static int rcu_scheduler_active __read_mostly;
85
86
87/* 83/*
88 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 84 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
89 * permit this function to be invoked without holding the root rcu_node 85 * permit this function to be invoked without holding the root rcu_node
@@ -157,6 +153,24 @@ long rcu_batches_completed_bh(void)
157EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 153EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
158 154
159/* 155/*
156 * Force a quiescent state for RCU BH.
157 */
158void rcu_bh_force_quiescent_state(void)
159{
160 force_quiescent_state(&rcu_bh_state, 0);
161}
162EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
163
164/*
165 * Force a quiescent state for RCU-sched.
166 */
167void rcu_sched_force_quiescent_state(void)
168{
169 force_quiescent_state(&rcu_sched_state, 0);
170}
171EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
172
173/*
160 * Does the CPU have callbacks ready to be invoked? 174 * Does the CPU have callbacks ready to be invoked?
161 */ 175 */
162static int 176static int
@@ -439,10 +453,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
439 453
440 /* Only let one CPU complain about others per time interval. */ 454 /* Only let one CPU complain about others per time interval. */
441 455
442 spin_lock_irqsave(&rnp->lock, flags); 456 raw_spin_lock_irqsave(&rnp->lock, flags);
443 delta = jiffies - rsp->jiffies_stall; 457 delta = jiffies - rsp->jiffies_stall;
444 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 458 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
445 spin_unlock_irqrestore(&rnp->lock, flags); 459 raw_spin_unlock_irqrestore(&rnp->lock, flags);
446 return; 460 return;
447 } 461 }
448 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 462 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
@@ -452,13 +466,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
452 * due to CPU offlining. 466 * due to CPU offlining.
453 */ 467 */
454 rcu_print_task_stall(rnp); 468 rcu_print_task_stall(rnp);
455 spin_unlock_irqrestore(&rnp->lock, flags); 469 raw_spin_unlock_irqrestore(&rnp->lock, flags);
456 470
457 /* OK, time to rat on our buddy... */ 471 /* OK, time to rat on our buddy... */
458 472
459 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 473 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
460 rcu_for_each_leaf_node(rsp, rnp) { 474 rcu_for_each_leaf_node(rsp, rnp) {
475 raw_spin_lock_irqsave(&rnp->lock, flags);
461 rcu_print_task_stall(rnp); 476 rcu_print_task_stall(rnp);
477 raw_spin_unlock_irqrestore(&rnp->lock, flags);
462 if (rnp->qsmask == 0) 478 if (rnp->qsmask == 0)
463 continue; 479 continue;
464 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 480 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
@@ -469,6 +485,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
469 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 485 smp_processor_id(), (long)(jiffies - rsp->gp_start));
470 trigger_all_cpu_backtrace(); 486 trigger_all_cpu_backtrace();
471 487
488 /* If so configured, complain about tasks blocking the grace period. */
489
490 rcu_print_detail_task_stall(rsp);
491
472 force_quiescent_state(rsp, 0); /* Kick them all. */ 492 force_quiescent_state(rsp, 0); /* Kick them all. */
473} 493}
474 494
@@ -481,11 +501,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
481 smp_processor_id(), jiffies - rsp->gp_start); 501 smp_processor_id(), jiffies - rsp->gp_start);
482 trigger_all_cpu_backtrace(); 502 trigger_all_cpu_backtrace();
483 503
484 spin_lock_irqsave(&rnp->lock, flags); 504 raw_spin_lock_irqsave(&rnp->lock, flags);
485 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 505 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
486 rsp->jiffies_stall = 506 rsp->jiffies_stall =
487 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 507 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
488 spin_unlock_irqrestore(&rnp->lock, flags); 508 raw_spin_unlock_irqrestore(&rnp->lock, flags);
489 509
490 set_need_resched(); /* kick ourselves to get things going. */ 510 set_need_resched(); /* kick ourselves to get things going. */
491} 511}
@@ -545,12 +565,12 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
545 local_irq_save(flags); 565 local_irq_save(flags);
546 rnp = rdp->mynode; 566 rnp = rdp->mynode;
547 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ 567 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
548 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 568 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
549 local_irq_restore(flags); 569 local_irq_restore(flags);
550 return; 570 return;
551 } 571 }
552 __note_new_gpnum(rsp, rnp, rdp); 572 __note_new_gpnum(rsp, rnp, rdp);
553 spin_unlock_irqrestore(&rnp->lock, flags); 573 raw_spin_unlock_irqrestore(&rnp->lock, flags);
554} 574}
555 575
556/* 576/*
@@ -609,12 +629,12 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
609 local_irq_save(flags); 629 local_irq_save(flags);
610 rnp = rdp->mynode; 630 rnp = rdp->mynode;
611 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ 631 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
612 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 632 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
613 local_irq_restore(flags); 633 local_irq_restore(flags);
614 return; 634 return;
615 } 635 }
616 __rcu_process_gp_end(rsp, rnp, rdp); 636 __rcu_process_gp_end(rsp, rnp, rdp);
617 spin_unlock_irqrestore(&rnp->lock, flags); 637 raw_spin_unlock_irqrestore(&rnp->lock, flags);
618} 638}
619 639
620/* 640/*
@@ -659,12 +679,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
659 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 679 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
660 struct rcu_node *rnp = rcu_get_root(rsp); 680 struct rcu_node *rnp = rcu_get_root(rsp);
661 681
662 if (!cpu_needs_another_gp(rsp, rdp)) { 682 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
683 if (cpu_needs_another_gp(rsp, rdp))
684 rsp->fqs_need_gp = 1;
663 if (rnp->completed == rsp->completed) { 685 if (rnp->completed == rsp->completed) {
664 spin_unlock_irqrestore(&rnp->lock, flags); 686 raw_spin_unlock_irqrestore(&rnp->lock, flags);
665 return; 687 return;
666 } 688 }
667 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 689 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
668 690
669 /* 691 /*
670 * Propagate new ->completed value to rcu_node structures 692 * Propagate new ->completed value to rcu_node structures
@@ -672,9 +694,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
672 * of the next grace period to process their callbacks. 694 * of the next grace period to process their callbacks.
673 */ 695 */
674 rcu_for_each_node_breadth_first(rsp, rnp) { 696 rcu_for_each_node_breadth_first(rsp, rnp) {
675 spin_lock(&rnp->lock); /* irqs already disabled. */ 697 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
676 rnp->completed = rsp->completed; 698 rnp->completed = rsp->completed;
677 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 699 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
678 } 700 }
679 local_irq_restore(flags); 701 local_irq_restore(flags);
680 return; 702 return;
@@ -695,15 +717,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
695 rnp->completed = rsp->completed; 717 rnp->completed = rsp->completed;
696 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 718 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
697 rcu_start_gp_per_cpu(rsp, rnp, rdp); 719 rcu_start_gp_per_cpu(rsp, rnp, rdp);
698 spin_unlock_irqrestore(&rnp->lock, flags); 720 raw_spin_unlock_irqrestore(&rnp->lock, flags);
699 return; 721 return;
700 } 722 }
701 723
702 spin_unlock(&rnp->lock); /* leave irqs disabled. */ 724 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
703 725
704 726
705 /* Exclude any concurrent CPU-hotplug operations. */ 727 /* Exclude any concurrent CPU-hotplug operations. */
706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 728 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
707 729
708 /* 730 /*
709 * Set the quiescent-state-needed bits in all the rcu_node 731 * Set the quiescent-state-needed bits in all the rcu_node
@@ -723,21 +745,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
723 * irqs disabled. 745 * irqs disabled.
724 */ 746 */
725 rcu_for_each_node_breadth_first(rsp, rnp) { 747 rcu_for_each_node_breadth_first(rsp, rnp) {
726 spin_lock(&rnp->lock); /* irqs already disabled. */ 748 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
727 rcu_preempt_check_blocked_tasks(rnp); 749 rcu_preempt_check_blocked_tasks(rnp);
728 rnp->qsmask = rnp->qsmaskinit; 750 rnp->qsmask = rnp->qsmaskinit;
729 rnp->gpnum = rsp->gpnum; 751 rnp->gpnum = rsp->gpnum;
730 rnp->completed = rsp->completed; 752 rnp->completed = rsp->completed;
731 if (rnp == rdp->mynode) 753 if (rnp == rdp->mynode)
732 rcu_start_gp_per_cpu(rsp, rnp, rdp); 754 rcu_start_gp_per_cpu(rsp, rnp, rdp);
733 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 755 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
734 } 756 }
735 757
736 rnp = rcu_get_root(rsp); 758 rnp = rcu_get_root(rsp);
737 spin_lock(&rnp->lock); /* irqs already disabled. */ 759 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
738 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 760 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
739 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 761 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
740 spin_unlock_irqrestore(&rsp->onofflock, flags); 762 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
741} 763}
742 764
743/* 765/*
@@ -776,14 +798,14 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
776 if (!(rnp->qsmask & mask)) { 798 if (!(rnp->qsmask & mask)) {
777 799
778 /* Our bit has already been cleared, so done. */ 800 /* Our bit has already been cleared, so done. */
779 spin_unlock_irqrestore(&rnp->lock, flags); 801 raw_spin_unlock_irqrestore(&rnp->lock, flags);
780 return; 802 return;
781 } 803 }
782 rnp->qsmask &= ~mask; 804 rnp->qsmask &= ~mask;
783 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 805 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
784 806
785 /* Other bits still set at this level, so done. */ 807 /* Other bits still set at this level, so done. */
786 spin_unlock_irqrestore(&rnp->lock, flags); 808 raw_spin_unlock_irqrestore(&rnp->lock, flags);
787 return; 809 return;
788 } 810 }
789 mask = rnp->grpmask; 811 mask = rnp->grpmask;
@@ -793,10 +815,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
793 815
794 break; 816 break;
795 } 817 }
796 spin_unlock_irqrestore(&rnp->lock, flags); 818 raw_spin_unlock_irqrestore(&rnp->lock, flags);
797 rnp_c = rnp; 819 rnp_c = rnp;
798 rnp = rnp->parent; 820 rnp = rnp->parent;
799 spin_lock_irqsave(&rnp->lock, flags); 821 raw_spin_lock_irqsave(&rnp->lock, flags);
800 WARN_ON_ONCE(rnp_c->qsmask); 822 WARN_ON_ONCE(rnp_c->qsmask);
801 } 823 }
802 824
@@ -825,7 +847,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
825 struct rcu_node *rnp; 847 struct rcu_node *rnp;
826 848
827 rnp = rdp->mynode; 849 rnp = rdp->mynode;
828 spin_lock_irqsave(&rnp->lock, flags); 850 raw_spin_lock_irqsave(&rnp->lock, flags);
829 if (lastcomp != rnp->completed) { 851 if (lastcomp != rnp->completed) {
830 852
831 /* 853 /*
@@ -837,12 +859,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
837 * race occurred. 859 * race occurred.
838 */ 860 */
839 rdp->passed_quiesc = 0; /* try again later! */ 861 rdp->passed_quiesc = 0; /* try again later! */
840 spin_unlock_irqrestore(&rnp->lock, flags); 862 raw_spin_unlock_irqrestore(&rnp->lock, flags);
841 return; 863 return;
842 } 864 }
843 mask = rdp->grpmask; 865 mask = rdp->grpmask;
844 if ((rnp->qsmask & mask) == 0) { 866 if ((rnp->qsmask & mask) == 0) {
845 spin_unlock_irqrestore(&rnp->lock, flags); 867 raw_spin_unlock_irqrestore(&rnp->lock, flags);
846 } else { 868 } else {
847 rdp->qs_pending = 0; 869 rdp->qs_pending = 0;
848 870
@@ -906,7 +928,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
906 928
907 if (rdp->nxtlist == NULL) 929 if (rdp->nxtlist == NULL)
908 return; /* irqs disabled, so comparison is stable. */ 930 return; /* irqs disabled, so comparison is stable. */
909 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 931 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
910 *rsp->orphan_cbs_tail = rdp->nxtlist; 932 *rsp->orphan_cbs_tail = rdp->nxtlist;
911 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 933 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
912 rdp->nxtlist = NULL; 934 rdp->nxtlist = NULL;
@@ -914,7 +936,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
914 rdp->nxttail[i] = &rdp->nxtlist; 936 rdp->nxttail[i] = &rdp->nxtlist;
915 rsp->orphan_qlen += rdp->qlen; 937 rsp->orphan_qlen += rdp->qlen;
916 rdp->qlen = 0; 938 rdp->qlen = 0;
917 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 939 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
918} 940}
919 941
920/* 942/*
@@ -925,10 +947,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
925 unsigned long flags; 947 unsigned long flags;
926 struct rcu_data *rdp; 948 struct rcu_data *rdp;
927 949
928 spin_lock_irqsave(&rsp->onofflock, flags); 950 raw_spin_lock_irqsave(&rsp->onofflock, flags);
929 rdp = rsp->rda[smp_processor_id()]; 951 rdp = rsp->rda[smp_processor_id()];
930 if (rsp->orphan_cbs_list == NULL) { 952 if (rsp->orphan_cbs_list == NULL) {
931 spin_unlock_irqrestore(&rsp->onofflock, flags); 953 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
932 return; 954 return;
933 } 955 }
934 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; 956 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
@@ -937,7 +959,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
937 rsp->orphan_cbs_list = NULL; 959 rsp->orphan_cbs_list = NULL;
938 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; 960 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
939 rsp->orphan_qlen = 0; 961 rsp->orphan_qlen = 0;
940 spin_unlock_irqrestore(&rsp->onofflock, flags); 962 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
941} 963}
942 964
943/* 965/*
@@ -953,23 +975,23 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
953 struct rcu_node *rnp; 975 struct rcu_node *rnp;
954 976
955 /* Exclude any attempts to start a new grace period. */ 977 /* Exclude any attempts to start a new grace period. */
956 spin_lock_irqsave(&rsp->onofflock, flags); 978 raw_spin_lock_irqsave(&rsp->onofflock, flags);
957 979
958 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 980 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
959 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ 981 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
960 mask = rdp->grpmask; /* rnp->grplo is constant. */ 982 mask = rdp->grpmask; /* rnp->grplo is constant. */
961 do { 983 do {
962 spin_lock(&rnp->lock); /* irqs already disabled. */ 984 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
963 rnp->qsmaskinit &= ~mask; 985 rnp->qsmaskinit &= ~mask;
964 if (rnp->qsmaskinit != 0) { 986 if (rnp->qsmaskinit != 0) {
965 if (rnp != rdp->mynode) 987 if (rnp != rdp->mynode)
966 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 988 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
967 break; 989 break;
968 } 990 }
969 if (rnp == rdp->mynode) 991 if (rnp == rdp->mynode)
970 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 992 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
971 else 993 else
972 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 994 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
973 mask = rnp->grpmask; 995 mask = rnp->grpmask;
974 rnp = rnp->parent; 996 rnp = rnp->parent;
975 } while (rnp != NULL); 997 } while (rnp != NULL);
@@ -980,12 +1002,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
980 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1002 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
981 * held leads to deadlock. 1003 * held leads to deadlock.
982 */ 1004 */
983 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1005 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
984 rnp = rdp->mynode; 1006 rnp = rdp->mynode;
985 if (need_report & RCU_OFL_TASKS_NORM_GP) 1007 if (need_report & RCU_OFL_TASKS_NORM_GP)
986 rcu_report_unblock_qs_rnp(rnp, flags); 1008 rcu_report_unblock_qs_rnp(rnp, flags);
987 else 1009 else
988 spin_unlock_irqrestore(&rnp->lock, flags); 1010 raw_spin_unlock_irqrestore(&rnp->lock, flags);
989 if (need_report & RCU_OFL_TASKS_EXP_GP) 1011 if (need_report & RCU_OFL_TASKS_EXP_GP)
990 rcu_report_exp_rnp(rsp, rnp); 1012 rcu_report_exp_rnp(rsp, rnp);
991 1013
@@ -1144,11 +1166,9 @@ void rcu_check_callbacks(int cpu, int user)
1144/* 1166/*
1145 * Scan the leaf rcu_node structures, processing dyntick state for any that 1167 * Scan the leaf rcu_node structures, processing dyntick state for any that
1146 * have not yet encountered a quiescent state, using the function specified. 1168 * have not yet encountered a quiescent state, using the function specified.
1147 * Returns 1 if the current grace period ends while scanning (possibly 1169 * The caller must have suppressed start of new grace periods.
1148 * because we made it end).
1149 */ 1170 */
1150static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, 1171static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1151 int (*f)(struct rcu_data *))
1152{ 1172{
1153 unsigned long bit; 1173 unsigned long bit;
1154 int cpu; 1174 int cpu;
@@ -1158,13 +1178,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1158 1178
1159 rcu_for_each_leaf_node(rsp, rnp) { 1179 rcu_for_each_leaf_node(rsp, rnp) {
1160 mask = 0; 1180 mask = 0;
1161 spin_lock_irqsave(&rnp->lock, flags); 1181 raw_spin_lock_irqsave(&rnp->lock, flags);
1162 if (rnp->completed != lastcomp) { 1182 if (!rcu_gp_in_progress(rsp)) {
1163 spin_unlock_irqrestore(&rnp->lock, flags); 1183 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1164 return 1; 1184 return;
1165 } 1185 }
1166 if (rnp->qsmask == 0) { 1186 if (rnp->qsmask == 0) {
1167 spin_unlock_irqrestore(&rnp->lock, flags); 1187 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1168 continue; 1188 continue;
1169 } 1189 }
1170 cpu = rnp->grplo; 1190 cpu = rnp->grplo;
@@ -1173,15 +1193,14 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1173 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1193 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1174 mask |= bit; 1194 mask |= bit;
1175 } 1195 }
1176 if (mask != 0 && rnp->completed == lastcomp) { 1196 if (mask != 0) {
1177 1197
1178 /* rcu_report_qs_rnp() releases rnp->lock. */ 1198 /* rcu_report_qs_rnp() releases rnp->lock. */
1179 rcu_report_qs_rnp(mask, rsp, rnp, flags); 1199 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1180 continue; 1200 continue;
1181 } 1201 }
1182 spin_unlock_irqrestore(&rnp->lock, flags); 1202 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1183 } 1203 }
1184 return 0;
1185} 1204}
1186 1205
1187/* 1206/*
@@ -1191,32 +1210,26 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1191static void force_quiescent_state(struct rcu_state *rsp, int relaxed) 1210static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1192{ 1211{
1193 unsigned long flags; 1212 unsigned long flags;
1194 long lastcomp;
1195 struct rcu_node *rnp = rcu_get_root(rsp); 1213 struct rcu_node *rnp = rcu_get_root(rsp);
1196 u8 signaled;
1197 u8 forcenow;
1198 1214
1199 if (!rcu_gp_in_progress(rsp)) 1215 if (!rcu_gp_in_progress(rsp))
1200 return; /* No grace period in progress, nothing to force. */ 1216 return; /* No grace period in progress, nothing to force. */
1201 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { 1217 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1202 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1218 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1203 return; /* Someone else is already on the job. */ 1219 return; /* Someone else is already on the job. */
1204 } 1220 }
1205 if (relaxed && 1221 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
1206 (long)(rsp->jiffies_force_qs - jiffies) >= 0) 1222 goto unlock_fqs_ret; /* no emergency and done recently. */
1207 goto unlock_ret; /* no emergency and done recently. */
1208 rsp->n_force_qs++; 1223 rsp->n_force_qs++;
1209 spin_lock(&rnp->lock); 1224 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1210 lastcomp = rsp->gpnum - 1;
1211 signaled = rsp->signaled;
1212 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1225 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1213 if(!rcu_gp_in_progress(rsp)) { 1226 if(!rcu_gp_in_progress(rsp)) {
1214 rsp->n_force_qs_ngp++; 1227 rsp->n_force_qs_ngp++;
1215 spin_unlock(&rnp->lock); 1228 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1216 goto unlock_ret; /* no GP in progress, time updated. */ 1229 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1217 } 1230 }
1218 spin_unlock(&rnp->lock); 1231 rsp->fqs_active = 1;
1219 switch (signaled) { 1232 switch (rsp->signaled) {
1220 case RCU_GP_IDLE: 1233 case RCU_GP_IDLE:
1221 case RCU_GP_INIT: 1234 case RCU_GP_INIT:
1222 1235
@@ -1224,45 +1237,38 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1224 1237
1225 case RCU_SAVE_DYNTICK: 1238 case RCU_SAVE_DYNTICK:
1226 1239
1240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1227 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1241 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1228 break; /* So gcc recognizes the dead code. */ 1242 break; /* So gcc recognizes the dead code. */
1229 1243
1230 /* Record dyntick-idle state. */ 1244 /* Record dyntick-idle state. */
1231 if (rcu_process_dyntick(rsp, lastcomp, 1245 force_qs_rnp(rsp, dyntick_save_progress_counter);
1232 dyntick_save_progress_counter)) 1246 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1233 goto unlock_ret; 1247 if (rcu_gp_in_progress(rsp))
1234 /* fall into next case. */
1235
1236 case RCU_SAVE_COMPLETED:
1237
1238 /* Update state, record completion counter. */
1239 forcenow = 0;
1240 spin_lock(&rnp->lock);
1241 if (lastcomp + 1 == rsp->gpnum &&
1242 lastcomp == rsp->completed &&
1243 rsp->signaled == signaled) {
1244 rsp->signaled = RCU_FORCE_QS; 1248 rsp->signaled = RCU_FORCE_QS;
1245 rsp->completed_fqs = lastcomp; 1249 break;
1246 forcenow = signaled == RCU_SAVE_COMPLETED;
1247 }
1248 spin_unlock(&rnp->lock);
1249 if (!forcenow)
1250 break;
1251 /* fall into next case. */
1252 1250
1253 case RCU_FORCE_QS: 1251 case RCU_FORCE_QS:
1254 1252
1255 /* Check dyntick-idle state, send IPI to laggarts. */ 1253 /* Check dyntick-idle state, send IPI to laggarts. */
1256 if (rcu_process_dyntick(rsp, rsp->completed_fqs, 1254 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1257 rcu_implicit_dynticks_qs)) 1255 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1258 goto unlock_ret;
1259 1256
1260 /* Leave state in case more forcing is required. */ 1257 /* Leave state in case more forcing is required. */
1261 1258
1259 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1262 break; 1260 break;
1263 } 1261 }
1264unlock_ret: 1262 rsp->fqs_active = 0;
1265 spin_unlock_irqrestore(&rsp->fqslock, flags); 1263 if (rsp->fqs_need_gp) {
1264 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1265 rsp->fqs_need_gp = 0;
1266 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1267 return;
1268 }
1269 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1270unlock_fqs_ret:
1271 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1266} 1272}
1267 1273
1268#else /* #ifdef CONFIG_SMP */ 1274#else /* #ifdef CONFIG_SMP */
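
The rework above holds the root rcu_node lock around force_quiescent_state()'s state transitions (dropping it only for the leaf-node scans) and replaces the old lastcomp/signaled snapshots with a small handshake: while ->fqs_active is set, rcu_start_gp() does not initialize a new grace period itself but records the request in ->fqs_need_gp, and force_quiescent_state() honors that request on its way out. A condensed stand-alone sketch of the handshake, with the locking and the per-state work elided and only the field names taken from the patch:

struct fqs_state {
	int fqs_active;		/* force_quiescent_state() is running. */
	int fqs_need_gp;	/* A grace-period start was deferred to it. */
};

static void start_gp(struct fqs_state *s, int need_gp)
{
	if (s->fqs_active) {			/* forcing in progress owns the root lock... */
		if (need_gp)
			s->fqs_need_gp = 1;	/* ...so just leave it a note. */
		return;
	}
	/* ...otherwise initialize the new grace period here. */
}

static void force_qs(struct fqs_state *s)
{
	s->fqs_active = 1;
	/* ...run the RCU_SAVE_DYNTICK / RCU_FORCE_QS passes here... */
	s->fqs_active = 0;
	if (s->fqs_need_gp) {
		s->fqs_need_gp = 0;
		start_gp(s, 1);		/* honor the deferred grace-period request. */
	}
}
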
@@ -1290,7 +1296,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1290 * If an RCU GP has gone long enough, go check for dyntick 1296 * If an RCU GP has gone long enough, go check for dyntick
1291 * idle CPUs and, if needed, send resched IPIs. 1297 * idle CPUs and, if needed, send resched IPIs.
1292 */ 1298 */
1293 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1299 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1294 force_quiescent_state(rsp, 1); 1300 force_quiescent_state(rsp, 1);
1295 1301
1296 /* 1302 /*
@@ -1304,7 +1310,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1304 1310
1305 /* Does this CPU require a not-yet-started grace period? */ 1311 /* Does this CPU require a not-yet-started grace period? */
1306 if (cpu_needs_another_gp(rsp, rdp)) { 1312 if (cpu_needs_another_gp(rsp, rdp)) {
1307 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); 1313 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1308 rcu_start_gp(rsp, flags); /* releases above lock */ 1314 rcu_start_gp(rsp, flags); /* releases above lock */
1309 } 1315 }
1310 1316
@@ -1335,6 +1341,9 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1335 * grace-period manipulations above. 1341 * grace-period manipulations above.
1336 */ 1342 */
1337 smp_mb(); /* See above block comment. */ 1343 smp_mb(); /* See above block comment. */
1344
1345 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1346 rcu_needs_cpu_flush();
1338} 1347}
1339 1348
1340static void 1349static void
@@ -1369,7 +1378,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1369 unsigned long nestflag; 1378 unsigned long nestflag;
1370 struct rcu_node *rnp_root = rcu_get_root(rsp); 1379 struct rcu_node *rnp_root = rcu_get_root(rsp);
1371 1380
1372 spin_lock_irqsave(&rnp_root->lock, nestflag); 1381 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1373 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ 1382 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1374 } 1383 }
1375 1384
@@ -1387,7 +1396,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1387 force_quiescent_state(rsp, 0); 1396 force_quiescent_state(rsp, 0);
1388 rdp->n_force_qs_snap = rsp->n_force_qs; 1397 rdp->n_force_qs_snap = rsp->n_force_qs;
1389 rdp->qlen_last_fqs_check = rdp->qlen; 1398 rdp->qlen_last_fqs_check = rdp->qlen;
1390 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1399 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1391 force_quiescent_state(rsp, 1); 1400 force_quiescent_state(rsp, 1);
1392 local_irq_restore(flags); 1401 local_irq_restore(flags);
1393} 1402}
@@ -1520,7 +1529,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1520 1529
1521 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1530 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1522 if (rcu_gp_in_progress(rsp) && 1531 if (rcu_gp_in_progress(rsp) &&
1523 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { 1532 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
1524 rdp->n_rp_need_fqs++; 1533 rdp->n_rp_need_fqs++;
1525 return 1; 1534 return 1;
1526 } 1535 }
@@ -1545,10 +1554,9 @@ static int rcu_pending(int cpu)
1545/* 1554/*
1546 * Check to see if any future RCU-related work will need to be done 1555 * Check to see if any future RCU-related work will need to be done
1547 * by the current CPU, even if none need be done immediately, returning 1556 * by the current CPU, even if none need be done immediately, returning
1548 * 1 if so. This function is part of the RCU implementation; it is -not- 1557 * 1 if so.
1549 * an exported member of the RCU API.
1550 */ 1558 */
1551int rcu_needs_cpu(int cpu) 1559static int rcu_needs_cpu_quick_check(int cpu)
1552{ 1560{
1553 /* RCU callbacks either ready or pending? */ 1561 /* RCU callbacks either ready or pending? */
1554 return per_cpu(rcu_sched_data, cpu).nxtlist || 1562 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1556,21 +1564,6 @@ int rcu_needs_cpu(int cpu)
1556 rcu_preempt_needs_cpu(cpu); 1564 rcu_preempt_needs_cpu(cpu);
1557} 1565}
1558 1566
1559/*
1560 * This function is invoked towards the end of the scheduler's initialization
1561 * process. Before this is called, the idle task might contain
1562 * RCU read-side critical sections (during which time, this idle
1563 * task is booting the system). After this function is called, the
1564 * idle tasks are prohibited from containing RCU read-side critical
1565 * sections.
1566 */
1567void rcu_scheduler_starting(void)
1568{
1569 WARN_ON(num_online_cpus() != 1);
1570 WARN_ON(nr_context_switches() > 0);
1571 rcu_scheduler_active = 1;
1572}
1573
1574static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 1567static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1575static atomic_t rcu_barrier_cpu_count; 1568static atomic_t rcu_barrier_cpu_count;
1576static DEFINE_MUTEX(rcu_barrier_mutex); 1569static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -1659,7 +1652,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1659 struct rcu_node *rnp = rcu_get_root(rsp); 1652 struct rcu_node *rnp = rcu_get_root(rsp);
1660 1653
1661 /* Set up local state, ensuring consistent view of global state. */ 1654 /* Set up local state, ensuring consistent view of global state. */
1662 spin_lock_irqsave(&rnp->lock, flags); 1655 raw_spin_lock_irqsave(&rnp->lock, flags);
1663 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 1656 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1664 rdp->nxtlist = NULL; 1657 rdp->nxtlist = NULL;
1665 for (i = 0; i < RCU_NEXT_SIZE; i++) 1658 for (i = 0; i < RCU_NEXT_SIZE; i++)
@@ -1669,7 +1662,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1669 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 1662 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1670#endif /* #ifdef CONFIG_NO_HZ */ 1663#endif /* #ifdef CONFIG_NO_HZ */
1671 rdp->cpu = cpu; 1664 rdp->cpu = cpu;
1672 spin_unlock_irqrestore(&rnp->lock, flags); 1665 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1673} 1666}
1674 1667
1675/* 1668/*
@@ -1687,7 +1680,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1687 struct rcu_node *rnp = rcu_get_root(rsp); 1680 struct rcu_node *rnp = rcu_get_root(rsp);
1688 1681
1689 /* Set up local state, ensuring consistent view of global state. */ 1682 /* Set up local state, ensuring consistent view of global state. */
1690 spin_lock_irqsave(&rnp->lock, flags); 1683 raw_spin_lock_irqsave(&rnp->lock, flags);
1691 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1684 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1692 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1685 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1693 rdp->beenonline = 1; /* We have now been online. */ 1686 rdp->beenonline = 1; /* We have now been online. */
@@ -1695,7 +1688,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1695 rdp->qlen_last_fqs_check = 0; 1688 rdp->qlen_last_fqs_check = 0;
1696 rdp->n_force_qs_snap = rsp->n_force_qs; 1689 rdp->n_force_qs_snap = rsp->n_force_qs;
1697 rdp->blimit = blimit; 1690 rdp->blimit = blimit;
1698 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1691 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1699 1692
1700 /* 1693 /*
1701 * A new grace period might start here. If so, we won't be part 1694 * A new grace period might start here. If so, we won't be part
@@ -1703,14 +1696,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1703 */ 1696 */
1704 1697
1705 /* Exclude any attempts to start a new GP on large systems. */ 1698 /* Exclude any attempts to start a new GP on large systems. */
1706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1699 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1707 1700
1708 /* Add CPU to rcu_node bitmasks. */ 1701 /* Add CPU to rcu_node bitmasks. */
1709 rnp = rdp->mynode; 1702 rnp = rdp->mynode;
1710 mask = rdp->grpmask; 1703 mask = rdp->grpmask;
1711 do { 1704 do {
1712 /* Exclude any attempts to start a new GP on small systems. */ 1705 /* Exclude any attempts to start a new GP on small systems. */
1713 spin_lock(&rnp->lock); /* irqs already disabled. */ 1706 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1714 rnp->qsmaskinit |= mask; 1707 rnp->qsmaskinit |= mask;
1715 mask = rnp->grpmask; 1708 mask = rnp->grpmask;
1716 if (rnp == rdp->mynode) { 1709 if (rnp == rdp->mynode) {
@@ -1718,11 +1711,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1718 rdp->completed = rnp->completed; 1711 rdp->completed = rnp->completed;
1719 rdp->passed_quiesc_completed = rnp->completed - 1; 1712 rdp->passed_quiesc_completed = rnp->completed - 1;
1720 } 1713 }
1721 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1714 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
1722 rnp = rnp->parent; 1715 rnp = rnp->parent;
1723 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1716 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1724 1717
1725 spin_unlock_irqrestore(&rsp->onofflock, flags); 1718 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1726} 1719}
1727 1720
1728static void __cpuinit rcu_online_cpu(int cpu) 1721static void __cpuinit rcu_online_cpu(int cpu)
@@ -1806,11 +1799,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1806 */ 1799 */
1807static void __init rcu_init_one(struct rcu_state *rsp) 1800static void __init rcu_init_one(struct rcu_state *rsp)
1808{ 1801{
1802 static char *buf[] = { "rcu_node_level_0",
1803 "rcu_node_level_1",
1804 "rcu_node_level_2",
1805 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */
1809 int cpustride = 1; 1806 int cpustride = 1;
1810 int i; 1807 int i;
1811 int j; 1808 int j;
1812 struct rcu_node *rnp; 1809 struct rcu_node *rnp;
1813 1810
1811 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
1812
1814 /* Initialize the level-tracking arrays. */ 1813 /* Initialize the level-tracking arrays. */
1815 1814
1816 for (i = 1; i < NUM_RCU_LVLS; i++) 1815 for (i = 1; i < NUM_RCU_LVLS; i++)
@@ -1823,8 +1822,9 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1823 cpustride *= rsp->levelspread[i]; 1822 cpustride *= rsp->levelspread[i];
1824 rnp = rsp->level[i]; 1823 rnp = rsp->level[i];
1825 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1824 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1826 spin_lock_init(&rnp->lock); 1825 raw_spin_lock_init(&rnp->lock);
1827 lockdep_set_class(&rnp->lock, &rcu_node_class[i]); 1826 lockdep_set_class_and_name(&rnp->lock,
1827 &rcu_node_class[i], buf[i]);
1828 rnp->gpnum = 0; 1828 rnp->gpnum = 0;
1829 rnp->qsmask = 0; 1829 rnp->qsmask = 0;
1830 rnp->qsmaskinit = 0; 1830 rnp->qsmaskinit = 0;
@@ -1876,7 +1876,7 @@ do { \
1876 1876
1877void __init rcu_init(void) 1877void __init rcu_init(void)
1878{ 1878{
1879 int i; 1879 int cpu;
1880 1880
1881 rcu_bootup_announce(); 1881 rcu_bootup_announce();
1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
@@ -1896,8 +1896,8 @@ void __init rcu_init(void)
1896 * or the scheduler are operational. 1896 * or the scheduler are operational.
1897 */ 1897 */
1898 cpu_notifier(rcu_cpu_notify, 0); 1898 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(i) 1899 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); 1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1901} 1901}
1902 1902
1903#include "rcutree_plugin.h" 1903#include "rcutree_plugin.h"
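
Most of the mechanical churn in the rcutree.c hunks above converts the rcu_node and rcu_state locks from spinlock_t to raw_spinlock_t, the variant that always remains a spinning lock even under configurations (such as the -rt patch set) where ordinary spinlocks can become sleeping locks; that is the usual rationale for such a conversion rather than anything stated in the diff itself. A minimal sketch of the raw API the patch switches to, with a hypothetical lock name:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);	/* hypothetical, stands in for rnp->lock */

static void example_critical_section(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&example_lock, flags);
	/* ...work that must not sleep and must exclude local interrupts... */
	raw_spin_unlock_irqrestore(&example_lock, flags);
}
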
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index d2a0046f63b2..4a525a30e08e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -90,12 +90,12 @@ struct rcu_dynticks {
90 * Definition for node within the RCU grace-period-detection hierarchy. 90 * Definition for node within the RCU grace-period-detection hierarchy.
91 */ 91 */
92struct rcu_node { 92struct rcu_node {
93 spinlock_t lock; /* Root rcu_node's lock protects some */ 93 raw_spinlock_t lock; /* Root rcu_node's lock protects some */
94 /* rcu_state fields as well as following. */ 94 /* rcu_state fields as well as following. */
95 long gpnum; /* Current grace period for this node. */ 95 unsigned long gpnum; /* Current grace period for this node. */
96 /* This will either be equal to or one */ 96 /* This will either be equal to or one */
97 /* behind the root rcu_node's gpnum. */ 97 /* behind the root rcu_node's gpnum. */
98 long completed; /* Last grace period completed for this node. */ 98 unsigned long completed; /* Last GP completed for this node. */
99 /* This will either be equal to or one */ 99 /* This will either be equal to or one */
100 /* behind the root rcu_node's gpnum. */ 100 /* behind the root rcu_node's gpnum. */
101 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
@@ -161,11 +161,11 @@ struct rcu_node {
161/* Per-CPU data for read-copy update. */ 161/* Per-CPU data for read-copy update. */
162struct rcu_data { 162struct rcu_data {
163 /* 1) quiescent-state and grace-period handling : */ 163 /* 1) quiescent-state and grace-period handling : */
164 long completed; /* Track rsp->completed gp number */ 164 unsigned long completed; /* Track rsp->completed gp number */
165 /* in order to detect GP end. */ 165 /* in order to detect GP end. */
166 long gpnum; /* Highest gp number that this CPU */ 166 unsigned long gpnum; /* Highest gp number that this CPU */
167 /* is aware of having started. */ 167 /* is aware of having started. */
168 long passed_quiesc_completed; 168 unsigned long passed_quiesc_completed;
169 /* Value of completed at time of qs. */ 169 /* Value of completed at time of qs. */
170 bool passed_quiesc; /* User-mode/idle loop etc. */ 170 bool passed_quiesc; /* User-mode/idle loop etc. */
171 bool qs_pending; /* Core waits for quiesc state. */ 171 bool qs_pending; /* Core waits for quiesc state. */
@@ -221,14 +221,14 @@ struct rcu_data {
221 unsigned long resched_ipi; /* Sent a resched IPI. */ 221 unsigned long resched_ipi; /* Sent a resched IPI. */
222 222
223 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
224 long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
225 long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
226 long n_rp_cb_ready; 226 unsigned long n_rp_cb_ready;
227 long n_rp_cpu_needs_gp; 227 unsigned long n_rp_cpu_needs_gp;
228 long n_rp_gp_completed; 228 unsigned long n_rp_gp_completed;
229 long n_rp_gp_started; 229 unsigned long n_rp_gp_started;
230 long n_rp_need_fqs; 230 unsigned long n_rp_need_fqs;
231 long n_rp_need_nothing; 231 unsigned long n_rp_need_nothing;
232 232
233 int cpu; 233 int cpu;
234}; 234};
@@ -237,25 +237,36 @@ struct rcu_data {
237#define RCU_GP_IDLE 0 /* No grace period in progress. */ 237#define RCU_GP_IDLE 0 /* No grace period in progress. */
238#define RCU_GP_INIT 1 /* Grace period being initialized. */ 238#define RCU_GP_INIT 1 /* Grace period being initialized. */
239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
240#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ 240#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
241#define RCU_FORCE_QS 4 /* Need to force quiescent state. */
242#ifdef CONFIG_NO_HZ 241#ifdef CONFIG_NO_HZ
243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 242#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
244#else /* #ifdef CONFIG_NO_HZ */ 243#else /* #ifdef CONFIG_NO_HZ */
245#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED 244#define RCU_SIGNAL_INIT RCU_FORCE_QS
246#endif /* #else #ifdef CONFIG_NO_HZ */ 245#endif /* #else #ifdef CONFIG_NO_HZ */
247 246
248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 247#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
249#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 248#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
250#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ 249
251#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ 250#ifdef CONFIG_PROVE_RCU
252#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 251#define RCU_STALL_DELAY_DELTA (5 * HZ)
253 /* to take at least one */ 252#else
254 /* scheduling clock irq */ 253#define RCU_STALL_DELAY_DELTA 0
255 /* before ratting on them. */ 254#endif
255
256#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA)
257 /* for rsp->jiffies_stall */
258#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
259 /* for rsp->jiffies_stall */
260#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
261 /* to take at least one */
262 /* scheduling clock irq */
263 /* before ratting on them. */
256 264
257#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 265#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
258 266
267#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
268#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
269
259/* 270/*
260 * RCU global state, including node hierarchy. This hierarchy is 271 * RCU global state, including node hierarchy. This hierarchy is
261 * represented in "heap" form in a dense array. The root (first level) 272 * represented in "heap" form in a dense array. The root (first level)
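
The ULONG_CMP_GE()/ULONG_CMP_LT() macros added above are the wraparound-safe comparisons that replace the old (long)(a - b) casts on jiffies and grace-period counters: the unsigned subtraction is taken modulo 2^BITS_PER_LONG and compared against ULONG_MAX / 2, so the ordering stays correct across a counter wrap as long as the two values are within half the counter space of each other. A small user-space illustration (not kernel code) of that behavior:

#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long before_wrap = ULONG_MAX - 1;	/* e.g. jiffies just before the wrap */
	unsigned long after_wrap = 3;			/* a few ticks later, after the wrap */

	/* A naive comparison gets the ordering backwards across the wrap... */
	printf("naive >=           : %d\n", after_wrap >= before_wrap);			/* prints 0 */
	/* ...while the modular comparisons still order the values correctly. */
	printf("ULONG_CMP_GE(a, b) : %d\n", ULONG_CMP_GE(after_wrap, before_wrap));	/* prints 1 */
	printf("ULONG_CMP_LT(b, a) : %d\n", ULONG_CMP_LT(before_wrap, after_wrap));	/* prints 1 */
	return 0;
}
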
@@ -277,12 +288,19 @@ struct rcu_state {
277 288
278 u8 signaled ____cacheline_internodealigned_in_smp; 289 u8 signaled ____cacheline_internodealigned_in_smp;
279 /* Force QS state. */ 290 /* Force QS state. */
280 long gpnum; /* Current gp number. */ 291 u8 fqs_active; /* force_quiescent_state() */
281 long completed; /* # of last completed gp. */ 292 /* is running. */
293 u8 fqs_need_gp; /* A CPU was prevented from */
294 /* starting a new grace */
295 /* period because */
296 /* force_quiescent_state() */
297 /* was running. */
298 unsigned long gpnum; /* Current gp number. */
299 unsigned long completed; /* # of last completed gp. */
282 300
283 /* End of fields guarded by root rcu_node's lock. */ 301 /* End of fields guarded by root rcu_node's lock. */
284 302
285 spinlock_t onofflock; /* exclude on/offline and */ 303 raw_spinlock_t onofflock; /* exclude on/offline and */
286 /* starting new GP. Also */ 304 /* starting new GP. Also */
287 /* protects the following */ 305 /* protects the following */
288 /* orphan_cbs fields. */ 306 /* orphan_cbs fields. */
@@ -292,10 +310,8 @@ struct rcu_state {
292 /* going offline. */ 310 /* going offline. */
293 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ 311 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
294 long orphan_qlen; /* Number of orphaned cbs. */ 312 long orphan_qlen; /* Number of orphaned cbs. */
295 spinlock_t fqslock; /* Only one task forcing */ 313 raw_spinlock_t fqslock; /* Only one task forcing */
296 /* quiescent states. */ 314 /* quiescent states. */
297 long completed_fqs; /* Value of completed @ snap. */
298 /* Protected by fqslock. */
299 unsigned long jiffies_force_qs; /* Time at which to invoke */ 315 unsigned long jiffies_force_qs; /* Time at which to invoke */
300 /* force_quiescent_state(). */ 316 /* force_quiescent_state(). */
301 unsigned long n_force_qs; /* Number of calls to */ 317 unsigned long n_force_qs; /* Number of calls to */
@@ -319,8 +335,6 @@ struct rcu_state {
319#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ 335#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
320 /* GP were moved to root. */ 336 /* GP were moved to root. */
321 337
322#ifdef RCU_TREE_NONCORE
323
324/* 338/*
325 * RCU implementation internal declarations: 339 * RCU implementation internal declarations:
326 */ 340 */
@@ -335,7 +349,7 @@ extern struct rcu_state rcu_preempt_state;
335DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 349DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
336#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 350#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
337 351
338#else /* #ifdef RCU_TREE_NONCORE */ 352#ifndef RCU_TREE_NONCORE
339 353
340/* Forward declarations for rcutree_plugin.h */ 354/* Forward declarations for rcutree_plugin.h */
341static void rcu_bootup_announce(void); 355static void rcu_bootup_announce(void);
@@ -347,6 +361,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
347 unsigned long flags); 361 unsigned long flags);
348#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 362#endif /* #ifdef CONFIG_HOTPLUG_CPU */
349#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 363#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
364static void rcu_print_detail_task_stall(struct rcu_state *rsp);
350static void rcu_print_task_stall(struct rcu_node *rnp); 365static void rcu_print_task_stall(struct rcu_node *rnp);
351#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 366#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
352static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 367static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
@@ -367,5 +382,6 @@ static int rcu_preempt_needs_cpu(int cpu);
367static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 382static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
368static void rcu_preempt_send_cbs_to_orphanage(void); 383static void rcu_preempt_send_cbs_to_orphanage(void);
369static void __init __rcu_init_preempt(void); 384static void __init __rcu_init_preempt(void);
385static void rcu_needs_cpu_flush(void);
370 386
371#endif /* #else #ifdef RCU_TREE_NONCORE */ 387#endif /* #ifndef RCU_TREE_NONCORE */
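
With RCU_SAVE_COMPLETED gone, the ->signaled field defined above moves through a shorter progression: RCU_GP_IDLE and RCU_GP_INIT while there is nothing to force, RCU_SAVE_DYNTICK for the first forcing pass of a grace period, and RCU_FORCE_QS for every later pass (and RCU_SIGNAL_INIT maps straight to RCU_FORCE_QS when CONFIG_NO_HZ is off, since there is no dyntick state to snapshot). A condensed sketch of that progression, reusing the state names above but none of the real locking:

enum fqs_stage {
	RCU_GP_IDLE = 0,	/* No grace period in progress. */
	RCU_GP_INIT = 1,	/* Grace period being initialized. */
	RCU_SAVE_DYNTICK = 2,	/* Need to scan dyntick state. */
	RCU_FORCE_QS = 3,	/* Need to force quiescent state. */
};

static void force_qs_step(enum fqs_stage *signaled)
{
	switch (*signaled) {
	case RCU_GP_IDLE:
	case RCU_GP_INIT:
		break;			/* grace period idle or still initializing */
	case RCU_SAVE_DYNTICK:
		/* first pass: snapshot each CPU's dyntick-idle counters... */
		*signaled = RCU_FORCE_QS;
		break;
	case RCU_FORCE_QS:
		/* later passes: recheck the snapshots, resched the holdouts... */
		break;			/* stay here until the grace period completes */
	}
}
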
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 37fbccdf41d5..79b53bda8943 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -62,6 +62,15 @@ long rcu_batches_completed(void)
62EXPORT_SYMBOL_GPL(rcu_batches_completed); 62EXPORT_SYMBOL_GPL(rcu_batches_completed);
63 63
64/* 64/*
65 * Force a quiescent state for preemptible RCU.
66 */
67void rcu_force_quiescent_state(void)
68{
69 force_quiescent_state(&rcu_preempt_state, 0);
70}
71EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
72
73/*
65 * Record a preemptable-RCU quiescent state for the specified CPU. Note 74 * Record a preemptable-RCU quiescent state for the specified CPU. Note
66 * that this just means that the task currently running on the CPU is 75 * that this just means that the task currently running on the CPU is
67 * not in a quiescent state. There might be any number of tasks blocked 76 * not in a quiescent state. There might be any number of tasks blocked
@@ -102,7 +111,7 @@ static void rcu_preempt_note_context_switch(int cpu)
102 /* Possibly blocking in an RCU read-side critical section. */ 111 /* Possibly blocking in an RCU read-side critical section. */
103 rdp = rcu_preempt_state.rda[cpu]; 112 rdp = rcu_preempt_state.rda[cpu];
104 rnp = rdp->mynode; 113 rnp = rdp->mynode;
105 spin_lock_irqsave(&rnp->lock, flags); 114 raw_spin_lock_irqsave(&rnp->lock, flags);
106 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 115 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
107 t->rcu_blocked_node = rnp; 116 t->rcu_blocked_node = rnp;
108 117
@@ -123,7 +132,7 @@ static void rcu_preempt_note_context_switch(int cpu)
123 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 132 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
124 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 133 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
125 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 134 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
126 spin_unlock_irqrestore(&rnp->lock, flags); 135 raw_spin_unlock_irqrestore(&rnp->lock, flags);
127 } 136 }
128 137
129 /* 138 /*
@@ -180,7 +189,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
180 struct rcu_node *rnp_p; 189 struct rcu_node *rnp_p;
181 190
182 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 191 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
183 spin_unlock_irqrestore(&rnp->lock, flags); 192 raw_spin_unlock_irqrestore(&rnp->lock, flags);
184 return; /* Still need more quiescent states! */ 193 return; /* Still need more quiescent states! */
185 } 194 }
186 195
@@ -197,8 +206,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
197 206
198 /* Report up the rest of the hierarchy. */ 207 /* Report up the rest of the hierarchy. */
199 mask = rnp->grpmask; 208 mask = rnp->grpmask;
200 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 209 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
201 spin_lock(&rnp_p->lock); /* irqs already disabled. */ 210 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
202 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); 211 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
203} 212}
204 213
@@ -248,10 +257,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
248 */ 257 */
249 for (;;) { 258 for (;;) {
250 rnp = t->rcu_blocked_node; 259 rnp = t->rcu_blocked_node;
251 spin_lock(&rnp->lock); /* irqs already disabled. */ 260 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
252 if (rnp == t->rcu_blocked_node) 261 if (rnp == t->rcu_blocked_node)
253 break; 262 break;
254 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 263 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
255 } 264 }
256 empty = !rcu_preempted_readers(rnp); 265 empty = !rcu_preempted_readers(rnp);
257 empty_exp = !rcu_preempted_readers_exp(rnp); 266 empty_exp = !rcu_preempted_readers_exp(rnp);
@@ -265,7 +274,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
265 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 274 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
266 */ 275 */
267 if (empty) 276 if (empty)
268 spin_unlock_irqrestore(&rnp->lock, flags); 277 raw_spin_unlock_irqrestore(&rnp->lock, flags);
269 else 278 else
270 rcu_report_unblock_qs_rnp(rnp, flags); 279 rcu_report_unblock_qs_rnp(rnp, flags);
271 280
@@ -295,29 +304,73 @@ void __rcu_read_unlock(void)
295 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 304 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
296 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 305 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
297 rcu_read_unlock_special(t); 306 rcu_read_unlock_special(t);
307#ifdef CONFIG_PROVE_LOCKING
308 WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0);
309#endif /* #ifdef CONFIG_PROVE_LOCKING */
298} 310}
299EXPORT_SYMBOL_GPL(__rcu_read_unlock); 311EXPORT_SYMBOL_GPL(__rcu_read_unlock);
300 312
301#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 313#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
302 314
315#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
316
317/*
318 * Dump detailed information for all tasks blocking the current RCU
319 * grace period on the specified rcu_node structure.
320 */
321static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
322{
323 unsigned long flags;
324 struct list_head *lp;
325 int phase;
326 struct task_struct *t;
327
328 if (rcu_preempted_readers(rnp)) {
329 raw_spin_lock_irqsave(&rnp->lock, flags);
330 phase = rnp->gpnum & 0x1;
331 lp = &rnp->blocked_tasks[phase];
332 list_for_each_entry(t, lp, rcu_node_entry)
333 sched_show_task(t);
334 raw_spin_unlock_irqrestore(&rnp->lock, flags);
335 }
336}
337
338/*
339 * Dump detailed information for all tasks blocking the current RCU
340 * grace period.
341 */
342static void rcu_print_detail_task_stall(struct rcu_state *rsp)
343{
344 struct rcu_node *rnp = rcu_get_root(rsp);
345
346 rcu_print_detail_task_stall_rnp(rnp);
347 rcu_for_each_leaf_node(rsp, rnp)
348 rcu_print_detail_task_stall_rnp(rnp);
349}
350
351#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
352
353static void rcu_print_detail_task_stall(struct rcu_state *rsp)
354{
355}
356
357#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
358
303/* 359/*
304 * Scan the current list of tasks blocked within RCU read-side critical 360 * Scan the current list of tasks blocked within RCU read-side critical
305 * sections, printing out the tid of each. 361 * sections, printing out the tid of each.
306 */ 362 */
307static void rcu_print_task_stall(struct rcu_node *rnp) 363static void rcu_print_task_stall(struct rcu_node *rnp)
308{ 364{
309 unsigned long flags;
310 struct list_head *lp; 365 struct list_head *lp;
311 int phase; 366 int phase;
312 struct task_struct *t; 367 struct task_struct *t;
313 368
314 if (rcu_preempted_readers(rnp)) { 369 if (rcu_preempted_readers(rnp)) {
315 spin_lock_irqsave(&rnp->lock, flags);
316 phase = rnp->gpnum & 0x1; 370 phase = rnp->gpnum & 0x1;
317 lp = &rnp->blocked_tasks[phase]; 371 lp = &rnp->blocked_tasks[phase];
318 list_for_each_entry(t, lp, rcu_node_entry) 372 list_for_each_entry(t, lp, rcu_node_entry)
319 printk(" P%d", t->pid); 373 printk(" P%d", t->pid);
320 spin_unlock_irqrestore(&rnp->lock, flags);
321 } 374 }
322} 375}
323 376
@@ -388,11 +441,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
388 lp_root = &rnp_root->blocked_tasks[i]; 441 lp_root = &rnp_root->blocked_tasks[i];
389 while (!list_empty(lp)) { 442 while (!list_empty(lp)) {
390 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 443 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
391 spin_lock(&rnp_root->lock); /* irqs already disabled */ 444 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
392 list_del(&tp->rcu_node_entry); 445 list_del(&tp->rcu_node_entry);
393 tp->rcu_blocked_node = rnp_root; 446 tp->rcu_blocked_node = rnp_root;
394 list_add(&tp->rcu_node_entry, lp_root); 447 list_add(&tp->rcu_node_entry, lp_root);
395 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 448 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */
396 } 449 }
397 } 450 }
398 return retval; 451 return retval;
@@ -516,7 +569,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
516 unsigned long flags; 569 unsigned long flags;
517 unsigned long mask; 570 unsigned long mask;
518 571
519 spin_lock_irqsave(&rnp->lock, flags); 572 raw_spin_lock_irqsave(&rnp->lock, flags);
520 for (;;) { 573 for (;;) {
521 if (!sync_rcu_preempt_exp_done(rnp)) 574 if (!sync_rcu_preempt_exp_done(rnp))
522 break; 575 break;
@@ -525,12 +578,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
525 break; 578 break;
526 } 579 }
527 mask = rnp->grpmask; 580 mask = rnp->grpmask;
528 spin_unlock(&rnp->lock); /* irqs remain disabled */ 581 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
529 rnp = rnp->parent; 582 rnp = rnp->parent;
530 spin_lock(&rnp->lock); /* irqs already disabled */ 583 raw_spin_lock(&rnp->lock); /* irqs already disabled */
531 rnp->expmask &= ~mask; 584 rnp->expmask &= ~mask;
532 } 585 }
533 spin_unlock_irqrestore(&rnp->lock, flags); 586 raw_spin_unlock_irqrestore(&rnp->lock, flags);
534} 587}
535 588
536/* 589/*
@@ -545,11 +598,11 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
545{ 598{
546 int must_wait; 599 int must_wait;
547 600
548 spin_lock(&rnp->lock); /* irqs already disabled */ 601 raw_spin_lock(&rnp->lock); /* irqs already disabled */
549 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 602 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
550 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 603 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
551 must_wait = rcu_preempted_readers_exp(rnp); 604 must_wait = rcu_preempted_readers_exp(rnp);
552 spin_unlock(&rnp->lock); /* irqs remain disabled */ 605 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
553 if (!must_wait) 606 if (!must_wait)
554 rcu_report_exp_rnp(rsp, rnp); 607 rcu_report_exp_rnp(rsp, rnp);
555} 608}
@@ -594,13 +647,13 @@ void synchronize_rcu_expedited(void)
594 /* force all RCU readers onto blocked_tasks[]. */ 647 /* force all RCU readers onto blocked_tasks[]. */
595 synchronize_sched_expedited(); 648 synchronize_sched_expedited();
596 649
597 spin_lock_irqsave(&rsp->onofflock, flags); 650 raw_spin_lock_irqsave(&rsp->onofflock, flags);
598 651
599 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 652 /* Initialize ->expmask for all non-leaf rcu_node structures. */
600 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 653 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
601 spin_lock(&rnp->lock); /* irqs already disabled. */ 654 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
602 rnp->expmask = rnp->qsmaskinit; 655 rnp->expmask = rnp->qsmaskinit;
603 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 656 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
604 } 657 }
605 658
606 /* Snapshot current state of ->blocked_tasks[] lists. */ 659 /* Snapshot current state of ->blocked_tasks[] lists. */
@@ -609,7 +662,7 @@ void synchronize_rcu_expedited(void)
609 if (NUM_RCU_NODES > 1) 662 if (NUM_RCU_NODES > 1)
610 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); 663 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
611 664
612 spin_unlock_irqrestore(&rsp->onofflock, flags); 665 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
613 666
614 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 667 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
615 rnp = rcu_get_root(rsp); 668 rnp = rcu_get_root(rsp);
@@ -713,6 +766,16 @@ long rcu_batches_completed(void)
713EXPORT_SYMBOL_GPL(rcu_batches_completed); 766EXPORT_SYMBOL_GPL(rcu_batches_completed);
714 767
715/* 768/*
769 * Force a quiescent state for RCU, which, because there is no preemptible
770 * RCU, becomes the same as rcu-sched.
771 */
772void rcu_force_quiescent_state(void)
773{
774 rcu_sched_force_quiescent_state();
775}
776EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
777
778/*
716 * Because preemptable RCU does not exist, we never have to check for 779 * Because preemptable RCU does not exist, we never have to check for
717 * CPUs being in quiescent states. 780 * CPUs being in quiescent states.
718 */ 781 */
@@ -734,7 +797,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
734/* Because preemptible RCU does not exist, no quieting of tasks. */ 797/* Because preemptible RCU does not exist, no quieting of tasks. */
735static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 798static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
736{ 799{
737 spin_unlock_irqrestore(&rnp->lock, flags); 800 raw_spin_unlock_irqrestore(&rnp->lock, flags);
738} 801}
739 802
740#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 803#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -745,6 +808,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
745 * Because preemptable RCU does not exist, we never have to check for 808 * Because preemptable RCU does not exist, we never have to check for
746 * tasks blocked within RCU read-side critical sections. 809 * tasks blocked within RCU read-side critical sections.
747 */ 810 */
811static void rcu_print_detail_task_stall(struct rcu_state *rsp)
812{
813}
814
815/*
816 * Because preemptable RCU does not exist, we never have to check for
817 * tasks blocked within RCU read-side critical sections.
818 */
748static void rcu_print_task_stall(struct rcu_node *rnp) 819static void rcu_print_task_stall(struct rcu_node *rnp)
749{ 820{
750} 821}
@@ -884,3 +955,115 @@ static void __init __rcu_init_preempt(void)
884} 955}
885 956
886#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 957#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
958
959#if !defined(CONFIG_RCU_FAST_NO_HZ)
960
961/*
962 * Check to see if any future RCU-related work will need to be done
963 * by the current CPU, even if none need be done immediately, returning
964 * 1 if so. This function is part of the RCU implementation; it is -not-
965 * an exported member of the RCU API.
966 *
967 * Because we have preemptible RCU, just check whether this CPU needs
968 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption
969 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
970 */
971int rcu_needs_cpu(int cpu)
972{
973 return rcu_needs_cpu_quick_check(cpu);
974}
975
976/*
977 * Check to see if we need to continue a callback-flush operation to
978 * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle
979 * entry is not configured, so we never need to.
980 */
981static void rcu_needs_cpu_flush(void)
982{
983}
984
985#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
986
987#define RCU_NEEDS_CPU_FLUSHES 5
988static DEFINE_PER_CPU(int, rcu_dyntick_drain);
989static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
990
991/*
992 * Check to see if any future RCU-related work will need to be done
993 * by the current CPU, even if none need be done immediately, returning
994 * 1 if so. This function is part of the RCU implementation; it is -not-
995 * an exported member of the RCU API.
996 *
997 * Because we are not supporting preemptible RCU, attempt to accelerate
998 * any current grace periods so that RCU no longer needs this CPU, but
999 * only if all other CPUs are already in dynticks-idle mode. This will
1000 * allow the CPU cores to be powered down immediately, as opposed to after
1001 * waiting many milliseconds for grace periods to elapse.
1002 *
1003 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1004 * disabled, we do one pass of force_quiescent_state(), then do a
1005 * raise_softirq() to cause rcu_process_callbacks() to be invoked later.
1006 * The per-cpu rcu_dyntick_drain variable controls the sequencing.
1007 */
1008int rcu_needs_cpu(int cpu)
1009{
1010 int c = 0;
1011 int thatcpu;
1012
1013 /* Check for being in the holdoff period. */
1014 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
1015 return rcu_needs_cpu_quick_check(cpu);
1016
1017 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1018 for_each_cpu_not(thatcpu, nohz_cpu_mask)
1019 if (thatcpu != cpu) {
1020 per_cpu(rcu_dyntick_drain, cpu) = 0;
1021 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1022 return rcu_needs_cpu_quick_check(cpu);
1023 }
1024
1025 /* Check and update the rcu_dyntick_drain sequencing. */
1026 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1027 /* First time through, initialize the counter. */
1028 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
1029 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1030 /* We have hit the limit, so time to give up. */
1031 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1032 return rcu_needs_cpu_quick_check(cpu);
1033 }
1034
1035 /* Do one step pushing remaining RCU callbacks through. */
1036 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1037 rcu_sched_qs(cpu);
1038 force_quiescent_state(&rcu_sched_state, 0);
1039 c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
1040 }
1041 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1042 rcu_bh_qs(cpu);
1043 force_quiescent_state(&rcu_bh_state, 0);
1044 c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
1045 }
1046
1047 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1048 if (c)
1049 raise_softirq(RCU_SOFTIRQ);
1050 return c;
1051}
1052
1053/*
1054 * Check to see if we need to continue a callback-flush operation to
1055 * allow the last CPU to enter dyntick-idle mode.
1056 */
1057static void rcu_needs_cpu_flush(void)
1058{
1059 int cpu = smp_processor_id();
1060 unsigned long flags;
1061
1062 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
1063 return;
1064 local_irq_save(flags);
1065 (void)rcu_needs_cpu(cpu);
1066 local_irq_restore(flags);
1067}
1068
1069#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
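As an aside on the CONFIG_RCU_FAST_NO_HZ path above: rcu_needs_cpu() budgets a handful of callback-flush passes per jiffy and then stamps a holdoff so the expensive path is not retried until time advances. A minimal user-space sketch of that drain/holdoff sequencing, using hypothetical names (FLUSH_BUDGET, struct drain_state) rather than the real per-CPU variables:

#include <stdbool.h>
#include <stdio.h>

#define FLUSH_BUDGET 5                  /* mirrors RCU_NEEDS_CPU_FLUSHES */

struct drain_state {
        int budget;                     /* flush passes left in this round */
        unsigned long holdoff_stamp;    /* "jiffy" at which we last gave up */
};

/* Return true if the caller should attempt another flush pass at time "now". */
bool keep_draining(struct drain_state *ds, unsigned long now)
{
        if (ds->holdoff_stamp == now)
                return false;           /* holding off: fall back to the quick check */
        if (ds->budget <= 0) {
                ds->budget = FLUSH_BUDGET;      /* first pass: arm the budget */
        } else if (--ds->budget <= 0) {
                ds->holdoff_stamp = now;        /* budget spent: hold off until time moves on */
                return false;
        }
        return true;
}

int main(void)
{
        struct drain_state ds = { 0, 1 };       /* holdoff stamp differs from "now" initially */
        unsigned long now = 100;

        for (int i = 0; i < 8; i++)
                printf("pass %d: %s\n", i, keep_draining(&ds, now) ? "drain" : "skip");
        return 0;
}

In the patch, each allowed pass pushes callbacks along via force_quiescent_state(), and the holdoff plus the rcu_needs_cpu_quick_check() fallback is what keeps the last non-idle CPU from paying this cost on every tick.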
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9d2c88423b31..d45db2e35d27 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -50,7 +50,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 50{
51 if (!rdp->beenonline) 51 if (!rdp->beenonline)
52 return; 52 return;
53 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", 53 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d",
54 rdp->cpu, 54 rdp->cpu,
55 cpu_is_offline(rdp->cpu) ? '!' : ' ', 55 cpu_is_offline(rdp->cpu) ? '!' : ' ',
56 rdp->completed, rdp->gpnum, 56 rdp->completed, rdp->gpnum,
@@ -105,7 +105,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
105{ 105{
106 if (!rdp->beenonline) 106 if (!rdp->beenonline)
107 return; 107 return;
108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d",
109 rdp->cpu, 109 rdp->cpu,
110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
111 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
@@ -155,13 +155,13 @@ static const struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 long gpnum; 158 unsigned long gpnum;
159 int level = 0; 159 int level = 0;
160 int phase; 160 int phase;
161 struct rcu_node *rnp; 161 struct rcu_node *rnp;
162 162
163 gpnum = rsp->gpnum; 163 gpnum = rsp->gpnum;
164 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 164 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
166 rsp->completed, gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
167 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
@@ -215,12 +215,12 @@ static const struct file_operations rcuhier_fops = {
215static int show_rcugp(struct seq_file *m, void *unused) 215static int show_rcugp(struct seq_file *m, void *unused)
216{ 216{
217#ifdef CONFIG_TREE_PREEMPT_RCU 217#ifdef CONFIG_TREE_PREEMPT_RCU
218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", 218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n",
219 rcu_preempt_state.completed, rcu_preempt_state.gpnum); 219 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", 221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n",
222 rcu_sched_state.completed, rcu_sched_state.gpnum); 222 rcu_sched_state.completed, rcu_sched_state.gpnum);
223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
224 rcu_bh_state.completed, rcu_bh_state.gpnum); 224 rcu_bh_state.completed, rcu_bh_state.gpnum);
225 return 0; 225 return 0;
226} 226}
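The rcutree_trace.c hunks above are format-string hygiene: the grace-period counters being switched to %lu are unsigned long, and printing them with %ld renders bogus negative numbers once they pass LONG_MAX. A tiny stand-alone illustration (the counter value is made up):

#include <limits.h>
#include <stdio.h>

int main(void)
{
        unsigned long gpnum = ULONG_MAX - 15;   /* hypothetical wrapped counter */

        printf("%%ld prints %ld\n", (long)gpnum);       /* typically -16 */
        printf("%%lu prints %lu\n", gpnum);             /* the real value */
        return 0;
}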
diff --git a/kernel/relay.c b/kernel/relay.c
index c705a41b4ba3..3d97f2821611 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
1215/* 1215/*
1216 * subbuf_splice_actor - splice up to one subbuf's worth of data 1216 * subbuf_splice_actor - splice up to one subbuf's worth of data
1217 */ 1217 */
1218static int subbuf_splice_actor(struct file *in, 1218static ssize_t subbuf_splice_actor(struct file *in,
1219 loff_t *ppos, 1219 loff_t *ppos,
1220 struct pipe_inode_info *pipe, 1220 struct pipe_inode_info *pipe,
1221 size_t len, 1221 size_t len,
1222 unsigned int flags, 1222 unsigned int flags,
1223 int *nonpad_ret) 1223 int *nonpad_ret)
1224{ 1224{
1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; 1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
1226 struct rchan_buf *rbuf = in->private_data; 1226 struct rchan_buf *rbuf = in->private_data;
1227 unsigned int subbuf_size = rbuf->chan->subbuf_size; 1227 unsigned int subbuf_size = rbuf->chan->subbuf_size;
1228 uint64_t pos = (uint64_t) *ppos; 1228 uint64_t pos = (uint64_t) *ppos;
@@ -1241,6 +1241,7 @@ static int subbuf_splice_actor(struct file *in,
1241 .ops = &relay_pipe_buf_ops, 1241 .ops = &relay_pipe_buf_ops,
1242 .spd_release = relay_page_release, 1242 .spd_release = relay_page_release,
1243 }; 1243 };
1244 ssize_t ret;
1244 1245
1245 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1246 return 0; 1247 return 0;
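The relay change is likewise a type fix: a splice actor reports either a byte count or a negative errno, and ssize_t expresses both without truncating counts larger than INT_MAX on 64-bit. A hedged sketch of that return convention, using a hypothetical helper rather than the relay code itself:

#include <errno.h>
#include <string.h>
#include <sys/types.h>

/* Copy up to cap bytes; return bytes copied, or -EINVAL on bad arguments. */
ssize_t copy_capped(char *dst, size_t cap, const char *src, size_t len)
{
        size_t n = len < cap ? len : cap;

        if (dst == NULL || src == NULL)
                return -EINVAL;         /* negative errno on failure */
        memcpy(dst, src, n);
        return (ssize_t)n;              /* byte count on success */
}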
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bcdabf37c40b..c7eaa37a768b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -10,7 +10,6 @@
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/parser.h> 11#include <linux/parser.h>
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/res_counter.h> 13#include <linux/res_counter.h>
15#include <linux/uaccess.h> 14#include <linux/uaccess.h>
16#include <linux/mm.h> 15#include <linux/mm.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index af96c1e4b54b..9c358e263534 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -188,20 +188,65 @@ static int __release_resource(struct resource *old)
188 return -EINVAL; 188 return -EINVAL;
189} 189}
190 190
191static void __release_child_resources(struct resource *r)
192{
193 struct resource *tmp, *p;
194 resource_size_t size;
195
196 p = r->child;
197 r->child = NULL;
198 while (p) {
199 tmp = p;
200 p = p->sibling;
201
202 tmp->parent = NULL;
203 tmp->sibling = NULL;
204 __release_child_resources(tmp);
205
206 printk(KERN_DEBUG "release child resource %pR\n", tmp);
207 /* need to restore size, and keep flags */
208 size = resource_size(tmp);
209 tmp->start = 0;
210 tmp->end = size - 1;
211 }
212}
213
214void release_child_resources(struct resource *r)
215{
216 write_lock(&resource_lock);
217 __release_child_resources(r);
218 write_unlock(&resource_lock);
219}
220
191/** 221/**
192 * request_resource - request and reserve an I/O or memory resource 222 * request_resource_conflict - request and reserve an I/O or memory resource
193 * @root: root resource descriptor 223 * @root: root resource descriptor
194 * @new: resource descriptor desired by caller 224 * @new: resource descriptor desired by caller
195 * 225 *
196 * Returns 0 for success, negative error code on error. 226 * Returns NULL for success, or a pointer to the conflicting resource on error.
197 */ 227 */
198int request_resource(struct resource *root, struct resource *new) 228struct resource *request_resource_conflict(struct resource *root, struct resource *new)
199{ 229{
200 struct resource *conflict; 230 struct resource *conflict;
201 231
202 write_lock(&resource_lock); 232 write_lock(&resource_lock);
203 conflict = __request_resource(root, new); 233 conflict = __request_resource(root, new);
204 write_unlock(&resource_lock); 234 write_unlock(&resource_lock);
235 return conflict;
236}
237
238/**
239 * request_resource - request and reserve an I/O or memory resource
240 * @root: root resource descriptor
241 * @new: resource descriptor desired by caller
242 *
243 * Returns 0 for success, negative error code on error.
244 */
245int request_resource(struct resource *root, struct resource *new)
246{
247 struct resource *conflict;
248
249 conflict = request_resource_conflict(root, new);
205 return conflict ? -EBUSY : 0; 250 return conflict ? -EBUSY : 0;
206} 251}
207 252
@@ -274,7 +319,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
274 void *arg, int (*func)(unsigned long, unsigned long, void *)) 319 void *arg, int (*func)(unsigned long, unsigned long, void *))
275{ 320{
276 struct resource res; 321 struct resource res;
277 unsigned long pfn, len; 322 unsigned long pfn, end_pfn;
278 u64 orig_end; 323 u64 orig_end;
279 int ret = -1; 324 int ret = -1;
280 325
@@ -284,9 +329,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
284 orig_end = res.end; 329 orig_end = res.end;
285 while ((res.start < res.end) && 330 while ((res.start < res.end) &&
286 (find_next_system_ram(&res, "System RAM") >= 0)) { 331 (find_next_system_ram(&res, "System RAM") >= 0)) {
287 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 332 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
288 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); 333 end_pfn = (res.end + 1) >> PAGE_SHIFT;
289 ret = (*func)(pfn, len, arg); 334 if (end_pfn > pfn)
335 ret = (*func)(pfn, end_pfn - pfn, arg);
290 if (ret) 336 if (ret)
291 break; 337 break;
292 res.start = res.end + 1; 338 res.start = res.end + 1;
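The walk_system_ram_range() fix above rounds the range start up and the end down to page boundaries and only invokes the callback when at least one whole page remains, rather than deriving a page count from the raw byte span. A small sketch of that rounding, assuming 4 KiB pages and made-up addresses:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12                   /* assume 4 KiB pages for the example */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
        uint64_t start = 0x1080;        /* hypothetical partially aligned range */
        uint64_t end   = 0x4fff;        /* inclusive end, as in struct resource */

        unsigned long pfn     = (start + PAGE_SIZE - 1) >> PAGE_SHIFT; /* round up   -> 2 */
        unsigned long end_pfn = (end + 1) >> PAGE_SHIFT;               /* round down -> 5 */

        if (end_pfn > pfn)
                printf("callback covers pfns [%lu, %lu), %lu pages\n",
                       pfn, end_pfn, end_pfn - pfn);
        else
                printf("no whole page in range, callback skipped\n");
        return 0;
}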
@@ -297,14 +343,29 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
297 343
298#endif 344#endif
299 345
346static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
347{
348 return 1;
349}
350/*
351 * This generic page_is_ram() returns true if the specified address is
352 * registered as "System RAM" in the iomem_resource list.
353 */
354int __weak page_is_ram(unsigned long pfn)
355{
356 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
357}
358
300/* 359/*
301 * Find empty slot in the resource tree given range and alignment. 360 * Find empty slot in the resource tree given range and alignment.
302 */ 361 */
303static int find_resource(struct resource *root, struct resource *new, 362static int find_resource(struct resource *root, struct resource *new,
304 resource_size_t size, resource_size_t min, 363 resource_size_t size, resource_size_t min,
305 resource_size_t max, resource_size_t align, 364 resource_size_t max, resource_size_t align,
306 void (*alignf)(void *, struct resource *, 365 resource_size_t (*alignf)(void *,
307 resource_size_t, resource_size_t), 366 const struct resource *,
367 resource_size_t,
368 resource_size_t),
308 void *alignf_data) 369 void *alignf_data)
309{ 370{
310 struct resource *this = root->child; 371 struct resource *this = root->child;
@@ -330,7 +391,7 @@ static int find_resource(struct resource *root, struct resource *new,
330 tmp.end = max; 391 tmp.end = max;
331 tmp.start = ALIGN(tmp.start, align); 392 tmp.start = ALIGN(tmp.start, align);
332 if (alignf) 393 if (alignf)
333 alignf(alignf_data, &tmp, size, align); 394 tmp.start = alignf(alignf_data, &tmp, size, align);
334 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 395 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
335 new->start = tmp.start; 396 new->start = tmp.start;
336 new->end = tmp.start + size - 1; 397 new->end = tmp.start + size - 1;
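The alignf change visible here turns the alignment callback from one that mutated the candidate resource in place into a pure function returning the aligned start, which find_resource() then assigns back. A sketch of a callback written against that convention (simple power-of-two round-up; resource_size_t is redefined locally as a stand-in for the kernel type):

#include <stdio.h>

typedef unsigned long long resource_size_t;     /* stand-in for the kernel type */

struct window {
        resource_size_t start;
        resource_size_t end;            /* inclusive */
};

/* Return the aligned start within the window; the caller stores it back. */
resource_size_t simple_alignf(void *data, const struct window *avail,
                              resource_size_t size, resource_size_t align)
{
        (void)data;
        (void)size;
        return (avail->start + align - 1) & ~(align - 1);       /* round up */
}

int main(void)
{
        struct window avail = { 0x1001, 0x1ffff };

        avail.start = simple_alignf(NULL, &avail, 0x100, 0x1000);
        printf("aligned start: %#llx\n", avail.start);  /* 0x2000 */
        return 0;
}

Returning the value instead of editing the resource behind the allocator's back arguably makes it harder for an arch-specific callback to hand back an inconsistent candidate window.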
@@ -358,8 +419,10 @@ static int find_resource(struct resource *root, struct resource *new,
358int allocate_resource(struct resource *root, struct resource *new, 419int allocate_resource(struct resource *root, struct resource *new,
359 resource_size_t size, resource_size_t min, 420 resource_size_t size, resource_size_t min,
360 resource_size_t max, resource_size_t align, 421 resource_size_t max, resource_size_t align,
361 void (*alignf)(void *, struct resource *, 422 resource_size_t (*alignf)(void *,
362 resource_size_t, resource_size_t), 423 const struct resource *,
424 resource_size_t,
425 resource_size_t),
363 void *alignf_data) 426 void *alignf_data)
364{ 427{
365 int err; 428 int err;
@@ -426,25 +489,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
426} 489}
427 490
428/** 491/**
429 * insert_resource - Inserts a resource in the resource tree 492 * insert_resource_conflict - Inserts resource in the resource tree
430 * @parent: parent of the new resource 493 * @parent: parent of the new resource
431 * @new: new resource to insert 494 * @new: new resource to insert
432 * 495 *
433 * Returns 0 on success, -EBUSY if the resource can't be inserted. 496 * Returns NULL on success, or the conflicting resource if it can't be inserted.
434 * 497 *
435 * This function is equivalent to request_resource when no conflict 498 * This function is equivalent to request_resource_conflict when no conflict
436 * happens. If a conflict happens, and the conflicting resources 499 * happens. If a conflict happens, and the conflicting resources
437 * entirely fit within the range of the new resource, then the new 500 * entirely fit within the range of the new resource, then the new
438 * resource is inserted and the conflicting resources become children of 501 * resource is inserted and the conflicting resources become children of
439 * the new resource. 502 * the new resource.
440 */ 503 */
441int insert_resource(struct resource *parent, struct resource *new) 504struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
442{ 505{
443 struct resource *conflict; 506 struct resource *conflict;
444 507
445 write_lock(&resource_lock); 508 write_lock(&resource_lock);
446 conflict = __insert_resource(parent, new); 509 conflict = __insert_resource(parent, new);
447 write_unlock(&resource_lock); 510 write_unlock(&resource_lock);
511 return conflict;
512}
513
514/**
515 * insert_resource - Inserts a resource in the resource tree
516 * @parent: parent of the new resource
517 * @new: new resource to insert
518 *
519 * Returns 0 on success, -EBUSY if the resource can't be inserted.
520 */
521int insert_resource(struct resource *parent, struct resource *new)
522{
523 struct resource *conflict;
524
525 conflict = insert_resource_conflict(parent, new);
448 return conflict ? -EBUSY : 0; 526 return conflict ? -EBUSY : 0;
449} 527}
450 528
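request_resource() and insert_resource() above become thin wrappers around new *_conflict() variants, so callers that want to know which resource clashed can ask, while existing callers keep the 0 / -EBUSY contract. The shape of that split, sketched against a hypothetical slot table instead of the resource tree:

#include <errno.h>
#include <stdio.h>

static const char *table[8];            /* hypothetical stand-in for the resource tree */

/* Returns NULL on success, or the entry we collided with. */
const char *reserve_slot_conflict(unsigned int slot, const char *name)
{
        if (table[slot] != NULL)
                return table[slot];     /* report what we collided with */
        table[slot] = name;
        return NULL;
}

/* Legacy-style wrapper: 0 on success, -EBUSY on any conflict. */
int reserve_slot(unsigned int slot, const char *name)
{
        return reserve_slot_conflict(slot, name) ? -EBUSY : 0;
}

int main(void)
{
        reserve_slot(3, "first");
        printf("conflict with: %s\n", reserve_slot_conflict(3, "second"));
        printf("legacy rc: %d\n", reserve_slot(3, "third"));
        return 0;
}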
diff --git a/kernel/sched.c b/kernel/sched.c
index 7266b912139f..3c2a54f70ffe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
71#include <linux/debugfs.h> 71#include <linux/debugfs.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/ftrace.h> 73#include <linux/ftrace.h>
74#include <linux/slab.h>
74 75
75#include <asm/tlb.h> 76#include <asm/tlb.h>
76#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
@@ -233,7 +234,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 234 */
234static DEFINE_MUTEX(sched_domains_mutex); 235static DEFINE_MUTEX(sched_domains_mutex);
235 236
236#ifdef CONFIG_GROUP_SCHED 237#ifdef CONFIG_CGROUP_SCHED
237 238
238#include <linux/cgroup.h> 239#include <linux/cgroup.h>
239 240
@@ -243,13 +244,7 @@ static LIST_HEAD(task_groups);
243 244
244/* task group related information */ 245/* task group related information */
245struct task_group { 246struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 247 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 248
254#ifdef CONFIG_FAIR_GROUP_SCHED 249#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 250 /* schedulable entities of this group on each cpu */
@@ -274,35 +269,7 @@ struct task_group {
274 struct list_head children; 269 struct list_head children;
275}; 270};
276 271
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 272#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 273
307/* task_group_lock serializes add/remove of task groups and also changes to 274/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 275 * a task group's cpu shares.
@@ -318,11 +285,7 @@ static int root_task_group_empty(void)
318} 285}
319#endif 286#endif
320 287
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 288# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 289
327/* 290/*
328 * A weight of 0 or 1 can cause arithmetics problems. 291 * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +311,7 @@ static inline struct task_group *task_group(struct task_struct *p)
348{ 311{
349 struct task_group *tg; 312 struct task_group *tg;
350 313
351#ifdef CONFIG_USER_SCHED 314#ifdef CONFIG_CGROUP_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 315 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css); 316 struct task_group, css);
358#else 317#else
@@ -364,6 +323,15 @@ static inline struct task_group *task_group(struct task_struct *p)
364/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 323/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
365static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 324static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
366{ 325{
326 /*
327 * Strictly speaking this rcu_read_lock() is not needed since the
328 * task_group is tied to the cgroup, which in turn can never go away
329 * as long as there are tasks attached to it.
330 *
331 * However since task_group() uses task_subsys_state() which is an
332 * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
333 */
334 rcu_read_lock();
367#ifdef CONFIG_FAIR_GROUP_SCHED 335#ifdef CONFIG_FAIR_GROUP_SCHED
368 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 336 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
369 p->se.parent = task_group(p)->se[cpu]; 337 p->se.parent = task_group(p)->se[cpu];
@@ -373,6 +341,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
373 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 341 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
374 p->rt.parent = task_group(p)->rt_se[cpu]; 342 p->rt.parent = task_group(p)->rt_se[cpu];
375#endif 343#endif
344 rcu_read_unlock();
376} 345}
377 346
378#else 347#else
@@ -383,7 +352,7 @@ static inline struct task_group *task_group(struct task_struct *p)
383 return NULL; 352 return NULL;
384} 353}
385 354
386#endif /* CONFIG_GROUP_SCHED */ 355#endif /* CONFIG_CGROUP_SCHED */
387 356
388/* CFS-related fields in a runqueue */ 357/* CFS-related fields in a runqueue */
389struct cfs_rq { 358struct cfs_rq {
@@ -478,7 +447,6 @@ struct rt_rq {
478 struct rq *rq; 447 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 448 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 449 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 450#endif
483}; 451};
484 452
@@ -645,6 +613,11 @@ static inline int cpu_of(struct rq *rq)
645#endif 613#endif
646} 614}
647 615
616#define rcu_dereference_check_sched_domain(p) \
617 rcu_dereference_check((p), \
618 rcu_read_lock_sched_held() || \
619 lockdep_is_held(&sched_domains_mutex))
620
648/* 621/*
649 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 622 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
650 * See detach_destroy_domains: synchronize_sched for details. 623 * See detach_destroy_domains: synchronize_sched for details.
@@ -653,7 +626,7 @@ static inline int cpu_of(struct rq *rq)
653 * preempt-disabled sections. 626 * preempt-disabled sections.
654 */ 627 */
655#define for_each_domain(cpu, __sd) \ 628#define for_each_domain(cpu, __sd) \
656 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 629 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
657 630
658#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 631#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
659#define this_rq() (&__get_cpu_var(runqueues)) 632#define this_rq() (&__get_cpu_var(runqueues))
@@ -941,16 +914,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
941#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 914#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
942 915
943/* 916/*
917 * Check whether the task is waking, we use this to synchronize against
918 * ttwu() so that task_cpu() reports a stable number.
919 *
920 * We need to make an exception for PF_STARTING tasks because the fork
921 * path might require task_rq_lock() to work, e.g. it can call
922 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
923 */
924static inline int task_is_waking(struct task_struct *p)
925{
926 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
927}
928
929/*
944 * __task_rq_lock - lock the runqueue a given task resides on. 930 * __task_rq_lock - lock the runqueue a given task resides on.
945 * Must be called with interrupts disabled. 931 * Must be called with interrupts disabled.
946 */ 932 */
947static inline struct rq *__task_rq_lock(struct task_struct *p) 933static inline struct rq *__task_rq_lock(struct task_struct *p)
948 __acquires(rq->lock) 934 __acquires(rq->lock)
949{ 935{
936 struct rq *rq;
937
950 for (;;) { 938 for (;;) {
951 struct rq *rq = task_rq(p); 939 while (task_is_waking(p))
940 cpu_relax();
941 rq = task_rq(p);
952 raw_spin_lock(&rq->lock); 942 raw_spin_lock(&rq->lock);
953 if (likely(rq == task_rq(p))) 943 if (likely(rq == task_rq(p) && !task_is_waking(p)))
954 return rq; 944 return rq;
955 raw_spin_unlock(&rq->lock); 945 raw_spin_unlock(&rq->lock);
956 } 946 }
@@ -967,10 +957,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
967 struct rq *rq; 957 struct rq *rq;
968 958
969 for (;;) { 959 for (;;) {
960 while (task_is_waking(p))
961 cpu_relax();
970 local_irq_save(*flags); 962 local_irq_save(*flags);
971 rq = task_rq(p); 963 rq = task_rq(p);
972 raw_spin_lock(&rq->lock); 964 raw_spin_lock(&rq->lock);
973 if (likely(rq == task_rq(p))) 965 if (likely(rq == task_rq(p) && !task_is_waking(p)))
974 return rq; 966 return rq;
975 raw_spin_unlock_irqrestore(&rq->lock, *flags); 967 raw_spin_unlock_irqrestore(&rq->lock, *flags);
976 } 968 }
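task_is_waking() lets both rq-lock helpers spin until an in-flight wakeup has settled, then take the lock and re-check the runqueue and the flag before trusting them. A rough user-space analogue of that wait/lock/revalidate loop (hypothetical names; the real code also exempts PF_STARTING tasks and handles interrupts):

#include <pthread.h>
#include <stdatomic.h>

struct obj {
        atomic_bool moving;                     /* analogue of TASK_WAKING */
        _Atomic(pthread_mutex_t *) home;        /* analogue of task_rq(p); a mover thread
                                                   sets moving, updates home, clears moving */
};

/* Lock the mutex the object currently belongs to, revalidating after locking. */
pthread_mutex_t *lock_home(struct obj *o)
{
        for (;;) {
                while (atomic_load(&o->moving))
                        ;                       /* cpu_relax()-style busy wait */
                pthread_mutex_t *m = atomic_load(&o->home);
                pthread_mutex_lock(m);
                if (m == atomic_load(&o->home) && !atomic_load(&o->moving))
                        return m;               /* still the right lock, not mid-move */
                pthread_mutex_unlock(m);        /* raced with a move: retry */
        }
}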
@@ -1390,32 +1382,6 @@ static const u32 prio_to_wmult[40] = {
1390 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1382 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1391}; 1383};
1392 1384
1393static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1394
1395/*
1396 * runqueue iterator, to support SMP load-balancing between different
1397 * scheduling classes, without having to expose their internal data
1398 * structures to the load-balancing proper:
1399 */
1400struct rq_iterator {
1401 void *arg;
1402 struct task_struct *(*start)(void *);
1403 struct task_struct *(*next)(void *);
1404};
1405
1406#ifdef CONFIG_SMP
1407static unsigned long
1408balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1409 unsigned long max_load_move, struct sched_domain *sd,
1410 enum cpu_idle_type idle, int *all_pinned,
1411 int *this_best_prio, struct rq_iterator *iterator);
1412
1413static int
1414iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1415 struct sched_domain *sd, enum cpu_idle_type idle,
1416 struct rq_iterator *iterator);
1417#endif
1418
1419/* Time spent by the tasks of the cpu accounting group executing in ... */ 1385/* Time spent by the tasks of the cpu accounting group executing in ... */
1420enum cpuacct_stat_index { 1386enum cpuacct_stat_index {
1421 CPUACCT_STAT_USER, /* ... user mode */ 1387 CPUACCT_STAT_USER, /* ... user mode */
@@ -1531,7 +1497,7 @@ static unsigned long target_load(int cpu, int type)
1531 1497
1532static struct sched_group *group_of(int cpu) 1498static struct sched_group *group_of(int cpu)
1533{ 1499{
1534 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); 1500 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1535 1501
1536 if (!sd) 1502 if (!sd)
1537 return NULL; 1503 return NULL;
@@ -1566,7 +1532,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1566 1532
1567#ifdef CONFIG_FAIR_GROUP_SCHED 1533#ifdef CONFIG_FAIR_GROUP_SCHED
1568 1534
1569static __read_mostly unsigned long *update_shares_data; 1535static __read_mostly unsigned long __percpu *update_shares_data;
1570 1536
1571static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1537static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1572 1538
@@ -1701,16 +1667,6 @@ static void update_shares(struct sched_domain *sd)
1701 } 1667 }
1702} 1668}
1703 1669
1704static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1705{
1706 if (root_task_group_empty())
1707 return;
1708
1709 raw_spin_unlock(&rq->lock);
1710 update_shares(sd);
1711 raw_spin_lock(&rq->lock);
1712}
1713
1714static void update_h_load(long cpu) 1670static void update_h_load(long cpu)
1715{ 1671{
1716 if (root_task_group_empty()) 1672 if (root_task_group_empty())
@@ -1725,10 +1681,6 @@ static inline void update_shares(struct sched_domain *sd)
1725{ 1681{
1726} 1682}
1727 1683
1728static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1729{
1730}
1731
1732#endif 1684#endif
1733 1685
1734#ifdef CONFIG_PREEMPT 1686#ifdef CONFIG_PREEMPT
@@ -1805,6 +1757,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 raw_spin_unlock(&busiest->lock); 1757 raw_spin_unlock(&busiest->lock);
1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1758 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1807} 1759}
1760
1761/*
1762 * double_rq_lock - safely lock two runqueues
1763 *
1764 * Note this does not disable interrupts like task_rq_lock,
1765 * you need to do so manually before calling.
1766 */
1767static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1768 __acquires(rq1->lock)
1769 __acquires(rq2->lock)
1770{
1771 BUG_ON(!irqs_disabled());
1772 if (rq1 == rq2) {
1773 raw_spin_lock(&rq1->lock);
1774 __acquire(rq2->lock); /* Fake it out ;) */
1775 } else {
1776 if (rq1 < rq2) {
1777 raw_spin_lock(&rq1->lock);
1778 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1779 } else {
1780 raw_spin_lock(&rq2->lock);
1781 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1782 }
1783 }
1784 update_rq_clock(rq1);
1785 update_rq_clock(rq2);
1786}
1787
1788/*
1789 * double_rq_unlock - safely unlock two runqueues
1790 *
1791 * Note this does not restore interrupts like task_rq_unlock,
1792 * you need to do so manually after calling.
1793 */
1794static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1795 __releases(rq1->lock)
1796 __releases(rq2->lock)
1797{
1798 raw_spin_unlock(&rq1->lock);
1799 if (rq1 != rq2)
1800 raw_spin_unlock(&rq2->lock);
1801 else
1802 __release(rq2->lock);
1803}
1804
1808#endif 1805#endif
1809 1806
1810#ifdef CONFIG_FAIR_GROUP_SCHED 1807#ifdef CONFIG_FAIR_GROUP_SCHED
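double_rq_lock()/double_rq_unlock() are only being moved up here so earlier code can use them, but the ordering rule they encode is worth spelling out: always take the lower-addressed lock first, and fake the second acquisition when both runqueues are the same, so two CPUs locking the same pair can never deadlock against each other. A minimal sketch of the same rule with plain mutexes:

#include <pthread.h>

/* Lock two mutexes in a globally consistent (address) order. */
void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);  /* one lock covers both "runqueues" */
        } else if (a < b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}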
@@ -1834,18 +1831,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1834#endif 1831#endif
1835} 1832}
1836 1833
1837#include "sched_stats.h" 1834static const struct sched_class rt_sched_class;
1838#include "sched_idletask.c"
1839#include "sched_fair.c"
1840#include "sched_rt.c"
1841#ifdef CONFIG_SCHED_DEBUG
1842# include "sched_debug.c"
1843#endif
1844 1835
1845#define sched_class_highest (&rt_sched_class) 1836#define sched_class_highest (&rt_sched_class)
1846#define for_each_class(class) \ 1837#define for_each_class(class) \
1847 for (class = sched_class_highest; class; class = class->next) 1838 for (class = sched_class_highest; class; class = class->next)
1848 1839
1840#include "sched_stats.h"
1841
1849static void inc_nr_running(struct rq *rq) 1842static void inc_nr_running(struct rq *rq)
1850{ 1843{
1851 rq->nr_running++; 1844 rq->nr_running++;
@@ -1883,13 +1876,14 @@ static void update_avg(u64 *avg, u64 sample)
1883 *avg += diff >> 3; 1876 *avg += diff >> 3;
1884} 1877}
1885 1878
1886static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1879static void
1880enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1887{ 1881{
1888 if (wakeup) 1882 if (wakeup)
1889 p->se.start_runtime = p->se.sum_exec_runtime; 1883 p->se.start_runtime = p->se.sum_exec_runtime;
1890 1884
1891 sched_info_queued(p); 1885 sched_info_queued(p);
1892 p->sched_class->enqueue_task(rq, p, wakeup); 1886 p->sched_class->enqueue_task(rq, p, wakeup, head);
1893 p->se.on_rq = 1; 1887 p->se.on_rq = 1;
1894} 1888}
1895 1889
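enqueue_task() grows a head flag here so a caller can ask for the task to be queued at the front of its list instead of the back. In list terms the choice looks roughly like this (toy singly linked queue, hypothetical names):

#include <stdbool.h>
#include <stdio.h>

struct node {
        int id;
        struct node *next;
};

struct queue {
        struct node *first;
};

/* Insert at the head or the tail depending on the flag, like enqueue_task(..., head). */
void enqueue(struct queue *q, struct node *n, bool head)
{
        if (head || q->first == NULL) {
                n->next = q->first;
                q->first = n;
                return;
        }
        struct node *p = q->first;
        while (p->next)
                p = p->next;            /* walk to the tail */
        p->next = n;
        n->next = NULL;
}

int main(void)
{
        struct queue q = { NULL };
        struct node a = { 1, NULL }, b = { 2, NULL }, c = { 3, NULL };

        enqueue(&q, &a, false);
        enqueue(&q, &b, false);         /* tail: 1 2 */
        enqueue(&q, &c, true);          /* head: 3 1 2 */
        for (struct node *p = q.first; p; p = p->next)
                printf("%d ", p->id);
        printf("\n");
        return 0;
}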
@@ -1912,6 +1906,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1912} 1906}
1913 1907
1914/* 1908/*
1909 * activate_task - move a task to the runqueue.
1910 */
1911static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1912{
1913 if (task_contributes_to_load(p))
1914 rq->nr_uninterruptible--;
1915
1916 enqueue_task(rq, p, wakeup, false);
1917 inc_nr_running(rq);
1918}
1919
1920/*
1921 * deactivate_task - remove a task from the runqueue.
1922 */
1923static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1924{
1925 if (task_contributes_to_load(p))
1926 rq->nr_uninterruptible++;
1927
1928 dequeue_task(rq, p, sleep);
1929 dec_nr_running(rq);
1930}
1931
1932#include "sched_idletask.c"
1933#include "sched_fair.c"
1934#include "sched_rt.c"
1935#ifdef CONFIG_SCHED_DEBUG
1936# include "sched_debug.c"
1937#endif
1938
1939/*
1915 * __normal_prio - return the priority that is based on the static prio 1940 * __normal_prio - return the priority that is based on the static prio
1916 */ 1941 */
1917static inline int __normal_prio(struct task_struct *p) 1942static inline int __normal_prio(struct task_struct *p)
@@ -1957,30 +1982,6 @@ static int effective_prio(struct task_struct *p)
1957 return p->prio; 1982 return p->prio;
1958} 1983}
1959 1984
1960/*
1961 * activate_task - move a task to the runqueue.
1962 */
1963static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1964{
1965 if (task_contributes_to_load(p))
1966 rq->nr_uninterruptible--;
1967
1968 enqueue_task(rq, p, wakeup);
1969 inc_nr_running(rq);
1970}
1971
1972/*
1973 * deactivate_task - remove a task from the runqueue.
1974 */
1975static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1976{
1977 if (task_contributes_to_load(p))
1978 rq->nr_uninterruptible++;
1979
1980 dequeue_task(rq, p, sleep);
1981 dec_nr_running(rq);
1982}
1983
1984/** 1985/**
1985 * task_curr - is this task currently executing on a CPU? 1986 * task_curr - is this task currently executing on a CPU?
1986 * @p: the task in question. 1987 * @p: the task in question.
@@ -2320,14 +2321,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2320} 2321}
2321 2322
2322/* 2323/*
2323 * Called from: 2324 * Gets called from 3 sites (exec, fork, wakeup), since it is called without
2324 * 2325 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2325 * - fork, @p is stable because it isn't on the tasklist yet 2326 * by:
2326 * 2327 *
2327 * - exec, @p is unstable, retry loop 2328 * exec: is unstable, retry loop
2328 * 2329 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2329 * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so
2330 * we should be good.
2331 */ 2330 */
2332static inline 2331static inline
2333int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2332int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
@@ -2371,7 +2370,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2371{ 2370{
2372 int cpu, orig_cpu, this_cpu, success = 0; 2371 int cpu, orig_cpu, this_cpu, success = 0;
2373 unsigned long flags; 2372 unsigned long flags;
2374 struct rq *rq, *orig_rq; 2373 struct rq *rq;
2375 2374
2376 if (!sched_feat(SYNC_WAKEUPS)) 2375 if (!sched_feat(SYNC_WAKEUPS))
2377 wake_flags &= ~WF_SYNC; 2376 wake_flags &= ~WF_SYNC;
@@ -2379,7 +2378,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2379 this_cpu = get_cpu(); 2378 this_cpu = get_cpu();
2380 2379
2381 smp_wmb(); 2380 smp_wmb();
2382 rq = orig_rq = task_rq_lock(p, &flags); 2381 rq = task_rq_lock(p, &flags);
2383 update_rq_clock(rq); 2382 update_rq_clock(rq);
2384 if (!(p->state & state)) 2383 if (!(p->state & state))
2385 goto out; 2384 goto out;
@@ -2410,14 +2409,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2410 __task_rq_unlock(rq); 2409 __task_rq_unlock(rq);
2411 2410
2412 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2411 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2413 if (cpu != orig_cpu) 2412 if (cpu != orig_cpu) {
2413 /*
2414 * Since we migrate the task without holding any rq->lock,
2415 * we need to be careful with task_rq_lock(), since that
2416 * might end up locking an invalid rq.
2417 */
2414 set_task_cpu(p, cpu); 2418 set_task_cpu(p, cpu);
2419 }
2415 2420
2416 rq = __task_rq_lock(p); 2421 rq = cpu_rq(cpu);
2422 raw_spin_lock(&rq->lock);
2417 update_rq_clock(rq); 2423 update_rq_clock(rq);
2418 2424
2425 /*
2426 * We migrated the task without holding either rq->lock, however
2427 * since the task is not on the task list itself, nobody else
2428 * will try and migrate the task, hence the rq should match the
2429 * cpu we just moved it to.
2430 */
2431 WARN_ON(task_cpu(p) != cpu);
2419 WARN_ON(p->state != TASK_WAKING); 2432 WARN_ON(p->state != TASK_WAKING);
2420 cpu = task_cpu(p);
2421 2433
2422#ifdef CONFIG_SCHEDSTATS 2434#ifdef CONFIG_SCHEDSTATS
2423 schedstat_inc(rq, ttwu_count); 2435 schedstat_inc(rq, ttwu_count);
@@ -2620,9 +2632,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
2620 if (p->sched_class->task_fork) 2632 if (p->sched_class->task_fork)
2621 p->sched_class->task_fork(p); 2633 p->sched_class->task_fork(p);
2622 2634
2623#ifdef CONFIG_SMP
2624 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2625#endif
2626 set_task_cpu(p, cpu); 2635 set_task_cpu(p, cpu);
2627 2636
2628#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2637#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2652,8 +2661,29 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2652{ 2661{
2653 unsigned long flags; 2662 unsigned long flags;
2654 struct rq *rq; 2663 struct rq *rq;
2664 int cpu __maybe_unused = get_cpu();
2665
2666#ifdef CONFIG_SMP
2667 /*
2668 * Fork balancing, do it here and not earlier because:
2669 * - cpus_allowed can change in the fork path
2670 * - any previously selected cpu might disappear through hotplug
2671 *
2672 * We still have TASK_WAKING but PF_STARTING is gone now, meaning
2673 * ->cpus_allowed is stable; we also have preemption disabled, meaning
2674 * cpu_online_mask is stable.
2675 */
2676 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2677 set_task_cpu(p, cpu);
2678#endif
2679
2680 /*
2681 * Since the task is not on the rq and we still have TASK_WAKING set
2682 * nobody else will migrate this task.
2683 */
2684 rq = cpu_rq(cpu);
2685 raw_spin_lock_irqsave(&rq->lock, flags);
2655 2686
2656 rq = task_rq_lock(p, &flags);
2657 BUG_ON(p->state != TASK_WAKING); 2687 BUG_ON(p->state != TASK_WAKING);
2658 p->state = TASK_RUNNING; 2688 p->state = TASK_RUNNING;
2659 update_rq_clock(rq); 2689 update_rq_clock(rq);
@@ -2665,6 +2695,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2665 p->sched_class->task_woken(rq, p); 2695 p->sched_class->task_woken(rq, p);
2666#endif 2696#endif
2667 task_rq_unlock(rq, &flags); 2697 task_rq_unlock(rq, &flags);
2698 put_cpu();
2668} 2699}
2669 2700
2670#ifdef CONFIG_PREEMPT_NOTIFIERS 2701#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -3094,50 +3125,6 @@ static void update_cpu_load(struct rq *this_rq)
3094#ifdef CONFIG_SMP 3125#ifdef CONFIG_SMP
3095 3126
3096/* 3127/*
3097 * double_rq_lock - safely lock two runqueues
3098 *
3099 * Note this does not disable interrupts like task_rq_lock,
3100 * you need to do so manually before calling.
3101 */
3102static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3103 __acquires(rq1->lock)
3104 __acquires(rq2->lock)
3105{
3106 BUG_ON(!irqs_disabled());
3107 if (rq1 == rq2) {
3108 raw_spin_lock(&rq1->lock);
3109 __acquire(rq2->lock); /* Fake it out ;) */
3110 } else {
3111 if (rq1 < rq2) {
3112 raw_spin_lock(&rq1->lock);
3113 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3114 } else {
3115 raw_spin_lock(&rq2->lock);
3116 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3117 }
3118 }
3119 update_rq_clock(rq1);
3120 update_rq_clock(rq2);
3121}
3122
3123/*
3124 * double_rq_unlock - safely unlock two runqueues
3125 *
3126 * Note this does not restore interrupts like task_rq_unlock,
3127 * you need to do so manually after calling.
3128 */
3129static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3130 __releases(rq1->lock)
3131 __releases(rq2->lock)
3132{
3133 raw_spin_unlock(&rq1->lock);
3134 if (rq1 != rq2)
3135 raw_spin_unlock(&rq2->lock);
3136 else
3137 __release(rq2->lock);
3138}
3139
3140/*
3141 * sched_exec - execve() is a valuable balancing opportunity, because at 3128 * sched_exec - execve() is a valuable balancing opportunity, because at
3142 * this point the task has the smallest effective memory and cache footprint. 3129 * this point the task has the smallest effective memory and cache footprint.
3143 */ 3130 */
@@ -3185,1771 +3172,6 @@ again:
3185 task_rq_unlock(rq, &flags); 3172 task_rq_unlock(rq, &flags);
3186} 3173}
3187 3174
3188/*
3189 * pull_task - move a task from a remote runqueue to the local runqueue.
3190 * Both runqueues must be locked.
3191 */
3192static void pull_task(struct rq *src_rq, struct task_struct *p,
3193 struct rq *this_rq, int this_cpu)
3194{
3195 deactivate_task(src_rq, p, 0);
3196 set_task_cpu(p, this_cpu);
3197 activate_task(this_rq, p, 0);
3198 check_preempt_curr(this_rq, p, 0);
3199}
3200
3201/*
3202 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3203 */
3204static
3205int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3206 struct sched_domain *sd, enum cpu_idle_type idle,
3207 int *all_pinned)
3208{
3209 int tsk_cache_hot = 0;
3210 /*
3211 * We do not migrate tasks that are:
3212 * 1) running (obviously), or
3213 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3214 * 3) are cache-hot on their current CPU.
3215 */
3216 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3217 schedstat_inc(p, se.nr_failed_migrations_affine);
3218 return 0;
3219 }
3220 *all_pinned = 0;
3221
3222 if (task_running(rq, p)) {
3223 schedstat_inc(p, se.nr_failed_migrations_running);
3224 return 0;
3225 }
3226
3227 /*
3228 * Aggressive migration if:
3229 * 1) task is cache cold, or
3230 * 2) too many balance attempts have failed.
3231 */
3232
3233 tsk_cache_hot = task_hot(p, rq->clock, sd);
3234 if (!tsk_cache_hot ||
3235 sd->nr_balance_failed > sd->cache_nice_tries) {
3236#ifdef CONFIG_SCHEDSTATS
3237 if (tsk_cache_hot) {
3238 schedstat_inc(sd, lb_hot_gained[idle]);
3239 schedstat_inc(p, se.nr_forced_migrations);
3240 }
3241#endif
3242 return 1;
3243 }
3244
3245 if (tsk_cache_hot) {
3246 schedstat_inc(p, se.nr_failed_migrations_hot);
3247 return 0;
3248 }
3249 return 1;
3250}
3251
3252static unsigned long
3253balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3254 unsigned long max_load_move, struct sched_domain *sd,
3255 enum cpu_idle_type idle, int *all_pinned,
3256 int *this_best_prio, struct rq_iterator *iterator)
3257{
3258 int loops = 0, pulled = 0, pinned = 0;
3259 struct task_struct *p;
3260 long rem_load_move = max_load_move;
3261
3262 if (max_load_move == 0)
3263 goto out;
3264
3265 pinned = 1;
3266
3267 /*
3268 * Start the load-balancing iterator:
3269 */
3270 p = iterator->start(iterator->arg);
3271next:
3272 if (!p || loops++ > sysctl_sched_nr_migrate)
3273 goto out;
3274
3275 if ((p->se.load.weight >> 1) > rem_load_move ||
3276 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3277 p = iterator->next(iterator->arg);
3278 goto next;
3279 }
3280
3281 pull_task(busiest, p, this_rq, this_cpu);
3282 pulled++;
3283 rem_load_move -= p->se.load.weight;
3284
3285#ifdef CONFIG_PREEMPT
3286 /*
3287 * NEWIDLE balancing is a source of latency, so preemptible kernels
3288 * will stop after the first task is pulled to minimize the critical
3289 * section.
3290 */
3291 if (idle == CPU_NEWLY_IDLE)
3292 goto out;
3293#endif
3294
3295 /*
3296 * We only want to steal up to the prescribed amount of weighted load.
3297 */
3298 if (rem_load_move > 0) {
3299 if (p->prio < *this_best_prio)
3300 *this_best_prio = p->prio;
3301 p = iterator->next(iterator->arg);
3302 goto next;
3303 }
3304out:
3305 /*
3306 * Right now, this is one of only two places pull_task() is called,
3307 * so we can safely collect pull_task() stats here rather than
3308 * inside pull_task().
3309 */
3310 schedstat_add(sd, lb_gained[idle], pulled);
3311
3312 if (all_pinned)
3313 *all_pinned = pinned;
3314
3315 return max_load_move - rem_load_move;
3316}
3317
3318/*
3319 * move_tasks tries to move up to max_load_move weighted load from busiest to
3320 * this_rq, as part of a balancing operation within domain "sd".
3321 * Returns 1 if successful and 0 otherwise.
3322 *
3323 * Called with both runqueues locked.
3324 */
3325static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3326 unsigned long max_load_move,
3327 struct sched_domain *sd, enum cpu_idle_type idle,
3328 int *all_pinned)
3329{
3330 const struct sched_class *class = sched_class_highest;
3331 unsigned long total_load_moved = 0;
3332 int this_best_prio = this_rq->curr->prio;
3333
3334 do {
3335 total_load_moved +=
3336 class->load_balance(this_rq, this_cpu, busiest,
3337 max_load_move - total_load_moved,
3338 sd, idle, all_pinned, &this_best_prio);
3339 class = class->next;
3340
3341#ifdef CONFIG_PREEMPT
3342 /*
3343 * NEWIDLE balancing is a source of latency, so preemptible
3344 * kernels will stop after the first task is pulled to minimize
3345 * the critical section.
3346 */
3347 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3348 break;
3349#endif
3350 } while (class && max_load_move > total_load_moved);
3351
3352 return total_load_moved > 0;
3353}
3354
3355static int
3356iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3357 struct sched_domain *sd, enum cpu_idle_type idle,
3358 struct rq_iterator *iterator)
3359{
3360 struct task_struct *p = iterator->start(iterator->arg);
3361 int pinned = 0;
3362
3363 while (p) {
3364 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3365 pull_task(busiest, p, this_rq, this_cpu);
3366 /*
3367 * Right now, this is only the second place pull_task()
3368 * is called, so we can safely collect pull_task()
3369 * stats here rather than inside pull_task().
3370 */
3371 schedstat_inc(sd, lb_gained[idle]);
3372
3373 return 1;
3374 }
3375 p = iterator->next(iterator->arg);
3376 }
3377
3378 return 0;
3379}
3380
3381/*
3382 * move_one_task tries to move exactly one task from busiest to this_rq, as
3383 * part of active balancing operations within "domain".
3384 * Returns 1 if successful and 0 otherwise.
3385 *
3386 * Called with both runqueues locked.
3387 */
3388static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3389 struct sched_domain *sd, enum cpu_idle_type idle)
3390{
3391 const struct sched_class *class;
3392
3393 for_each_class(class) {
3394 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3395 return 1;
3396 }
3397
3398 return 0;
3399}
3400/********** Helpers for find_busiest_group ************************/
3401/*
3402 * sd_lb_stats - Structure to store the statistics of a sched_domain
3403 * during load balancing.
3404 */
3405struct sd_lb_stats {
3406 struct sched_group *busiest; /* Busiest group in this sd */
3407 struct sched_group *this; /* Local group in this sd */
3408 unsigned long total_load; /* Total load of all groups in sd */
3409 unsigned long total_pwr; /* Total power of all groups in sd */
3410 unsigned long avg_load; /* Average load across all groups in sd */
3411
3412 /** Statistics of this group */
3413 unsigned long this_load;
3414 unsigned long this_load_per_task;
3415 unsigned long this_nr_running;
3416
3417 /* Statistics of the busiest group */
3418 unsigned long max_load;
3419 unsigned long busiest_load_per_task;
3420 unsigned long busiest_nr_running;
3421
3422 int group_imb; /* Is there imbalance in this sd */
3423#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3424 int power_savings_balance; /* Is powersave balance needed for this sd */
3425 struct sched_group *group_min; /* Least loaded group in sd */
3426 struct sched_group *group_leader; /* Group which relieves group_min */
3427 unsigned long min_load_per_task; /* load_per_task in group_min */
3428 unsigned long leader_nr_running; /* Nr running of group_leader */
3429 unsigned long min_nr_running; /* Nr running of group_min */
3430#endif
3431};
3432
3433/*
3434 * sg_lb_stats - stats of a sched_group required for load_balancing
3435 */
3436struct sg_lb_stats {
3437 unsigned long avg_load; /*Avg load across the CPUs of the group */
3438 unsigned long group_load; /* Total load over the CPUs of the group */
3439 unsigned long sum_nr_running; /* Nr tasks running in the group */
3440 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3441 unsigned long group_capacity;
3442 int group_imb; /* Is there an imbalance in the group ? */
3443};
3444
3445/**
3446 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3447 * @group: The group whose first cpu is to be returned.
3448 */
3449static inline unsigned int group_first_cpu(struct sched_group *group)
3450{
3451 return cpumask_first(sched_group_cpus(group));
3452}
3453
3454/**
3455 * get_sd_load_idx - Obtain the load index for a given sched domain.
3456 * @sd: The sched_domain whose load_idx is to be obtained.
3457 * @idle: The idle status of the CPU whose sd load_idx is obtained.
3458 */
3459static inline int get_sd_load_idx(struct sched_domain *sd,
3460 enum cpu_idle_type idle)
3461{
3462 int load_idx;
3463
3464 switch (idle) {
3465 case CPU_NOT_IDLE:
3466 load_idx = sd->busy_idx;
3467 break;
3468
3469 case CPU_NEWLY_IDLE:
3470 load_idx = sd->newidle_idx;
3471 break;
3472 default:
3473 load_idx = sd->idle_idx;
3474 break;
3475 }
3476
3477 return load_idx;
3478}
3479
3480
3481#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3482/**
3483 * init_sd_power_savings_stats - Initialize power savings statistics for
3484 * the given sched_domain, during load balancing.
3485 *
3486 * @sd: Sched domain whose power-savings statistics are to be initialized.
3487 * @sds: Variable containing the statistics for sd.
3488 * @idle: Idle status of the CPU at which we're performing load-balancing.
3489 */
3490static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3491 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3492{
3493 /*
3494 * Busy processors will not participate in power savings
3495 * balance.
3496 */
3497 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3498 sds->power_savings_balance = 0;
3499 else {
3500 sds->power_savings_balance = 1;
3501 sds->min_nr_running = ULONG_MAX;
3502 sds->leader_nr_running = 0;
3503 }
3504}
3505
3506/**
3507 * update_sd_power_savings_stats - Update the power saving stats for a
3508 * sched_domain while performing load balancing.
3509 *
3510 * @group: sched_group belonging to the sched_domain under consideration.
3511 * @sds: Variable containing the statistics of the sched_domain
3512 * @local_group: Does group contain the CPU for which we're performing
3513 * load balancing ?
3514 * @sgs: Variable containing the statistics of the group.
3515 */
3516static inline void update_sd_power_savings_stats(struct sched_group *group,
3517 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3518{
3519
3520 if (!sds->power_savings_balance)
3521 return;
3522
3523 /*
3524 * If the local group is idle or completely loaded
3525 * no need to do power savings balance at this domain
3526 */
3527 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3528 !sds->this_nr_running))
3529 sds->power_savings_balance = 0;
3530
3531 /*
3532 * If a group is already running at full capacity or idle,
3533 * don't include that group in power savings calculations
3534 */
3535 if (!sds->power_savings_balance ||
3536 sgs->sum_nr_running >= sgs->group_capacity ||
3537 !sgs->sum_nr_running)
3538 return;
3539
3540 /*
3541 * Calculate the group which has the least non-idle load.
3542 * This is the group from where we need to pick up the load
3543 * for saving power
3544 */
3545 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3546 (sgs->sum_nr_running == sds->min_nr_running &&
3547 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3548 sds->group_min = group;
3549 sds->min_nr_running = sgs->sum_nr_running;
3550 sds->min_load_per_task = sgs->sum_weighted_load /
3551 sgs->sum_nr_running;
3552 }
3553
3554 /*
3555 * Calculate the group which is nearly at its
3556 * capacity but still has some space to pick up some load
3557 * from other groups and save more power
3558 */
3559 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3560 return;
3561
3562 if (sgs->sum_nr_running > sds->leader_nr_running ||
3563 (sgs->sum_nr_running == sds->leader_nr_running &&
3564 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3565 sds->group_leader = group;
3566 sds->leader_nr_running = sgs->sum_nr_running;
3567 }
3568}
3569
3570/**
3571 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3572 * @sds: Variable containing the statistics of the sched_domain
3573 * under consideration.
3574 * @this_cpu: Cpu at which we're currently performing load-balancing.
3575 * @imbalance: Variable to store the imbalance.
3576 *
3577 * Description:
3578 * Check if we have potential to perform some power-savings balance.
3579 * If yes, set the busiest group to be the least loaded group in the
3580 * sched_domain, so that its CPUs can be put to idle.
3581 *
3582 * Returns 1 if there is potential to perform power-savings balance.
3583 * Else returns 0.
3584 */
3585static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3586 int this_cpu, unsigned long *imbalance)
3587{
3588 if (!sds->power_savings_balance)
3589 return 0;
3590
3591 if (sds->this != sds->group_leader ||
3592 sds->group_leader == sds->group_min)
3593 return 0;
3594
3595 *imbalance = sds->min_load_per_task;
3596 sds->busiest = sds->group_min;
3597
3598 return 1;
3599
3600}
3601#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3602static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3603 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3604{
3605 return;
3606}
3607
3608static inline void update_sd_power_savings_stats(struct sched_group *group,
3609 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3610{
3611 return;
3612}
3613
3614static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3615 int this_cpu, unsigned long *imbalance)
3616{
3617 return 0;
3618}
3619#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3620
3621
3622unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3623{
3624 return SCHED_LOAD_SCALE;
3625}
3626
3627unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3628{
3629 return default_scale_freq_power(sd, cpu);
3630}
3631
3632unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3633{
3634 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3635 unsigned long smt_gain = sd->smt_gain;
3636
3637 smt_gain /= weight;
3638
3639 return smt_gain;
3640}
3641
3642unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3643{
3644 return default_scale_smt_power(sd, cpu);
3645}
3646
3647unsigned long scale_rt_power(int cpu)
3648{
3649 struct rq *rq = cpu_rq(cpu);
3650 u64 total, available;
3651
3652 sched_avg_update(rq);
3653
3654 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3655 available = total - rq->rt_avg;
3656
3657 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3658 total = SCHED_LOAD_SCALE;
3659
3660 total >>= SCHED_LOAD_SHIFT;
3661
3662 return div_u64(available, total);
3663}
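
/*
 * A minimal standalone sketch of the arithmetic in scale_rt_power() above,
 * assuming SCHED_LOAD_SCALE = 1024: the return value is the fraction of
 * recent time left over for fair tasks after real-time activity, expressed
 * in 1/1024ths. The window and RT figures below are invented.
 */
#include <stdio.h>
#include <stdint.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1ULL << SCHED_LOAD_SHIFT)

int main(void)
{
	uint64_t total  = 1000000;	/* assumed: ns of wall time in the window */
	uint64_t rt_avg =  150000;	/* assumed: ns consumed by RT tasks */
	uint64_t available = total - rt_avg;

	if (total < SCHED_LOAD_SCALE)	/* guard against a tiny window */
		total = SCHED_LOAD_SCALE;
	total >>= SCHED_LOAD_SHIFT;

	/* ~870/1024 here, i.e. about 85% of the cpu is left for CFS */
	printf("rt scale factor = %llu / 1024\n",
	       (unsigned long long)(available / total));
	return 0;
}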
3664
3665static void update_cpu_power(struct sched_domain *sd, int cpu)
3666{
3667 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3668 unsigned long power = SCHED_LOAD_SCALE;
3669 struct sched_group *sdg = sd->groups;
3670
3671 if (sched_feat(ARCH_POWER))
3672 power *= arch_scale_freq_power(sd, cpu);
3673 else
3674 power *= default_scale_freq_power(sd, cpu);
3675
3676 power >>= SCHED_LOAD_SHIFT;
3677
3678 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3679 if (sched_feat(ARCH_POWER))
3680 power *= arch_scale_smt_power(sd, cpu);
3681 else
3682 power *= default_scale_smt_power(sd, cpu);
3683
3684 power >>= SCHED_LOAD_SHIFT;
3685 }
3686
3687 power *= scale_rt_power(cpu);
3688 power >>= SCHED_LOAD_SHIFT;
3689
3690 if (!power)
3691 power = 1;
3692
3693 sdg->cpu_power = power;
3694}
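
/*
 * A minimal userspace sketch of the fixed-point chain in update_cpu_power()
 * above, assuming SCHED_LOAD_SCALE = 1024: each factor is a fraction of 1024
 * and the running product is renormalized by shifting right by
 * SCHED_LOAD_SHIFT. The three scale factors are invented example values,
 * not numbers taken from any real machine.
 */
#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

static unsigned long apply_scale(unsigned long power, unsigned long factor)
{
	return (power * factor) >> SCHED_LOAD_SHIFT;	/* power * factor/1024 */
}

int main(void)
{
	unsigned long power = SCHED_LOAD_SCALE;	/* nominal capacity */
	unsigned long freq_scale = 1024;	/* assumed: running at full frequency */
	unsigned long smt_scale  = 589;		/* assumed: smt_gain 1178 / 2 siblings */
	unsigned long rt_scale   = 900;		/* assumed: ~12% of time taken by RT */

	power = apply_scale(power, freq_scale);
	power = apply_scale(power, smt_scale);
	power = apply_scale(power, rt_scale);
	if (!power)				/* never advertise zero power */
		power = 1;

	printf("effective cpu_power = %lu of %lu\n", power, SCHED_LOAD_SCALE);
	return 0;
}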
3695
3696static void update_group_power(struct sched_domain *sd, int cpu)
3697{
3698 struct sched_domain *child = sd->child;
3699 struct sched_group *group, *sdg = sd->groups;
3700 unsigned long power;
3701
3702 if (!child) {
3703 update_cpu_power(sd, cpu);
3704 return;
3705 }
3706
3707 power = 0;
3708
3709 group = child->groups;
3710 do {
3711 power += group->cpu_power;
3712 group = group->next;
3713 } while (group != child->groups);
3714
3715 sdg->cpu_power = power;
3716}
3717
3718/**
3719 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3720 * @sd: The sched_domain whose statistics are to be updated.
3721 * @group: sched_group whose statistics are to be updated.
3722 * @this_cpu: Cpu for which load balance is currently performed.
3723 * @idle: Idle status of this_cpu
3724 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3725 * @sd_idle: Idle status of the sched_domain containing group.
3726 * @local_group: Does group contain this_cpu.
3727 * @cpus: Set of cpus considered for load balancing.
3728 * @balance: Should we balance.
3729 * @sgs: variable to hold the statistics for this group.
3730 */
3731static inline void update_sg_lb_stats(struct sched_domain *sd,
3732 struct sched_group *group, int this_cpu,
3733 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3734 int local_group, const struct cpumask *cpus,
3735 int *balance, struct sg_lb_stats *sgs)
3736{
3737 unsigned long load, max_cpu_load, min_cpu_load;
3738 int i;
3739 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3740 unsigned long sum_avg_load_per_task;
3741 unsigned long avg_load_per_task;
3742
3743 if (local_group) {
3744 balance_cpu = group_first_cpu(group);
3745 if (balance_cpu == this_cpu)
3746 update_group_power(sd, this_cpu);
3747 }
3748
3749 /* Tally up the load of all CPUs in the group */
3750 sum_avg_load_per_task = avg_load_per_task = 0;
3751 max_cpu_load = 0;
3752 min_cpu_load = ~0UL;
3753
3754 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3755 struct rq *rq = cpu_rq(i);
3756
3757 if (*sd_idle && rq->nr_running)
3758 *sd_idle = 0;
3759
3760 /* Bias balancing toward cpus of our domain */
3761 if (local_group) {
3762 if (idle_cpu(i) && !first_idle_cpu) {
3763 first_idle_cpu = 1;
3764 balance_cpu = i;
3765 }
3766
3767 load = target_load(i, load_idx);
3768 } else {
3769 load = source_load(i, load_idx);
3770 if (load > max_cpu_load)
3771 max_cpu_load = load;
3772 if (min_cpu_load > load)
3773 min_cpu_load = load;
3774 }
3775
3776 sgs->group_load += load;
3777 sgs->sum_nr_running += rq->nr_running;
3778 sgs->sum_weighted_load += weighted_cpuload(i);
3779
3780 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3781 }
3782
3783 /*
3784	 * Only the first idle cpu, or the first cpu (the busiest one) in
3785	 * this sched group, is eligible to do load balancing at this and
3786	 * higher domains. In the newly-idle case, we allow all cpus to do
3787	 * the newly-idle load balance.
3788 */
3789 if (idle != CPU_NEWLY_IDLE && local_group &&
3790 balance_cpu != this_cpu && balance) {
3791 *balance = 0;
3792 return;
3793 }
3794
3795 /* Adjust by relative CPU power of the group */
3796 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3797
3798
3799 /*
3800 * Consider the group unbalanced when the imbalance is larger
3801 * than the average weight of two tasks.
3802 *
3803 * APZ: with cgroup the avg task weight can vary wildly and
3804 * might not be a suitable number - should we keep a
3805 * normalized nr_running number somewhere that negates
3806 * the hierarchy?
3807 */
3808 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3809 group->cpu_power;
3810
3811 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3812 sgs->group_imb = 1;
3813
3814 sgs->group_capacity =
3815 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3816}
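
/*
 * A rough, self-contained illustration of the normalization at the end of
 * update_sg_lb_stats() above: group_load is rescaled by the group's
 * cpu_power into avg_load, and cpu_power is rounded into a whole-task
 * capacity. The load and power figures are invented.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL
#define DIV_ROUND_CLOSEST(x, d)	(((x) + ((d) / 2)) / (d))

int main(void)
{
	unsigned long group_load = 3000;	/* assumed sum of per-cpu loads */
	unsigned long cpu_power  = 2048;	/* e.g. two cpus at nominal power */

	unsigned long avg_load = (group_load * SCHED_LOAD_SCALE) / cpu_power;
	unsigned long capacity = DIV_ROUND_CLOSEST(cpu_power, SCHED_LOAD_SCALE);

	printf("avg_load = %lu, group_capacity = %lu task(s)\n",
	       avg_load, capacity);
	return 0;
}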
3817
3818/**
3819 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3820 * @sd: sched_domain whose statistics are to be updated.
3821 * @this_cpu: Cpu for which load balance is currently performed.
3822 * @idle: Idle status of this_cpu
3823 * @sd_idle: Idle status of the sched_domain containing group.
3824 * @cpus: Set of cpus considered for load balancing.
3825 * @balance: Should we balance.
3826 * @sds: variable to hold the statistics for this sched_domain.
3827 */
3828static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3829 enum cpu_idle_type idle, int *sd_idle,
3830 const struct cpumask *cpus, int *balance,
3831 struct sd_lb_stats *sds)
3832{
3833 struct sched_domain *child = sd->child;
3834 struct sched_group *group = sd->groups;
3835 struct sg_lb_stats sgs;
3836 int load_idx, prefer_sibling = 0;
3837
3838 if (child && child->flags & SD_PREFER_SIBLING)
3839 prefer_sibling = 1;
3840
3841 init_sd_power_savings_stats(sd, sds, idle);
3842 load_idx = get_sd_load_idx(sd, idle);
3843
3844 do {
3845 int local_group;
3846
3847 local_group = cpumask_test_cpu(this_cpu,
3848 sched_group_cpus(group));
3849 memset(&sgs, 0, sizeof(sgs));
3850 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3851 local_group, cpus, balance, &sgs);
3852
3853 if (local_group && balance && !(*balance))
3854 return;
3855
3856 sds->total_load += sgs.group_load;
3857 sds->total_pwr += group->cpu_power;
3858
3859 /*
3860 * In case the child domain prefers tasks go to siblings
3861 * first, lower the group capacity to one so that we'll try
3862 * and move all the excess tasks away.
3863 */
3864 if (prefer_sibling)
3865 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3866
3867 if (local_group) {
3868 sds->this_load = sgs.avg_load;
3869 sds->this = group;
3870 sds->this_nr_running = sgs.sum_nr_running;
3871 sds->this_load_per_task = sgs.sum_weighted_load;
3872 } else if (sgs.avg_load > sds->max_load &&
3873 (sgs.sum_nr_running > sgs.group_capacity ||
3874 sgs.group_imb)) {
3875 sds->max_load = sgs.avg_load;
3876 sds->busiest = group;
3877 sds->busiest_nr_running = sgs.sum_nr_running;
3878 sds->busiest_load_per_task = sgs.sum_weighted_load;
3879 sds->group_imb = sgs.group_imb;
3880 }
3881
3882 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3883 group = group->next;
3884 } while (group != sd->groups);
3885}
3886
3887/**
3888 * fix_small_imbalance - Calculate the minor imbalance that exists
3889 * amongst the groups of a sched_domain, during
3890 * load balancing.
3891 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3892 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3893 * @imbalance: Variable to store the imbalance.
3894 */
3895static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3896 int this_cpu, unsigned long *imbalance)
3897{
3898 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3899 unsigned int imbn = 2;
3900
3901 if (sds->this_nr_running) {
3902 sds->this_load_per_task /= sds->this_nr_running;
3903 if (sds->busiest_load_per_task >
3904 sds->this_load_per_task)
3905 imbn = 1;
3906 } else
3907 sds->this_load_per_task =
3908 cpu_avg_load_per_task(this_cpu);
3909
3910 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3911 sds->busiest_load_per_task * imbn) {
3912 *imbalance = sds->busiest_load_per_task;
3913 return;
3914 }
3915
3916 /*
3917 * OK, we don't have enough imbalance to justify moving tasks,
3918 * however we may be able to increase total CPU power used by
3919 * moving them.
3920 */
3921
3922 pwr_now += sds->busiest->cpu_power *
3923 min(sds->busiest_load_per_task, sds->max_load);
3924 pwr_now += sds->this->cpu_power *
3925 min(sds->this_load_per_task, sds->this_load);
3926 pwr_now /= SCHED_LOAD_SCALE;
3927
3928 /* Amount of load we'd subtract */
3929 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3930 sds->busiest->cpu_power;
3931 if (sds->max_load > tmp)
3932 pwr_move += sds->busiest->cpu_power *
3933 min(sds->busiest_load_per_task, sds->max_load - tmp);
3934
3935 /* Amount of load we'd add */
3936 if (sds->max_load * sds->busiest->cpu_power <
3937 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3938 tmp = (sds->max_load * sds->busiest->cpu_power) /
3939 sds->this->cpu_power;
3940 else
3941 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3942 sds->this->cpu_power;
3943 pwr_move += sds->this->cpu_power *
3944 min(sds->this_load_per_task, sds->this_load + tmp);
3945 pwr_move /= SCHED_LOAD_SCALE;
3946
3947 /* Move if we gain throughput */
3948 if (pwr_move > pwr_now)
3949 *imbalance = sds->busiest_load_per_task;
3950}
3951
3952/**
3953 * calculate_imbalance - Calculate the amount of imbalance present within the
3954 * groups of a given sched_domain during load balance.
3955 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3956 * @this_cpu: Cpu for which currently load balance is being performed.
3957 * @imbalance: The variable to store the imbalance.
3958 */
3959static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3960 unsigned long *imbalance)
3961{
3962 unsigned long max_pull;
3963 /*
3964 * In the presence of smp nice balancing, certain scenarios can have
3965	 * max load less than avg load (as we skip groups at or below
3966	 * their cpu_power while calculating max_load).
3967 */
3968 if (sds->max_load < sds->avg_load) {
3969 *imbalance = 0;
3970 return fix_small_imbalance(sds, this_cpu, imbalance);
3971 }
3972
3973 /* Don't want to pull so many tasks that a group would go idle */
3974 max_pull = min(sds->max_load - sds->avg_load,
3975 sds->max_load - sds->busiest_load_per_task);
3976
3977 /* How much load to actually move to equalise the imbalance */
3978 *imbalance = min(max_pull * sds->busiest->cpu_power,
3979 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3980 / SCHED_LOAD_SCALE;
3981
3982 /*
3983	 * If *imbalance is less than the average load per runnable task,
3984	 * there is no guarantee that any tasks will be moved, so consider
3985	 * bumping its value to force at least one task to be
3986	 * moved.
3987 */
3988 if (*imbalance < sds->busiest_load_per_task)
3989 return fix_small_imbalance(sds, this_cpu, imbalance);
3990
3991}
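
/*
 * The core arithmetic of calculate_imbalance() above, stripped of the
 * scheduler context so it can be followed in isolation. All load and power
 * values are invented and use the same 1/1024 fixed-point units as avg_load.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long max_load  = 1800;		/* busiest group, assumed */
	unsigned long avg_load  = 1200;		/* domain average, assumed */
	unsigned long this_load =  700;		/* local group, assumed */
	unsigned long busiest_load_per_task = 400;
	unsigned long busiest_power = 1024, this_power = 1024;

	/* don't pull so much that the busiest group drops below the average */
	unsigned long max_pull = min_ul(max_load - avg_load,
					max_load - busiest_load_per_task);

	/* move the smaller of what busiest can give and what we can take */
	unsigned long imbalance = min_ul(max_pull * busiest_power,
				(avg_load - this_load) * this_power)
				/ SCHED_LOAD_SCALE;

	printf("max_pull = %lu, imbalance = %lu\n", max_pull, imbalance);
	return 0;
}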
3992/******* find_busiest_group() helpers end here *********************/
3993
3994/**
3995 * find_busiest_group - Returns the busiest group within the sched_domain
3996 * if there is an imbalance. If there isn't an imbalance, and
3997 * the user has opted for power-savings, it returns a group whose
3998 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3999 * such a group exists.
4000 *
4001 * Also calculates the amount of weighted load which should be moved
4002 * to restore balance.
4003 *
4004 * @sd: The sched_domain whose busiest group is to be returned.
4005 * @this_cpu: The cpu for which load balancing is currently being performed.
4006 * @imbalance: Variable which stores amount of weighted load which should
4007 * be moved to restore balance/put a group to idle.
4008 * @idle: The idle status of this_cpu.
4009 * @sd_idle: The idleness of sd
4010 * @cpus: The set of CPUs under consideration for load-balancing.
4011 * @balance: Pointer to a variable indicating if this_cpu
4012 * is the appropriate cpu to perform load balancing at this_level.
4013 *
4014 * Returns: - the busiest group if imbalance exists.
4015 * - If no imbalance and user has opted for power-savings balance,
4016 * return the least loaded group whose CPUs can be
4017 * put to idle by rebalancing its tasks onto our group.
4018 */
4019static struct sched_group *
4020find_busiest_group(struct sched_domain *sd, int this_cpu,
4021 unsigned long *imbalance, enum cpu_idle_type idle,
4022 int *sd_idle, const struct cpumask *cpus, int *balance)
4023{
4024 struct sd_lb_stats sds;
4025
4026 memset(&sds, 0, sizeof(sds));
4027
4028 /*
4029	 * Compute the various statistics relevant for load balancing at
4030 * this level.
4031 */
4032 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4033 balance, &sds);
4034
4035 /* Cases where imbalance does not exist from POV of this_cpu */
4036 /* 1) this_cpu is not the appropriate cpu to perform load balancing
4037 * at this level.
4038 * 2) There is no busy sibling group to pull from.
4039 * 3) This group is the busiest group.
4040	 * 4) This group is busier than the average busyness of this
4041	 *    sched_domain.
4042 * 5) The imbalance is within the specified limit.
4043 * 6) Any rebalance would lead to ping-pong
4044 */
4045 if (balance && !(*balance))
4046 goto ret;
4047
4048 if (!sds.busiest || sds.busiest_nr_running == 0)
4049 goto out_balanced;
4050
4051 if (sds.this_load >= sds.max_load)
4052 goto out_balanced;
4053
4054 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4055
4056 if (sds.this_load >= sds.avg_load)
4057 goto out_balanced;
4058
4059 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4060 goto out_balanced;
4061
4062 sds.busiest_load_per_task /= sds.busiest_nr_running;
4063 if (sds.group_imb)
4064 sds.busiest_load_per_task =
4065 min(sds.busiest_load_per_task, sds.avg_load);
4066
4067 /*
4068 * We're trying to get all the cpus to the average_load, so we don't
4069 * want to push ourselves above the average load, nor do we wish to
4070 * reduce the max loaded cpu below the average load, as either of these
4071 * actions would just result in more rebalancing later, and ping-pong
4072 * tasks around. Thus we look for the minimum possible imbalance.
4073 * Negative imbalances (*we* are more loaded than anyone else) will
4074 * be counted as no imbalance for these purposes -- we can't fix that
4075 * by pulling tasks to us. Be careful of negative numbers as they'll
4076 * appear as very large values with unsigned longs.
4077 */
4078 if (sds.max_load <= sds.busiest_load_per_task)
4079 goto out_balanced;
4080
4081 /* Looks like there is an imbalance. Compute it */
4082 calculate_imbalance(&sds, this_cpu, imbalance);
4083 return sds.busiest;
4084
4085out_balanced:
4086 /*
4087 * There is no obvious imbalance. But check if we can do some balancing
4088 * to save power.
4089 */
4090 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4091 return sds.busiest;
4092ret:
4093 *imbalance = 0;
4094 return NULL;
4095}
4096
4097/*
4098 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4099 */
4100static struct rq *
4101find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4102 unsigned long imbalance, const struct cpumask *cpus)
4103{
4104 struct rq *busiest = NULL, *rq;
4105 unsigned long max_load = 0;
4106 int i;
4107
4108 for_each_cpu(i, sched_group_cpus(group)) {
4109 unsigned long power = power_of(i);
4110 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4111 unsigned long wl;
4112
4113 if (!cpumask_test_cpu(i, cpus))
4114 continue;
4115
4116 rq = cpu_rq(i);
4117 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4118 wl /= power;
4119
4120 if (capacity && rq->nr_running == 1 && wl > imbalance)
4121 continue;
4122
4123 if (wl > max_load) {
4124 max_load = wl;
4125 busiest = rq;
4126 }
4127 }
4128
4129 return busiest;
4130}
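
/*
 * A hedged sketch of the comparison done in find_busiest_queue() above:
 * each cpu's raw weighted load is rescaled by its cpu_power before the
 * maximum is taken, so a heavily loaded but weak cpu is judged fairly
 * against a stronger one. The loads and powers below are made up.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

int main(void)
{
	unsigned long load[3]  = { 2048, 1536, 1024 };	/* assumed weighted loads */
	unsigned long power[3] = { 1024,  512, 1024 };	/* assumed cpu_power */
	unsigned long max_wl = 0;
	int busiest = -1, i;

	for (i = 0; i < 3; i++) {
		unsigned long wl = load[i] * SCHED_LOAD_SCALE / power[i];

		if (wl > max_wl) {
			max_wl = wl;
			busiest = i;
		}
	}
	printf("busiest cpu = %d (scaled load %lu)\n", busiest, max_wl);
	return 0;
}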
4131
4132/*
4133 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4134 * so long as it is large enough.
4135 */
4136#define MAX_PINNED_INTERVAL 512
4137
4138/* Working cpumask for load_balance and load_balance_newidle. */
4139static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4140
4141/*
4142 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4143 * tasks if there is an imbalance.
4144 */
4145static int load_balance(int this_cpu, struct rq *this_rq,
4146 struct sched_domain *sd, enum cpu_idle_type idle,
4147 int *balance)
4148{
4149 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4150 struct sched_group *group;
4151 unsigned long imbalance;
4152 struct rq *busiest;
4153 unsigned long flags;
4154 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4155
4156 cpumask_copy(cpus, cpu_active_mask);
4157
4158 /*
4159	 * When the power savings policy is enabled for the parent domain, an
4160	 * idle sibling can pick up load irrespective of the busy siblings. In
4161	 * this case, let the state of the idle sibling percolate up as CPU_IDLE
4162	 * instead of portraying it as CPU_NOT_IDLE.
4163 */
4164 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4165 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4166 sd_idle = 1;
4167
4168 schedstat_inc(sd, lb_count[idle]);
4169
4170redo:
4171 update_shares(sd);
4172 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4173 cpus, balance);
4174
4175 if (*balance == 0)
4176 goto out_balanced;
4177
4178 if (!group) {
4179 schedstat_inc(sd, lb_nobusyg[idle]);
4180 goto out_balanced;
4181 }
4182
4183 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4184 if (!busiest) {
4185 schedstat_inc(sd, lb_nobusyq[idle]);
4186 goto out_balanced;
4187 }
4188
4189 BUG_ON(busiest == this_rq);
4190
4191 schedstat_add(sd, lb_imbalance[idle], imbalance);
4192
4193 ld_moved = 0;
4194 if (busiest->nr_running > 1) {
4195 /*
4196 * Attempt to move tasks. If find_busiest_group has found
4197 * an imbalance but busiest->nr_running <= 1, the group is
4198 * still unbalanced. ld_moved simply stays zero, so it is
4199 * correctly treated as an imbalance.
4200 */
4201 local_irq_save(flags);
4202 double_rq_lock(this_rq, busiest);
4203 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4204 imbalance, sd, idle, &all_pinned);
4205 double_rq_unlock(this_rq, busiest);
4206 local_irq_restore(flags);
4207
4208 /*
4209 * some other cpu did the load balance for us.
4210 */
4211 if (ld_moved && this_cpu != smp_processor_id())
4212 resched_cpu(this_cpu);
4213
4214 /* All tasks on this runqueue were pinned by CPU affinity */
4215 if (unlikely(all_pinned)) {
4216 cpumask_clear_cpu(cpu_of(busiest), cpus);
4217 if (!cpumask_empty(cpus))
4218 goto redo;
4219 goto out_balanced;
4220 }
4221 }
4222
4223 if (!ld_moved) {
4224 schedstat_inc(sd, lb_failed[idle]);
4225 sd->nr_balance_failed++;
4226
4227 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4228
4229 raw_spin_lock_irqsave(&busiest->lock, flags);
4230
4231			/* don't kick the migration_thread if the current
4232			 * task on the busiest cpu can't be moved to this_cpu
4233			 */
4234 if (!cpumask_test_cpu(this_cpu,
4235 &busiest->curr->cpus_allowed)) {
4236 raw_spin_unlock_irqrestore(&busiest->lock,
4237 flags);
4238 all_pinned = 1;
4239 goto out_one_pinned;
4240 }
4241
4242 if (!busiest->active_balance) {
4243 busiest->active_balance = 1;
4244 busiest->push_cpu = this_cpu;
4245 active_balance = 1;
4246 }
4247 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4248 if (active_balance)
4249 wake_up_process(busiest->migration_thread);
4250
4251 /*
4252 * We've kicked active balancing, reset the failure
4253 * counter.
4254 */
4255 sd->nr_balance_failed = sd->cache_nice_tries+1;
4256 }
4257 } else
4258 sd->nr_balance_failed = 0;
4259
4260 if (likely(!active_balance)) {
4261 /* We were unbalanced, so reset the balancing interval */
4262 sd->balance_interval = sd->min_interval;
4263 } else {
4264 /*
4265 * If we've begun active balancing, start to back off. This
4266 * case may not be covered by the all_pinned logic if there
4267 * is only 1 task on the busy runqueue (because we don't call
4268 * move_tasks).
4269 */
4270 if (sd->balance_interval < sd->max_interval)
4271 sd->balance_interval *= 2;
4272 }
4273
4274 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4275 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4276 ld_moved = -1;
4277
4278 goto out;
4279
4280out_balanced:
4281 schedstat_inc(sd, lb_balanced[idle]);
4282
4283 sd->nr_balance_failed = 0;
4284
4285out_one_pinned:
4286 /* tune up the balancing interval */
4287 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4288 (sd->balance_interval < sd->max_interval))
4289 sd->balance_interval *= 2;
4290
4291 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4292 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4293 ld_moved = -1;
4294 else
4295 ld_moved = 0;
4296out:
4297 if (ld_moved)
4298 update_shares(sd);
4299 return ld_moved;
4300}
4301
4302/*
4303 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4304 * tasks if there is an imbalance.
4305 *
4306 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4307 * this_rq is locked.
4308 */
4309static int
4310load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4311{
4312 struct sched_group *group;
4313 struct rq *busiest = NULL;
4314 unsigned long imbalance;
4315 int ld_moved = 0;
4316 int sd_idle = 0;
4317 int all_pinned = 0;
4318 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4319
4320 cpumask_copy(cpus, cpu_active_mask);
4321
4322 /*
4323	 * When the power savings policy is enabled for the parent domain, an
4324	 * idle sibling can pick up load irrespective of the busy siblings. In
4325	 * this case, let the state of the idle sibling percolate up as IDLE
4326	 * instead of portraying it as CPU_NOT_IDLE.
4327 */
4328 if (sd->flags & SD_SHARE_CPUPOWER &&
4329 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4330 sd_idle = 1;
4331
4332 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4333redo:
4334 update_shares_locked(this_rq, sd);
4335 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4336 &sd_idle, cpus, NULL);
4337 if (!group) {
4338 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4339 goto out_balanced;
4340 }
4341
4342 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4343 if (!busiest) {
4344 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4345 goto out_balanced;
4346 }
4347
4348 BUG_ON(busiest == this_rq);
4349
4350 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4351
4352 ld_moved = 0;
4353 if (busiest->nr_running > 1) {
4354 /* Attempt to move tasks */
4355 double_lock_balance(this_rq, busiest);
4356 /* this_rq->clock is already updated */
4357 update_rq_clock(busiest);
4358 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4359 imbalance, sd, CPU_NEWLY_IDLE,
4360 &all_pinned);
4361 double_unlock_balance(this_rq, busiest);
4362
4363 if (unlikely(all_pinned)) {
4364 cpumask_clear_cpu(cpu_of(busiest), cpus);
4365 if (!cpumask_empty(cpus))
4366 goto redo;
4367 }
4368 }
4369
4370 if (!ld_moved) {
4371 int active_balance = 0;
4372
4373 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4374 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4375 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4376 return -1;
4377
4378 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4379 return -1;
4380
4381 if (sd->nr_balance_failed++ < 2)
4382 return -1;
4383
4384 /*
4385		 * The only task running on a non-idle cpu can be moved to this
4386		 * cpu in an attempt to completely free up the other CPU
4387		 * package. The same method used to move tasks in load_balance()
4388		 * has been extended to load_balance_newidle() to speed up
4389		 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2).
4390		 *
4391		 * The package power saving logic comes from
4392		 * find_busiest_group(). If there is no imbalance, then
4393		 * f_b_g() will return NULL. However, when sched_mc={1,2},
4394		 * f_b_g() will select a group from which a running task may be
4395		 * pulled to this cpu in order to make the other package idle.
4396		 * If there is no opportunity to make a package idle and
4397		 * there is no imbalance, then f_b_g() will return NULL and no
4398 * action will be taken in load_balance_newidle().
4399 *
4400 * Under normal task pull operation due to imbalance, there
4401 * will be more than one task in the source run queue and
4402 * move_tasks() will succeed. ld_moved will be true and this
4403 * active balance code will not be triggered.
4404 */
4405
4406 /* Lock busiest in correct order while this_rq is held */
4407 double_lock_balance(this_rq, busiest);
4408
4409 /*
4410 * don't kick the migration_thread, if the curr
4411 * task on busiest cpu can't be moved to this_cpu
4412 */
4413 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4414 double_unlock_balance(this_rq, busiest);
4415 all_pinned = 1;
4416 return ld_moved;
4417 }
4418
4419 if (!busiest->active_balance) {
4420 busiest->active_balance = 1;
4421 busiest->push_cpu = this_cpu;
4422 active_balance = 1;
4423 }
4424
4425 double_unlock_balance(this_rq, busiest);
4426 /*
4427 * Should not call ttwu while holding a rq->lock
4428 */
4429 raw_spin_unlock(&this_rq->lock);
4430 if (active_balance)
4431 wake_up_process(busiest->migration_thread);
4432 raw_spin_lock(&this_rq->lock);
4433
4434 } else
4435 sd->nr_balance_failed = 0;
4436
4437 update_shares_locked(this_rq, sd);
4438 return ld_moved;
4439
4440out_balanced:
4441 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4442 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4443 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4444 return -1;
4445 sd->nr_balance_failed = 0;
4446
4447 return 0;
4448}
4449
4450/*
4451 * idle_balance is called by schedule() if this_cpu is about to become
4452 * idle. Attempts to pull tasks from other CPUs.
4453 */
4454static void idle_balance(int this_cpu, struct rq *this_rq)
4455{
4456 struct sched_domain *sd;
4457 int pulled_task = 0;
4458 unsigned long next_balance = jiffies + HZ;
4459
4460 this_rq->idle_stamp = this_rq->clock;
4461
4462 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4463 return;
4464
4465 for_each_domain(this_cpu, sd) {
4466 unsigned long interval;
4467
4468 if (!(sd->flags & SD_LOAD_BALANCE))
4469 continue;
4470
4471 if (sd->flags & SD_BALANCE_NEWIDLE)
4472 /* If we've pulled tasks over stop searching: */
4473 pulled_task = load_balance_newidle(this_cpu, this_rq,
4474 sd);
4475
4476 interval = msecs_to_jiffies(sd->balance_interval);
4477 if (time_after(next_balance, sd->last_balance + interval))
4478 next_balance = sd->last_balance + interval;
4479 if (pulled_task) {
4480 this_rq->idle_stamp = 0;
4481 break;
4482 }
4483 }
4484 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4485 /*
4486 * We are going idle. next_balance may be set based on
4487 * a busy processor. So reset next_balance.
4488 */
4489 this_rq->next_balance = next_balance;
4490 }
4491}
4492
4493/*
4494 * active_load_balance is run by migration threads. It pushes running tasks
4495 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4496 * running on each physical CPU where possible, and avoids physical /
4497 * logical imbalances.
4498 *
4499 * Called with busiest_rq locked.
4500 */
4501static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4502{
4503 int target_cpu = busiest_rq->push_cpu;
4504 struct sched_domain *sd;
4505 struct rq *target_rq;
4506
4507 /* Is there any task to move? */
4508 if (busiest_rq->nr_running <= 1)
4509 return;
4510
4511 target_rq = cpu_rq(target_cpu);
4512
4513 /*
4514 * This condition is "impossible", if it occurs
4515 * we need to fix it. Originally reported by
4516 * Bjorn Helgaas on a 128-cpu setup.
4517 */
4518 BUG_ON(busiest_rq == target_rq);
4519
4520 /* move a task from busiest_rq to target_rq */
4521 double_lock_balance(busiest_rq, target_rq);
4522 update_rq_clock(busiest_rq);
4523 update_rq_clock(target_rq);
4524
4525 /* Search for an sd spanning us and the target CPU. */
4526 for_each_domain(target_cpu, sd) {
4527 if ((sd->flags & SD_LOAD_BALANCE) &&
4528 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4529 break;
4530 }
4531
4532 if (likely(sd)) {
4533 schedstat_inc(sd, alb_count);
4534
4535 if (move_one_task(target_rq, target_cpu, busiest_rq,
4536 sd, CPU_IDLE))
4537 schedstat_inc(sd, alb_pushed);
4538 else
4539 schedstat_inc(sd, alb_failed);
4540 }
4541 double_unlock_balance(busiest_rq, target_rq);
4542}
4543
4544#ifdef CONFIG_NO_HZ
4545static struct {
4546 atomic_t load_balancer;
4547 cpumask_var_t cpu_mask;
4548 cpumask_var_t ilb_grp_nohz_mask;
4549} nohz ____cacheline_aligned = {
4550 .load_balancer = ATOMIC_INIT(-1),
4551};
4552
4553int get_nohz_load_balancer(void)
4554{
4555 return atomic_read(&nohz.load_balancer);
4556}
4557
4558#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4559/**
4560 * lowest_flag_domain - Return lowest sched_domain containing flag.
4561 * @cpu: The cpu whose lowest level of sched domain is to
4562 * be returned.
4563 * @flag: The flag to check for the lowest sched_domain
4564 * for the given cpu.
4565 *
4566 * Returns the lowest sched_domain of a cpu which contains the given flag.
4567 */
4568static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4569{
4570 struct sched_domain *sd;
4571
4572 for_each_domain(cpu, sd)
4573 if (sd && (sd->flags & flag))
4574 break;
4575
4576 return sd;
4577}
4578
4579/**
4580 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4581 * @cpu: The cpu whose domains we're iterating over.
4582 * @sd: variable holding the value of the power_savings_sd
4583 * for cpu.
4584 * @flag: The flag to filter the sched_domains to be iterated.
4585 *
4586 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4587 * set, starting from the lowest sched_domain to the highest.
4588 */
4589#define for_each_flag_domain(cpu, sd, flag) \
4590 for (sd = lowest_flag_domain(cpu, flag); \
4591 (sd && (sd->flags & flag)); sd = sd->parent)
4592
4593/**
4594 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4595 * @ilb_group: group to be checked for semi-idleness
4596 *
4597 * Returns: 1 if the group is semi-idle. 0 otherwise.
4598 *
4599 * We define a sched_group to be semi-idle if it has at least one idle
4600 * CPU and at least one non-idle CPU. This helper function checks if the
4601 * given sched_group is semi-idle or not.
4602 */
4603static inline int is_semi_idle_group(struct sched_group *ilb_group)
4604{
4605 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4606 sched_group_cpus(ilb_group));
4607
4608 /*
4609	 * A sched_group is semi-idle when it has at least one busy cpu
4610	 * and at least one idle cpu.
4611 */
4612 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4613 return 0;
4614
4615 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4616 return 0;
4617
4618 return 1;
4619}
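
/*
 * The semi-idle test above is just two set operations. The same check,
 * sketched with plain bitmasks standing in for cpumasks (the masks below
 * are arbitrary):
 */
#include <stdio.h>

static int is_semi_idle(unsigned long group_cpus, unsigned long nohz_cpus)
{
	unsigned long idle_in_group = group_cpus & nohz_cpus;

	if (idle_in_group == 0)			/* no idle cpu in the group */
		return 0;
	if (idle_in_group == group_cpus)	/* every cpu in the group is idle */
		return 0;
	return 1;				/* at least one idle and one busy */
}

int main(void)
{
	unsigned long group = 0x0f;	/* cpus 0-3, assumed */
	unsigned long nohz  = 0x06;	/* cpus 1-2 have stopped their tick */

	printf("semi-idle: %d\n", is_semi_idle(group, nohz));
	return 0;
}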
4620/**
4621 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4622 * @cpu: The cpu which is nominating a new idle_load_balancer.
4623 *
4624 * Returns:	the id of the idle load balancer if it exists,
4625 *		else a value >= nr_cpu_ids.
4626 *
4627 * This algorithm picks the idle load balancer such that it belongs to a
4628 * semi-idle powersavings sched_domain. The idea is to avoid using a
4629 * completely idle package/core just for the purpose of idle load balancing
4630 * when there are other idle cpus better suited for that job.
4631 */
4632static int find_new_ilb(int cpu)
4633{
4634 struct sched_domain *sd;
4635 struct sched_group *ilb_group;
4636
4637 /*
4638	 * Pick the idle load balancer from semi-idle packages only
4639	 * when power-aware load balancing is enabled.
4640 */
4641 if (!(sched_smt_power_savings || sched_mc_power_savings))
4642 goto out_done;
4643
4644 /*
4645 * Optimize for the case when we have no idle CPUs or only one
4646 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4647 */
4648 if (cpumask_weight(nohz.cpu_mask) < 2)
4649 goto out_done;
4650
4651 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4652 ilb_group = sd->groups;
4653
4654 do {
4655 if (is_semi_idle_group(ilb_group))
4656 return cpumask_first(nohz.ilb_grp_nohz_mask);
4657
4658 ilb_group = ilb_group->next;
4659
4660 } while (ilb_group != sd->groups);
4661 }
4662
4663out_done:
4664 return cpumask_first(nohz.cpu_mask);
4665}
4666#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4667static inline int find_new_ilb(int call_cpu)
4668{
4669 return cpumask_first(nohz.cpu_mask);
4670}
4671#endif
4672
4673/*
4674 * This routine tries to nominate an ilb (idle load balancing) owner
4675 * among the cpus whose ticks are stopped. The ilb owner does the idle
4676 * load balancing on behalf of all those cpus. If all the cpus in the
4677 * system go into this tickless mode, then there will be no ilb owner
4678 * (as there is no need for one) and all the cpus will sleep until the
4679 * next wakeup event arrives.
4680 *
4681 * The tick is not stopped for the ilb owner. This tick is used for
4682 * idle load balancing, and the ilb owner remains part of
4683 * nohz.cpu_mask.
4684 *
4685 * While stopping the tick, this cpu becomes the ilb owner if there is
4686 * no other owner, and stays the owner until it becomes busy or until
4687 * all cpus in the system stop their ticks, at which point there is no
4688 * need for an ilb owner.
4689 *
4690 * When the ilb owner becomes busy, it nominates another owner during
4691 * the next busy scheduler_tick().
4692 */
4693int select_nohz_load_balancer(int stop_tick)
4694{
4695 int cpu = smp_processor_id();
4696
4697 if (stop_tick) {
4698 cpu_rq(cpu)->in_nohz_recently = 1;
4699
4700 if (!cpu_active(cpu)) {
4701 if (atomic_read(&nohz.load_balancer) != cpu)
4702 return 0;
4703
4704 /*
4705 * If we are going offline and still the leader,
4706 * give up!
4707 */
4708 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4709 BUG();
4710
4711 return 0;
4712 }
4713
4714 cpumask_set_cpu(cpu, nohz.cpu_mask);
4715
4716		/* time for the ilb owner to sleep as well */
4717 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4718 if (atomic_read(&nohz.load_balancer) == cpu)
4719 atomic_set(&nohz.load_balancer, -1);
4720 return 0;
4721 }
4722
4723 if (atomic_read(&nohz.load_balancer) == -1) {
4724 /* make me the ilb owner */
4725 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4726 return 1;
4727 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4728 int new_ilb;
4729
4730 if (!(sched_smt_power_savings ||
4731 sched_mc_power_savings))
4732 return 1;
4733 /*
4734 * Check to see if there is a more power-efficient
4735 * ilb.
4736 */
4737 new_ilb = find_new_ilb(cpu);
4738 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4739 atomic_set(&nohz.load_balancer, -1);
4740 resched_cpu(new_ilb);
4741 return 0;
4742 }
4743 return 1;
4744 }
4745 } else {
4746 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4747 return 0;
4748
4749 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4750
4751 if (atomic_read(&nohz.load_balancer) == cpu)
4752 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4753 BUG();
4754 }
4755 return 0;
4756}
4757#endif
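
/*
 * The owner election in select_nohz_load_balancer() above boils down to a
 * compare-and-swap on one shared slot. Below is a rough C11 userspace
 * analogue of the claim/release protocol, with no real cpus or ticks
 * behind it; it only sketches the atomic hand-over.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int load_balancer = -1;		/* -1 means "no ilb owner" */

static int try_become_ilb(int cpu)
{
	int expected = -1;

	/* succeed only if nobody owns the slot yet */
	return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

static void give_up_ilb(int cpu)
{
	int expected = cpu;

	/* release only if we are still the owner */
	atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}

int main(void)
{
	printf("cpu 2 claims: %d\n", try_become_ilb(2));	/* succeeds */
	printf("cpu 5 claims: %d\n", try_become_ilb(5));	/* fails, owned */
	give_up_ilb(2);
	printf("cpu 5 retries: %d\n", try_become_ilb(5));	/* succeeds */
	return 0;
}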
4758
4759static DEFINE_SPINLOCK(balancing);
4760
4761/*
4762 * Check each scheduling domain to see if it is due to be balanced,
4763 * and initiate a balancing operation if so.
4764 *
4765 * Balancing parameters are set up in arch_init_sched_domains.
4766 */
4767static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4768{
4769 int balance = 1;
4770 struct rq *rq = cpu_rq(cpu);
4771 unsigned long interval;
4772 struct sched_domain *sd;
4773 /* Earliest time when we have to do rebalance again */
4774 unsigned long next_balance = jiffies + 60*HZ;
4775 int update_next_balance = 0;
4776 int need_serialize;
4777
4778 for_each_domain(cpu, sd) {
4779 if (!(sd->flags & SD_LOAD_BALANCE))
4780 continue;
4781
4782 interval = sd->balance_interval;
4783 if (idle != CPU_IDLE)
4784 interval *= sd->busy_factor;
4785
4786 /* scale ms to jiffies */
4787 interval = msecs_to_jiffies(interval);
4788 if (unlikely(!interval))
4789 interval = 1;
4790 if (interval > HZ*NR_CPUS/10)
4791 interval = HZ*NR_CPUS/10;
4792
4793 need_serialize = sd->flags & SD_SERIALIZE;
4794
4795 if (need_serialize) {
4796 if (!spin_trylock(&balancing))
4797 goto out;
4798 }
4799
4800 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4801 if (load_balance(cpu, rq, sd, idle, &balance)) {
4802 /*
4803 * We've pulled tasks over so either we're no
4804 * longer idle, or one of our SMT siblings is
4805 * not idle.
4806 */
4807 idle = CPU_NOT_IDLE;
4808 }
4809 sd->last_balance = jiffies;
4810 }
4811 if (need_serialize)
4812 spin_unlock(&balancing);
4813out:
4814 if (time_after(next_balance, sd->last_balance + interval)) {
4815 next_balance = sd->last_balance + interval;
4816 update_next_balance = 1;
4817 }
4818
4819 /*
4820 * Stop the load balance at this level. There is another
4821 * CPU in our sched group which is doing load balancing more
4822 * actively.
4823 */
4824 if (!balance)
4825 break;
4826 }
4827
4828 /*
4829 * next_balance will be updated only when there is a need.
4830	 * When the cpu is attached to the null domain, for example, it will
4831	 * not be updated.
4832 */
4833 if (likely(update_next_balance))
4834 rq->next_balance = next_balance;
4835}
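
/*
 * The interval handling in rebalance_domains() above is simple saturating
 * arithmetic on milliseconds. A standalone sketch with assumed values for
 * HZ, NR_CPUS and the per-domain parameters:
 */
#include <stdio.h>

#define HZ	250UL		/* assumed tick rate */
#define NR_CPUS	64UL		/* assumed */

static unsigned long msecs_to_jiffies(unsigned long ms)
{
	return (ms * HZ + 999) / 1000;	/* simplified round-up conversion */
}

int main(void)
{
	unsigned long balance_interval = 8;	/* ms, assumed sd value */
	unsigned long busy_factor = 32;		/* assumed sd value */
	int cpu_is_idle = 0;
	unsigned long interval = balance_interval;

	if (!cpu_is_idle)
		interval *= busy_factor;	/* balance less often when busy */

	interval = msecs_to_jiffies(interval);
	if (!interval)
		interval = 1;
	if (interval > HZ * NR_CPUS / 10)	/* global upper clamp */
		interval = HZ * NR_CPUS / 10;

	printf("rebalance due every %lu jiffies\n", interval);
	return 0;
}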
4836
4837/*
4838 * run_rebalance_domains is triggered when needed from the scheduler tick.
4839 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4840 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4841 */
4842static void run_rebalance_domains(struct softirq_action *h)
4843{
4844 int this_cpu = smp_processor_id();
4845 struct rq *this_rq = cpu_rq(this_cpu);
4846 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4847 CPU_IDLE : CPU_NOT_IDLE;
4848
4849 rebalance_domains(this_cpu, idle);
4850
4851#ifdef CONFIG_NO_HZ
4852 /*
4853 * If this cpu is the owner for idle load balancing, then do the
4854 * balancing on behalf of the other idle cpus whose ticks are
4855 * stopped.
4856 */
4857 if (this_rq->idle_at_tick &&
4858 atomic_read(&nohz.load_balancer) == this_cpu) {
4859 struct rq *rq;
4860 int balance_cpu;
4861
4862 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4863 if (balance_cpu == this_cpu)
4864 continue;
4865
4866 /*
4867 * If this cpu gets work to do, stop the load balancing
4868 * work being done for other cpus. Next load
4869 * balancing owner will pick it up.
4870 */
4871 if (need_resched())
4872 break;
4873
4874 rebalance_domains(balance_cpu, CPU_IDLE);
4875
4876 rq = cpu_rq(balance_cpu);
4877 if (time_after(this_rq->next_balance, rq->next_balance))
4878 this_rq->next_balance = rq->next_balance;
4879 }
4880 }
4881#endif
4882}
4883
4884static inline int on_null_domain(int cpu)
4885{
4886 return !rcu_dereference(cpu_rq(cpu)->sd);
4887}
4888
4889/*
4890 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4891 *
4892 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4893 * idle load balancing owner or decide to stop the periodic load balancing,
4894 * if the whole system is idle.
4895 */
4896static inline void trigger_load_balance(struct rq *rq, int cpu)
4897{
4898#ifdef CONFIG_NO_HZ
4899 /*
4900 * If we were in the nohz mode recently and busy at the current
4901	 * scheduler tick, then check if we need to nominate a new idle
4902 * load balancer.
4903 */
4904 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4905 rq->in_nohz_recently = 0;
4906
4907 if (atomic_read(&nohz.load_balancer) == cpu) {
4908 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4909 atomic_set(&nohz.load_balancer, -1);
4910 }
4911
4912 if (atomic_read(&nohz.load_balancer) == -1) {
4913 int ilb = find_new_ilb(cpu);
4914
4915 if (ilb < nr_cpu_ids)
4916 resched_cpu(ilb);
4917 }
4918 }
4919
4920 /*
4921 * If this cpu is idle and doing idle load balancing for all the
4922 * cpus with ticks stopped, is it time for that to stop?
4923 */
4924 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4925 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4926 resched_cpu(cpu);
4927 return;
4928 }
4929
4930 /*
4931 * If this cpu is idle and the idle load balancing is done by
4932	 * someone else, then there is no need to raise the SCHED_SOFTIRQ
4933 */
4934 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4935 cpumask_test_cpu(cpu, nohz.cpu_mask))
4936 return;
4937#endif
4938 /* Don't need to rebalance while attached to NULL domain */
4939 if (time_after_eq(jiffies, rq->next_balance) &&
4940 likely(!on_null_domain(cpu)))
4941 raise_softirq(SCHED_SOFTIRQ);
4942}
4943
4944#else /* CONFIG_SMP */
4945
4946/*
4947 * On UP we do not need to balance between CPUs.
4948 */
4949static inline void idle_balance(int cpu, struct rq *rq)
4950{
4951}
4952
4953#endif 3175#endif
4954 3176
4955DEFINE_PER_CPU(struct kernel_stat, kstat); 3177DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -5568,7 +3790,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5568 * the mutex owner just released it and exited. 3790 * the mutex owner just released it and exited.
5569 */ 3791 */
5570 if (probe_kernel_address(&owner->cpu, cpu)) 3792 if (probe_kernel_address(&owner->cpu, cpu))
5571 goto out; 3793 return 0;
5572#else 3794#else
5573 cpu = owner->cpu; 3795 cpu = owner->cpu;
5574#endif 3796#endif
@@ -5578,14 +3800,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5578 * the cpu field may no longer be valid. 3800 * the cpu field may no longer be valid.
5579 */ 3801 */
5580 if (cpu >= nr_cpumask_bits) 3802 if (cpu >= nr_cpumask_bits)
5581 goto out; 3803 return 0;
5582 3804
5583 /* 3805 /*
5584 * We need to validate that we can do a 3806 * We need to validate that we can do a
5585 * get_cpu() and that we have the percpu area. 3807 * get_cpu() and that we have the percpu area.
5586 */ 3808 */
5587 if (!cpu_online(cpu)) 3809 if (!cpu_online(cpu))
5588 goto out; 3810 return 0;
5589 3811
5590 rq = cpu_rq(cpu); 3812 rq = cpu_rq(cpu);
5591 3813
@@ -5604,7 +3826,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5604 3826
5605 cpu_relax(); 3827 cpu_relax();
5606 } 3828 }
5607out: 3829
5608 return 1; 3830 return 1;
5609} 3831}
5610#endif 3832#endif
@@ -6049,7 +4271,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6049 unsigned long flags; 4271 unsigned long flags;
6050 int oldprio, on_rq, running; 4272 int oldprio, on_rq, running;
6051 struct rq *rq; 4273 struct rq *rq;
6052 const struct sched_class *prev_class = p->sched_class; 4274 const struct sched_class *prev_class;
6053 4275
6054 BUG_ON(prio < 0 || prio > MAX_PRIO); 4276 BUG_ON(prio < 0 || prio > MAX_PRIO);
6055 4277
@@ -6057,6 +4279,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6057 update_rq_clock(rq); 4279 update_rq_clock(rq);
6058 4280
6059 oldprio = p->prio; 4281 oldprio = p->prio;
4282 prev_class = p->sched_class;
6060 on_rq = p->se.on_rq; 4283 on_rq = p->se.on_rq;
6061 running = task_current(rq, p); 4284 running = task_current(rq, p);
6062 if (on_rq) 4285 if (on_rq)
@@ -6074,7 +4297,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6074 if (running) 4297 if (running)
6075 p->sched_class->set_curr_task(rq); 4298 p->sched_class->set_curr_task(rq);
6076 if (on_rq) { 4299 if (on_rq) {
6077 enqueue_task(rq, p, 0); 4300 enqueue_task(rq, p, 0, oldprio < prio);
6078 4301
6079 check_class_changed(rq, p, prev_class, oldprio, running); 4302 check_class_changed(rq, p, prev_class, oldprio, running);
6080 } 4303 }
@@ -6118,7 +4341,7 @@ void set_user_nice(struct task_struct *p, long nice)
6118 delta = p->prio - old_prio; 4341 delta = p->prio - old_prio;
6119 4342
6120 if (on_rq) { 4343 if (on_rq) {
6121 enqueue_task(rq, p, 0); 4344 enqueue_task(rq, p, 0, false);
6122 /* 4345 /*
6123 * If the task increased its priority or is running and 4346 * If the task increased its priority or is running and
6124 * lowered its priority, then reschedule its CPU: 4347 * lowered its priority, then reschedule its CPU:
@@ -6141,7 +4364,7 @@ int can_nice(const struct task_struct *p, const int nice)
6141 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4364 /* convert nice value [19,-20] to rlimit style value [1,40] */
6142 int nice_rlim = 20 - nice; 4365 int nice_rlim = 20 - nice;
6143 4366
6144 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4367 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
6145 capable(CAP_SYS_NICE)); 4368 capable(CAP_SYS_NICE));
6146} 4369}
6147 4370
@@ -6276,7 +4499,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6276{ 4499{
6277 int retval, oldprio, oldpolicy = -1, on_rq, running; 4500 int retval, oldprio, oldpolicy = -1, on_rq, running;
6278 unsigned long flags; 4501 unsigned long flags;
6279 const struct sched_class *prev_class = p->sched_class; 4502 const struct sched_class *prev_class;
6280 struct rq *rq; 4503 struct rq *rq;
6281 int reset_on_fork; 4504 int reset_on_fork;
6282 4505
@@ -6318,7 +4541,7 @@ recheck:
6318 4541
6319 if (!lock_task_sighand(p, &flags)) 4542 if (!lock_task_sighand(p, &flags))
6320 return -ESRCH; 4543 return -ESRCH;
6321 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4544 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
6322 unlock_task_sighand(p, &flags); 4545 unlock_task_sighand(p, &flags);
6323 4546
6324 /* can't set/change the rt policy */ 4547 /* can't set/change the rt policy */
@@ -6390,6 +4613,7 @@ recheck:
6390 p->sched_reset_on_fork = reset_on_fork; 4613 p->sched_reset_on_fork = reset_on_fork;
6391 4614
6392 oldprio = p->prio; 4615 oldprio = p->prio;
4616 prev_class = p->sched_class;
6393 __setscheduler(rq, p, policy, param->sched_priority); 4617 __setscheduler(rq, p, policy, param->sched_priority);
6394 4618
6395 if (running) 4619 if (running)
@@ -6689,7 +4913,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6689 int ret; 4913 int ret;
6690 cpumask_var_t mask; 4914 cpumask_var_t mask;
6691 4915
6692 if (len < cpumask_size()) 4916 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4917 return -EINVAL;
4918 if (len & (sizeof(unsigned long)-1))
6693 return -EINVAL; 4919 return -EINVAL;
6694 4920
6695 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4921 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
@@ -6697,10 +4923,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6697 4923
6698 ret = sched_getaffinity(pid, mask); 4924 ret = sched_getaffinity(pid, mask);
6699 if (ret == 0) { 4925 if (ret == 0) {
6700 if (copy_to_user(user_mask_ptr, mask, cpumask_size())) 4926 size_t retlen = min_t(size_t, len, cpumask_size());
4927
4928 if (copy_to_user(user_mask_ptr, mask, retlen))
6701 ret = -EFAULT; 4929 ret = -EFAULT;
6702 else 4930 else
6703 ret = cpumask_size(); 4931 ret = retlen;
6704 } 4932 }
6705 free_cpumask_var(mask); 4933 free_cpumask_var(mask);
6706 4934
@@ -7140,23 +5368,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7140 struct rq *rq; 5368 struct rq *rq;
7141 int ret = 0; 5369 int ret = 0;
7142 5370
7143 /*
7144 * Since we rely on wake-ups to migrate sleeping tasks, don't change
7145 * the ->cpus_allowed mask from under waking tasks, which would be
7146 * possible when we change rq->lock in ttwu(), so synchronize against
7147 * TASK_WAKING to avoid that.
7148 */
7149again:
7150 while (p->state == TASK_WAKING)
7151 cpu_relax();
7152
7153 rq = task_rq_lock(p, &flags); 5371 rq = task_rq_lock(p, &flags);
7154 5372
7155 if (p->state == TASK_WAKING) {
7156 task_rq_unlock(rq, &flags);
7157 goto again;
7158 }
7159
7160 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5373 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7161 ret = -EINVAL; 5374 ret = -EINVAL;
7162 goto out; 5375 goto out;
@@ -7185,7 +5398,7 @@ again:
7185 5398
7186 get_task_struct(mt); 5399 get_task_struct(mt);
7187 task_rq_unlock(rq, &flags); 5400 task_rq_unlock(rq, &flags);
7188 wake_up_process(rq->migration_thread); 5401 wake_up_process(mt);
7189 put_task_struct(mt); 5402 put_task_struct(mt);
7190 wait_for_completion(&req.done); 5403 wait_for_completion(&req.done);
7191 tlb_migrate_finish(p->mm); 5404 tlb_migrate_finish(p->mm);
@@ -9208,11 +7421,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
9208 7421
9209#ifdef CONFIG_SCHED_MC 7422#ifdef CONFIG_SCHED_MC
9210static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7423static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7424 struct sysdev_class_attribute *attr,
9211 char *page) 7425 char *page)
9212{ 7426{
9213 return sprintf(page, "%u\n", sched_mc_power_savings); 7427 return sprintf(page, "%u\n", sched_mc_power_savings);
9214} 7428}
9215static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7429static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7430 struct sysdev_class_attribute *attr,
9216 const char *buf, size_t count) 7431 const char *buf, size_t count)
9217{ 7432{
9218 return sched_power_savings_store(buf, count, 0); 7433 return sched_power_savings_store(buf, count, 0);
@@ -9224,11 +7439,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
9224 7439
9225#ifdef CONFIG_SCHED_SMT 7440#ifdef CONFIG_SCHED_SMT
9226static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7441static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7442 struct sysdev_class_attribute *attr,
9227 char *page) 7443 char *page)
9228{ 7444{
9229 return sprintf(page, "%u\n", sched_smt_power_savings); 7445 return sprintf(page, "%u\n", sched_smt_power_savings);
9230} 7446}
9231static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7447static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7448 struct sysdev_class_attribute *attr,
9232 const char *buf, size_t count) 7449 const char *buf, size_t count)
9233{ 7450{
9234 return sched_power_savings_store(buf, count, 1); 7451 return sched_power_savings_store(buf, count, 1);
@@ -9443,7 +7660,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9443 tg->rt_rq[cpu] = rt_rq; 7660 tg->rt_rq[cpu] = rt_rq;
9444 init_rt_rq(rt_rq, rq); 7661 init_rt_rq(rt_rq, rq);
9445 rt_rq->tg = tg; 7662 rt_rq->tg = tg;
9446 rt_rq->rt_se = rt_se;
9447 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7663 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9448 if (add) 7664 if (add)
9449 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7665 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9474,9 +7690,6 @@ void __init sched_init(void)
9474#ifdef CONFIG_RT_GROUP_SCHED 7690#ifdef CONFIG_RT_GROUP_SCHED
9475 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7691 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9476#endif 7692#endif
9477#ifdef CONFIG_USER_SCHED
9478 alloc_size *= 2;
9479#endif
9480#ifdef CONFIG_CPUMASK_OFFSTACK 7693#ifdef CONFIG_CPUMASK_OFFSTACK
9481 alloc_size += num_possible_cpus() * cpumask_size(); 7694 alloc_size += num_possible_cpus() * cpumask_size();
9482#endif 7695#endif
@@ -9490,13 +7703,6 @@ void __init sched_init(void)
9490 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7703 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9491 ptr += nr_cpu_ids * sizeof(void **); 7704 ptr += nr_cpu_ids * sizeof(void **);
9492 7705
9493#ifdef CONFIG_USER_SCHED
9494 root_task_group.se = (struct sched_entity **)ptr;
9495 ptr += nr_cpu_ids * sizeof(void **);
9496
9497 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9498 ptr += nr_cpu_ids * sizeof(void **);
9499#endif /* CONFIG_USER_SCHED */
9500#endif /* CONFIG_FAIR_GROUP_SCHED */ 7706#endif /* CONFIG_FAIR_GROUP_SCHED */
9501#ifdef CONFIG_RT_GROUP_SCHED 7707#ifdef CONFIG_RT_GROUP_SCHED
9502 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7708 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9505,13 +7711,6 @@ void __init sched_init(void)
9505 init_task_group.rt_rq = (struct rt_rq **)ptr; 7711 init_task_group.rt_rq = (struct rt_rq **)ptr;
9506 ptr += nr_cpu_ids * sizeof(void **); 7712 ptr += nr_cpu_ids * sizeof(void **);
9507 7713
9508#ifdef CONFIG_USER_SCHED
9509 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9510 ptr += nr_cpu_ids * sizeof(void **);
9511
9512 root_task_group.rt_rq = (struct rt_rq **)ptr;
9513 ptr += nr_cpu_ids * sizeof(void **);
9514#endif /* CONFIG_USER_SCHED */
9515#endif /* CONFIG_RT_GROUP_SCHED */ 7714#endif /* CONFIG_RT_GROUP_SCHED */
9516#ifdef CONFIG_CPUMASK_OFFSTACK 7715#ifdef CONFIG_CPUMASK_OFFSTACK
9517 for_each_possible_cpu(i) { 7716 for_each_possible_cpu(i) {
@@ -9531,22 +7730,13 @@ void __init sched_init(void)
9531#ifdef CONFIG_RT_GROUP_SCHED 7730#ifdef CONFIG_RT_GROUP_SCHED
9532 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7731 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9533 global_rt_period(), global_rt_runtime()); 7732 global_rt_period(), global_rt_runtime());
9534#ifdef CONFIG_USER_SCHED
9535 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9536 global_rt_period(), RUNTIME_INF);
9537#endif /* CONFIG_USER_SCHED */
9538#endif /* CONFIG_RT_GROUP_SCHED */ 7733#endif /* CONFIG_RT_GROUP_SCHED */
9539 7734
9540#ifdef CONFIG_GROUP_SCHED 7735#ifdef CONFIG_CGROUP_SCHED
9541 list_add(&init_task_group.list, &task_groups); 7736 list_add(&init_task_group.list, &task_groups);
9542 INIT_LIST_HEAD(&init_task_group.children); 7737 INIT_LIST_HEAD(&init_task_group.children);
9543 7738
9544#ifdef CONFIG_USER_SCHED 7739#endif /* CONFIG_CGROUP_SCHED */
9545 INIT_LIST_HEAD(&root_task_group.children);
9546 init_task_group.parent = &root_task_group;
9547 list_add(&init_task_group.siblings, &root_task_group.children);
9548#endif /* CONFIG_USER_SCHED */
9549#endif /* CONFIG_GROUP_SCHED */
9550 7740
9551#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7741#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9552 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7742 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9586,25 +7776,6 @@ void __init sched_init(void)
9586 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7776 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9587 */ 7777 */
9588 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7778 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9589#elif defined CONFIG_USER_SCHED
9590 root_task_group.shares = NICE_0_LOAD;
9591 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9592 /*
9593 * In case of task-groups formed thr' the user id of tasks,
9594 * init_task_group represents tasks belonging to root user.
9595 * Hence it forms a sibling of all subsequent groups formed.
9596 * In this case, init_task_group gets only a fraction of overall
9597 * system cpu resource, based on the weight assigned to root
9598 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9599 * by letting tasks of init_task_group sit in a separate cfs_rq
9600 * (init_tg_cfs_rq) and having one entity represent this group of
9601 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9602 */
9603 init_tg_cfs_entry(&init_task_group,
9604 &per_cpu(init_tg_cfs_rq, i),
9605 &per_cpu(init_sched_entity, i), i, 1,
9606 root_task_group.se[i]);
9607
9608#endif 7779#endif
9609#endif /* CONFIG_FAIR_GROUP_SCHED */ 7780#endif /* CONFIG_FAIR_GROUP_SCHED */
9610 7781
@@ -9613,12 +7784,6 @@ void __init sched_init(void)
9613 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7784 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9614#ifdef CONFIG_CGROUP_SCHED 7785#ifdef CONFIG_CGROUP_SCHED
9615 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7786 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9616#elif defined CONFIG_USER_SCHED
9617 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9618 init_tg_rt_entry(&init_task_group,
9619 &per_cpu(init_rt_rq_var, i),
9620 &per_cpu(init_sched_rt_entity, i), i, 1,
9621 root_task_group.rt_se[i]);
9622#endif 7787#endif
9623#endif 7788#endif
9624 7789
@@ -9703,7 +7868,7 @@ static inline int preempt_count_equals(int preempt_offset)
9703 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7868 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9704} 7869}
9705 7870
9706void __might_sleep(char *file, int line, int preempt_offset) 7871void __might_sleep(const char *file, int line, int preempt_offset)
9707{ 7872{
9708#ifdef in_atomic 7873#ifdef in_atomic
9709 static unsigned long prev_jiffy; /* ratelimiting */ 7874 static unsigned long prev_jiffy; /* ratelimiting */
@@ -10014,7 +8179,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
10014} 8179}
10015#endif /* CONFIG_RT_GROUP_SCHED */ 8180#endif /* CONFIG_RT_GROUP_SCHED */
10016 8181
10017#ifdef CONFIG_GROUP_SCHED 8182#ifdef CONFIG_CGROUP_SCHED
10018static void free_sched_group(struct task_group *tg) 8183static void free_sched_group(struct task_group *tg)
10019{ 8184{
10020 free_fair_sched_group(tg); 8185 free_fair_sched_group(tg);
@@ -10119,11 +8284,11 @@ void sched_move_task(struct task_struct *tsk)
10119 if (unlikely(running)) 8284 if (unlikely(running))
10120 tsk->sched_class->set_curr_task(rq); 8285 tsk->sched_class->set_curr_task(rq);
10121 if (on_rq) 8286 if (on_rq)
10122 enqueue_task(rq, tsk, 0); 8287 enqueue_task(rq, tsk, 0, false);
10123 8288
10124 task_rq_unlock(rq, &flags); 8289 task_rq_unlock(rq, &flags);
10125} 8290}
10126#endif /* CONFIG_GROUP_SCHED */ 8291#endif /* CONFIG_CGROUP_SCHED */
10127 8292
10128#ifdef CONFIG_FAIR_GROUP_SCHED 8293#ifdef CONFIG_FAIR_GROUP_SCHED
10129static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8294static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10265,13 +8430,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10265 runtime = d->rt_runtime; 8430 runtime = d->rt_runtime;
10266 } 8431 }
10267 8432
10268#ifdef CONFIG_USER_SCHED
10269 if (tg == &root_task_group) {
10270 period = global_rt_period();
10271 runtime = global_rt_runtime();
10272 }
10273#endif
10274
10275 /* 8433 /*
10276 * Cannot have more runtime than the period. 8434 * Cannot have more runtime than the period.
10277 */ 8435 */
@@ -10674,7 +8832,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
10674struct cpuacct { 8832struct cpuacct {
10675 struct cgroup_subsys_state css; 8833 struct cgroup_subsys_state css;
10676 /* cpuusage holds pointer to a u64-type object on every cpu */ 8834 /* cpuusage holds pointer to a u64-type object on every cpu */
10677 u64 *cpuusage; 8835 u64 __percpu *cpuusage;
10678 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8836 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
10679 struct cpuacct *parent; 8837 struct cpuacct *parent;
10680}; 8838};
@@ -10891,12 +9049,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10891} 9049}
10892 9050
10893/* 9051/*
9052 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9053 * in cputime_t units. As a result, cpuacct_update_stats calls
9054 * percpu_counter_add with values large enough to always overflow the
 9055 * per-cpu batch limit, causing bad SMP scalability.
9056 *
9057 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9058 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9059 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9060 */
9061#ifdef CONFIG_SMP
9062#define CPUACCT_BATCH \
9063 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9064#else
9065#define CPUACCT_BATCH 0
9066#endif
9067
9068/*
10894 * Charge the system/user time to the task's accounting group. 9069 * Charge the system/user time to the task's accounting group.
10895 */ 9070 */
10896static void cpuacct_update_stats(struct task_struct *tsk, 9071static void cpuacct_update_stats(struct task_struct *tsk,
10897 enum cpuacct_stat_index idx, cputime_t val) 9072 enum cpuacct_stat_index idx, cputime_t val)
10898{ 9073{
10899 struct cpuacct *ca; 9074 struct cpuacct *ca;
9075 int batch = CPUACCT_BATCH;
10900 9076
10901 if (unlikely(!cpuacct_subsys.active)) 9077 if (unlikely(!cpuacct_subsys.active))
10902 return; 9078 return;
@@ -10905,7 +9081,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10905 ca = task_ca(tsk); 9081 ca = task_ca(tsk);
10906 9082
10907 do { 9083 do {
10908 percpu_counter_add(&ca->cpustat[idx], val); 9084 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10909 ca = ca->parent; 9085 ca = ca->parent;
10910 } while (ca); 9086 } while (ca);
10911 rcu_read_unlock(); 9087 rcu_read_unlock();
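The batch clamp introduced above keeps __percpu_counter_add() batching comparable whether or not CONFIG_VIRT_CPU_ACCOUNTING inflates one jiffy. A minimal userspace sketch of the arithmetic; the values given to percpu_counter_batch and cputime_one_jiffy below are illustrative assumptions, not values taken from this tree:

	#include <limits.h>
	#include <stdio.h>

	/* Stand-ins for the kernel symbols: percpu_counter_batch roughly
	 * scales with the CPU count, and cputime_one_jiffy is typically 1
	 * unless CONFIG_VIRT_CPU_ACCOUNTING makes cputime_t fine-grained. */
	static long percpu_counter_batch = 32;     /* assumed */
	static long cputime_one_jiffy = 10000000;  /* assumed, fine-grained cputime */

	int main(void)
	{
		long batch = percpu_counter_batch * cputime_one_jiffy;

		if (batch > INT_MAX)
			batch = INT_MAX;   /* same clamp as min_t(long, ..., INT_MAX) */

		printf("effective cpuacct batch: %ld\n", batch);
		return 0;
	}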
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 597b33099dfa..e6871cb3fc83 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -27,6 +27,7 @@
27 * of the License. 27 * of the License.
28 */ 28 */
29 29
30#include <linux/gfp.h>
30#include "sched_cpupri.h" 31#include "sched_cpupri.h"
31 32
32/* Convert between a 140 based task->prio, and our 102 based cpupri */ 33/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -47,9 +48,7 @@ static int convert_prio(int prio)
47} 48}
48 49
49#define for_each_cpupri_active(array, idx) \ 50#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ 51 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53 52
54/** 53/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system 54 * cpupri_find - find the best (lowest-pri) CPU in the system
@@ -58,7 +57,7 @@ static int convert_prio(int prio)
58 * @lowest_mask: A mask to fill in with selected CPUs (or NULL) 57 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
59 * 58 *
60 * Note: This function returns the recommended CPUs as calculated during the 59 * Note: This function returns the recommended CPUs as calculated during the
61 * current invokation. By the time the call returns, the CPUs may have in 60 * current invocation. By the time the call returns, the CPUs may have in
62 * fact changed priorities any number of times. While not ideal, it is not 61 * fact changed priorities any number of times. While not ideal, it is not
63 * an issue of correctness since the normal rebalancer logic will correct 62 * an issue of correctness since the normal rebalancer logic will correct
64 * any discrepancies created by racing against the uncertainty of the current 63 * any discrepancies created by racing against the uncertainty of the current
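The for_each_cpupri_active() change above only replaces the open-coded find_first_bit()/find_next_bit() loop with for_each_set_bit(), which visits the indices of set bits in ascending order. A rough userspace approximation of that iteration (not the kernel macro itself):

	#include <stdio.h>

	int main(void)
	{
		unsigned long active = 0x15;	/* example word: bits 0, 2 and 4 set */
		unsigned int idx;

		/* Visit every set bit in ascending order, which is what both the
		 * old find_first_bit/find_next_bit loop and for_each_set_bit() do. */
		for (idx = 0; idx < 8 * sizeof(active); idx++) {
			if (!(active & (1UL << idx)))
				continue;
			printf("priority index %u is active\n", idx);
		}
		return 0;
	}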
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 67f95aada4b9..19be00ba6123 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
114 { 114 {
115 char path[64]; 115 char path[64];
116 116
117 rcu_read_lock();
117 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); 118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
118 SEQ_printf(m, " %s", path); 120 SEQ_printf(m, " %s", path);
119 } 121 }
120#endif 122#endif
@@ -518,8 +520,4 @@ void proc_sched_set_task(struct task_struct *p)
518 p->se.nr_wakeups_idle = 0; 520 p->se.nr_wakeups_idle = 0;
519 p->sched_info.bkl_count = 0; 521 p->sched_info.bkl_count = 0;
520#endif 522#endif
521 p->se.sum_exec_runtime = 0;
522 p->se.prev_sum_exec_runtime = 0;
523 p->nvcsw = 0;
524 p->nivcsw = 0;
525} 523}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8fe7ee81c552..5a5ea2cd924f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1053,7 +1053,8 @@ static inline void hrtick_update(struct rq *rq)
1053 * increased. Here we update the fair scheduling stats and 1053 * increased. Here we update the fair scheduling stats and
1054 * then put the task into the rbtree: 1054 * then put the task into the rbtree:
1055 */ 1055 */
1056static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 1056static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1057{ 1058{
1058 struct cfs_rq *cfs_rq; 1059 struct cfs_rq *cfs_rq;
1059 struct sched_entity *se = &p->se; 1060 struct sched_entity *se = &p->se;
@@ -1815,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1815 */ 1816 */
1816 1817
1817/* 1818/*
1818 * Load-balancing iterator. Note: while the runqueue stays locked 1819 * pull_task - move a task from a remote runqueue to the local runqueue.
1819 * during the whole iteration, the current task might be 1820 * Both runqueues must be locked.
1820 * dequeued so the iterator has to be dequeue-safe. Here we
1821 * achieve that by always pre-iterating before returning
1822 * the current task:
1823 */ 1821 */
1824static struct task_struct * 1822static void pull_task(struct rq *src_rq, struct task_struct *p,
1825__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) 1823 struct rq *this_rq, int this_cpu)
1826{ 1824{
1827 struct task_struct *p = NULL; 1825 deactivate_task(src_rq, p, 0);
1828 struct sched_entity *se; 1826 set_task_cpu(p, this_cpu);
1827 activate_task(this_rq, p, 0);
1828 check_preempt_curr(this_rq, p, 0);
1829}
1829 1830
1830 if (next == &cfs_rq->tasks) 1831/*
1831 return NULL; 1832 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1833 */
1834static
1835int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1836 struct sched_domain *sd, enum cpu_idle_type idle,
1837 int *all_pinned)
1838{
1839 int tsk_cache_hot = 0;
1840 /*
1841 * We do not migrate tasks that are:
1842 * 1) running (obviously), or
1843 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1844 * 3) are cache-hot on their current CPU.
1845 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine);
1848 return 0;
1849 }
1850 *all_pinned = 0;
1832 1851
1833 se = list_entry(next, struct sched_entity, group_node); 1852 if (task_running(rq, p)) {
1834 p = task_of(se); 1853 schedstat_inc(p, se.nr_failed_migrations_running);
1835 cfs_rq->balance_iterator = next->next; 1854 return 0;
1855 }
1836 1856
1837 return p; 1857 /*
1838} 1858 * Aggressive migration if:
1859 * 1) task is cache cold, or
1860 * 2) too many balance attempts have failed.
1861 */
1839 1862
1840static struct task_struct *load_balance_start_fair(void *arg) 1863 tsk_cache_hot = task_hot(p, rq->clock, sd);
1841{ 1864 if (!tsk_cache_hot ||
1842 struct cfs_rq *cfs_rq = arg; 1865 sd->nr_balance_failed > sd->cache_nice_tries) {
1866#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations);
1870 }
1871#endif
1872 return 1;
1873 }
1843 1874
1844 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); 1875 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot);
1877 return 0;
1878 }
1879 return 1;
1845} 1880}
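The can_migrate_task() rules added above (skip pinned tasks, skip the running task, and only move cache-hot tasks once enough balance attempts have failed) can be summarised in a small self-contained decision function. The field and parameter names here are simplified stand-ins, not the kernel structures:

	#include <stdbool.h>
	#include <stdio.h>

	struct fake_task {
		bool allowed_on_dst;	/* stands in for cpumask_test_cpu(dst, cpus_allowed) */
		bool running;		/* stands in for task_running() */
		bool cache_hot;		/* stands in for task_hot() */
	};

	/* Mirrors the order of checks in can_migrate_task(): affinity first,
	 * then the currently running task, then cache hotness weighed against
	 * the number of failed balance attempts. */
	static bool may_migrate(const struct fake_task *p, int balance_failures,
				int cache_nice_tries)
	{
		if (!p->allowed_on_dst)
			return false;
		if (p->running)
			return false;
		if (!p->cache_hot || balance_failures > cache_nice_tries)
			return true;		/* cold task, or time to be aggressive */
		return false;			/* hot task, not desperate yet */
	}

	int main(void)
	{
		struct fake_task t = { .allowed_on_dst = true, .running = false,
				       .cache_hot = true };

		printf("migrate hot task, 1 failure:  %d\n", may_migrate(&t, 1, 3));
		printf("migrate hot task, 5 failures: %d\n", may_migrate(&t, 5, 3));
		return 0;
	}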
1846 1881
1847static struct task_struct *load_balance_next_fair(void *arg) 1882/*
1883 * move_one_task tries to move exactly one task from busiest to this_rq, as
1884 * part of active balancing operations within "domain".
1885 * Returns 1 if successful and 0 otherwise.
1886 *
1887 * Called with both runqueues locked.
1888 */
1889static int
1890move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1891 struct sched_domain *sd, enum cpu_idle_type idle)
1848{ 1892{
1849 struct cfs_rq *cfs_rq = arg; 1893 struct task_struct *p, *n;
1894 struct cfs_rq *cfs_rq;
1895 int pinned = 0;
1896
1897 for_each_leaf_cfs_rq(busiest, cfs_rq) {
1898 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
1899
1900 if (!can_migrate_task(p, busiest, this_cpu,
1901 sd, idle, &pinned))
1902 continue;
1850 1903
1851 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1904 pull_task(busiest, p, this_rq, this_cpu);
1905 /*
1906 * Right now, this is only the second place pull_task()
1907 * is called, so we can safely collect pull_task()
1908 * stats here rather than inside pull_task().
1909 */
1910 schedstat_inc(sd, lb_gained[idle]);
1911 return 1;
1912 }
1913 }
1914
1915 return 0;
1852} 1916}
1853 1917
1854static unsigned long 1918static unsigned long
1855__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1919balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1856 unsigned long max_load_move, struct sched_domain *sd, 1920 unsigned long max_load_move, struct sched_domain *sd,
1857 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, 1921 enum cpu_idle_type idle, int *all_pinned,
1858 struct cfs_rq *cfs_rq) 1922 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
1859{ 1923{
1860 struct rq_iterator cfs_rq_iterator; 1924 int loops = 0, pulled = 0, pinned = 0;
1925 long rem_load_move = max_load_move;
1926 struct task_struct *p, *n;
1861 1927
1862 cfs_rq_iterator.start = load_balance_start_fair; 1928 if (max_load_move == 0)
1863 cfs_rq_iterator.next = load_balance_next_fair; 1929 goto out;
1864 cfs_rq_iterator.arg = cfs_rq;
1865 1930
1866 return balance_tasks(this_rq, this_cpu, busiest, 1931 pinned = 1;
1867 max_load_move, sd, idle, all_pinned, 1932
1868 this_best_prio, &cfs_rq_iterator); 1933 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1934 if (loops++ > sysctl_sched_nr_migrate)
1935 break;
1936
1937 if ((p->se.load.weight >> 1) > rem_load_move ||
1938 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
1939 continue;
1940
1941 pull_task(busiest, p, this_rq, this_cpu);
1942 pulled++;
1943 rem_load_move -= p->se.load.weight;
1944
1945#ifdef CONFIG_PREEMPT
1946 /*
1947 * NEWIDLE balancing is a source of latency, so preemptible
1948 * kernels will stop after the first task is pulled to minimize
1949 * the critical section.
1950 */
1951 if (idle == CPU_NEWLY_IDLE)
1952 break;
1953#endif
1954
1955 /*
1956 * We only want to steal up to the prescribed amount of
1957 * weighted load.
1958 */
1959 if (rem_load_move <= 0)
1960 break;
1961
1962 if (p->prio < *this_best_prio)
1963 *this_best_prio = p->prio;
1964 }
1965out:
1966 /*
1967 * Right now, this is one of only two places pull_task() is called,
1968 * so we can safely collect pull_task() stats here rather than
1969 * inside pull_task().
1970 */
1971 schedstat_add(sd, lb_gained[idle], pulled);
1972
1973 if (all_pinned)
1974 *all_pinned = pinned;
1975
1976 return max_load_move - rem_load_move;
1869} 1977}
1870 1978
1871#ifdef CONFIG_FAIR_GROUP_SCHED 1979#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1897,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1897 rem_load = (u64)rem_load_move * busiest_weight; 2005 rem_load = (u64)rem_load_move * busiest_weight;
1898 rem_load = div_u64(rem_load, busiest_h_load + 1); 2006 rem_load = div_u64(rem_load, busiest_h_load + 1);
1899 2007
1900 moved_load = __load_balance_fair(this_rq, this_cpu, busiest, 2008 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1901 rem_load, sd, idle, all_pinned, this_best_prio, 2009 rem_load, sd, idle, all_pinned, this_best_prio,
1902 tg->cfs_rq[busiest_cpu]); 2010 busiest_cfs_rq);
1903 2011
1904 if (!moved_load) 2012 if (!moved_load)
1905 continue; 2013 continue;
@@ -1922,35 +2030,1509 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1922 struct sched_domain *sd, enum cpu_idle_type idle, 2030 struct sched_domain *sd, enum cpu_idle_type idle,
1923 int *all_pinned, int *this_best_prio) 2031 int *all_pinned, int *this_best_prio)
1924{ 2032{
1925 return __load_balance_fair(this_rq, this_cpu, busiest, 2033 return balance_tasks(this_rq, this_cpu, busiest,
1926 max_load_move, sd, idle, all_pinned, 2034 max_load_move, sd, idle, all_pinned,
1927 this_best_prio, &busiest->cfs); 2035 this_best_prio, &busiest->cfs);
1928} 2036}
1929#endif 2037#endif
1930 2038
1931static int 2039/*
1932move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2040 * move_tasks tries to move up to max_load_move weighted load from busiest to
1933 struct sched_domain *sd, enum cpu_idle_type idle) 2041 * this_rq, as part of a balancing operation within domain "sd".
2042 * Returns 1 if successful and 0 otherwise.
2043 *
2044 * Called with both runqueues locked.
2045 */
2046static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2047 unsigned long max_load_move,
2048 struct sched_domain *sd, enum cpu_idle_type idle,
2049 int *all_pinned)
1934{ 2050{
1935 struct cfs_rq *busy_cfs_rq; 2051 unsigned long total_load_moved = 0, load_moved;
1936 struct rq_iterator cfs_rq_iterator; 2052 int this_best_prio = this_rq->curr->prio;
1937 2053
1938 cfs_rq_iterator.start = load_balance_start_fair; 2054 do {
1939 cfs_rq_iterator.next = load_balance_next_fair; 2055 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2056 max_load_move - total_load_moved,
2057 sd, idle, all_pinned, &this_best_prio);
1940 2058
1941 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 2059 total_load_moved += load_moved;
2060
2061#ifdef CONFIG_PREEMPT
1942 /* 2062 /*
1943 * pass busy_cfs_rq argument into 2063 * NEWIDLE balancing is a source of latency, so preemptible
1944 * load_balance_[start|next]_fair iterators 2064 * kernels will stop after the first task is pulled to minimize
2065 * the critical section.
1945 */ 2066 */
1946 cfs_rq_iterator.arg = busy_cfs_rq; 2067 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
1947 if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 2068 break;
1948 &cfs_rq_iterator)) 2069
1949 return 1; 2070 if (raw_spin_is_contended(&this_rq->lock) ||
2071 raw_spin_is_contended(&busiest->lock))
2072 break;
2073#endif
2074 } while (load_moved && max_load_move > total_load_moved);
2075
2076 return total_load_moved > 0;
2077}
2078
2079/********** Helpers for find_busiest_group ************************/
2080/*
2081 * sd_lb_stats - Structure to store the statistics of a sched_domain
2082 * during load balancing.
2083 */
2084struct sd_lb_stats {
2085 struct sched_group *busiest; /* Busiest group in this sd */
2086 struct sched_group *this; /* Local group in this sd */
2087 unsigned long total_load; /* Total load of all groups in sd */
2088 unsigned long total_pwr; /* Total power of all groups in sd */
2089 unsigned long avg_load; /* Average load across all groups in sd */
2090
2091 /** Statistics of this group */
2092 unsigned long this_load;
2093 unsigned long this_load_per_task;
2094 unsigned long this_nr_running;
2095
2096 /* Statistics of the busiest group */
2097 unsigned long max_load;
2098 unsigned long busiest_load_per_task;
2099 unsigned long busiest_nr_running;
2100 unsigned long busiest_group_capacity;
2101
2102 int group_imb; /* Is there imbalance in this sd */
2103#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2104 int power_savings_balance; /* Is powersave balance needed for this sd */
2105 struct sched_group *group_min; /* Least loaded group in sd */
2106 struct sched_group *group_leader; /* Group which relieves group_min */
2107 unsigned long min_load_per_task; /* load_per_task in group_min */
2108 unsigned long leader_nr_running; /* Nr running of group_leader */
2109 unsigned long min_nr_running; /* Nr running of group_min */
2110#endif
2111};
2112
2113/*
2114 * sg_lb_stats - stats of a sched_group required for load_balancing
2115 */
2116struct sg_lb_stats {
2117 unsigned long avg_load; /*Avg load across the CPUs of the group */
2118 unsigned long group_load; /* Total load over the CPUs of the group */
2119 unsigned long sum_nr_running; /* Nr tasks running in the group */
2120 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2121 unsigned long group_capacity;
2122 int group_imb; /* Is there an imbalance in the group ? */
2123};
2124
2125/**
2126 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
2127 * @group: The group whose first cpu is to be returned.
2128 */
2129static inline unsigned int group_first_cpu(struct sched_group *group)
2130{
2131 return cpumask_first(sched_group_cpus(group));
2132}
2133
2134/**
2135 * get_sd_load_idx - Obtain the load index for a given sched domain.
2136 * @sd: The sched_domain whose load_idx is to be obtained.
 2137 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
2138 */
2139static inline int get_sd_load_idx(struct sched_domain *sd,
2140 enum cpu_idle_type idle)
2141{
2142 int load_idx;
2143
2144 switch (idle) {
2145 case CPU_NOT_IDLE:
2146 load_idx = sd->busy_idx;
2147 break;
2148
2149 case CPU_NEWLY_IDLE:
2150 load_idx = sd->newidle_idx;
2151 break;
2152 default:
2153 load_idx = sd->idle_idx;
2154 break;
1950 } 2155 }
1951 2156
2157 return load_idx;
2158}
2159
2160
2161#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2162/**
2163 * init_sd_power_savings_stats - Initialize power savings statistics for
2164 * the given sched_domain, during load balancing.
2165 *
2166 * @sd: Sched domain whose power-savings statistics are to be initialized.
2167 * @sds: Variable containing the statistics for sd.
2168 * @idle: Idle status of the CPU at which we're performing load-balancing.
2169 */
2170static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2171 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2172{
2173 /*
2174 * Busy processors will not participate in power savings
2175 * balance.
2176 */
2177 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2178 sds->power_savings_balance = 0;
2179 else {
2180 sds->power_savings_balance = 1;
2181 sds->min_nr_running = ULONG_MAX;
2182 sds->leader_nr_running = 0;
2183 }
2184}
2185
2186/**
2187 * update_sd_power_savings_stats - Update the power saving stats for a
2188 * sched_domain while performing load balancing.
2189 *
2190 * @group: sched_group belonging to the sched_domain under consideration.
2191 * @sds: Variable containing the statistics of the sched_domain
2192 * @local_group: Does group contain the CPU for which we're performing
2193 * load balancing ?
2194 * @sgs: Variable containing the statistics of the group.
2195 */
2196static inline void update_sd_power_savings_stats(struct sched_group *group,
2197 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2198{
2199
2200 if (!sds->power_savings_balance)
2201 return;
2202
2203 /*
2204 * If the local group is idle or completely loaded
2205 * no need to do power savings balance at this domain
2206 */
2207 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2208 !sds->this_nr_running))
2209 sds->power_savings_balance = 0;
2210
2211 /*
2212 * If a group is already running at full capacity or idle,
2213 * don't include that group in power savings calculations
2214 */
2215 if (!sds->power_savings_balance ||
2216 sgs->sum_nr_running >= sgs->group_capacity ||
2217 !sgs->sum_nr_running)
2218 return;
2219
2220 /*
2221 * Calculate the group which has the least non-idle load.
2222 * This is the group from where we need to pick up the load
2223 * for saving power
2224 */
2225 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2226 (sgs->sum_nr_running == sds->min_nr_running &&
2227 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2228 sds->group_min = group;
2229 sds->min_nr_running = sgs->sum_nr_running;
2230 sds->min_load_per_task = sgs->sum_weighted_load /
2231 sgs->sum_nr_running;
2232 }
2233
2234 /*
 2235 * Calculate the group which is nearly at its
 2236 * capacity but still has some room to pick up load
 2237 * from another group and save more power
2238 */
2239 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2240 return;
2241
2242 if (sgs->sum_nr_running > sds->leader_nr_running ||
2243 (sgs->sum_nr_running == sds->leader_nr_running &&
2244 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2245 sds->group_leader = group;
2246 sds->leader_nr_running = sgs->sum_nr_running;
2247 }
2248}
2249
2250/**
2251 * check_power_save_busiest_group - see if there is potential for some power-savings balance
2252 * @sds: Variable containing the statistics of the sched_domain
2253 * under consideration.
2254 * @this_cpu: Cpu at which we're currently performing load-balancing.
2255 * @imbalance: Variable to store the imbalance.
2256 *
2257 * Description:
2258 * Check if we have potential to perform some power-savings balance.
2259 * If yes, set the busiest group to be the least loaded group in the
 2260 * sched_domain, so that its CPUs can be put to idle.
2261 *
2262 * Returns 1 if there is potential to perform power-savings balance.
2263 * Else returns 0.
2264 */
2265static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2266 int this_cpu, unsigned long *imbalance)
2267{
2268 if (!sds->power_savings_balance)
2269 return 0;
2270
2271 if (sds->this != sds->group_leader ||
2272 sds->group_leader == sds->group_min)
2273 return 0;
2274
2275 *imbalance = sds->min_load_per_task;
2276 sds->busiest = sds->group_min;
2277
2278 return 1;
2279
2280}
2281#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2282static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2283 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2284{
2285 return;
2286}
2287
2288static inline void update_sd_power_savings_stats(struct sched_group *group,
2289 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2290{
2291 return;
2292}
2293
2294static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2295 int this_cpu, unsigned long *imbalance)
2296{
1952 return 0; 2297 return 0;
1953} 2298}
2299#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2300
2301
2302unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2303{
2304 return SCHED_LOAD_SCALE;
2305}
2306
2307unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2308{
2309 return default_scale_freq_power(sd, cpu);
2310}
2311
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2315 unsigned long smt_gain = sd->smt_gain;
2316
2317 smt_gain /= weight;
2318
2319 return smt_gain;
2320}
2321
2322unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2323{
2324 return default_scale_smt_power(sd, cpu);
2325}
2326
2327unsigned long scale_rt_power(int cpu)
2328{
2329 struct rq *rq = cpu_rq(cpu);
2330 u64 total, available;
2331
2332 sched_avg_update(rq);
2333
2334 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2335 available = total - rq->rt_avg;
2336
2337 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2338 total = SCHED_LOAD_SCALE;
2339
2340 total >>= SCHED_LOAD_SHIFT;
2341
2342 return div_u64(available, total);
2343}
2344
2345static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2348 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups;
2350
2351 if (sched_feat(ARCH_POWER))
2352 power *= arch_scale_freq_power(sd, cpu);
2353 else
2354 power *= default_scale_freq_power(sd, cpu);
2355
2356 power >>= SCHED_LOAD_SHIFT;
2357
2358 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2359 if (sched_feat(ARCH_POWER))
2360 power *= arch_scale_smt_power(sd, cpu);
2361 else
2362 power *= default_scale_smt_power(sd, cpu);
2363
2364 power >>= SCHED_LOAD_SHIFT;
2365 }
2366
2367 power *= scale_rt_power(cpu);
2368 power >>= SCHED_LOAD_SHIFT;
2369
2370 if (!power)
2371 power = 1;
2372
2373 sdg->cpu_power = power;
2374}
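update_cpu_power() above composes several scaling factors, each expressed relative to SCHED_LOAD_SCALE (1024). A worked example with assumed inputs (frequency scaling neutral, one sibling of a 2-thread SMT core with an assumed smt_gain of 1178, and roughly 10% of the CPU consumed by RT tasks):

	#include <stdio.h>

	#define SCHED_LOAD_SHIFT 10
	#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

	int main(void)
	{
		/* Assumed inputs for one SMT sibling of a 2-thread core. */
		unsigned long freq_scale = SCHED_LOAD_SCALE;	/* arch reports no freq capping */
		unsigned long smt_gain = 1178;			/* assumed per-core SMT gain */
		unsigned long weight = 2;			/* threads sharing the core */
		unsigned long rt_scale = 922;			/* ~90% left after RT, in 1/1024ths */

		unsigned long power = SCHED_LOAD_SCALE;

		power = (power * freq_scale) >> SCHED_LOAD_SHIFT;	   /* frequency factor */
		power = (power * (smt_gain / weight)) >> SCHED_LOAD_SHIFT; /* SMT factor */
		power = (power * rt_scale) >> SCHED_LOAD_SHIFT;		   /* RT-time factor */

		if (!power)
			power = 1;

		printf("cpu_power = %lu (vs. SCHED_LOAD_SCALE = %lu)\n",
		       power, SCHED_LOAD_SCALE);
		return 0;
	}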
2375
2376static void update_group_power(struct sched_domain *sd, int cpu)
2377{
2378 struct sched_domain *child = sd->child;
2379 struct sched_group *group, *sdg = sd->groups;
2380 unsigned long power;
2381
2382 if (!child) {
2383 update_cpu_power(sd, cpu);
2384 return;
2385 }
2386
2387 power = 0;
2388
2389 group = child->groups;
2390 do {
2391 power += group->cpu_power;
2392 group = group->next;
2393 } while (group != child->groups);
2394
2395 sdg->cpu_power = power;
2396}
2397
2398/**
2399 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2400 * @sd: The sched_domain whose statistics are to be updated.
2401 * @group: sched_group whose statistics are to be updated.
2402 * @this_cpu: Cpu for which load balance is currently performed.
2403 * @idle: Idle status of this_cpu
2404 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2405 * @sd_idle: Idle status of the sched_domain containing group.
2406 * @local_group: Does group contain this_cpu.
2407 * @cpus: Set of cpus considered for load balancing.
2408 * @balance: Should we balance.
2409 * @sgs: variable to hold the statistics for this group.
2410 */
2411static inline void update_sg_lb_stats(struct sched_domain *sd,
2412 struct sched_group *group, int this_cpu,
2413 enum cpu_idle_type idle, int load_idx, int *sd_idle,
2414 int local_group, const struct cpumask *cpus,
2415 int *balance, struct sg_lb_stats *sgs)
2416{
2417 unsigned long load, max_cpu_load, min_cpu_load;
2418 int i;
2419 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2420 unsigned long avg_load_per_task = 0;
2421
2422 if (local_group)
2423 balance_cpu = group_first_cpu(group);
2424
2425 /* Tally up the load of all CPUs in the group */
2426 max_cpu_load = 0;
2427 min_cpu_load = ~0UL;
2428
2429 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2430 struct rq *rq = cpu_rq(i);
2431
2432 if (*sd_idle && rq->nr_running)
2433 *sd_idle = 0;
2434
2435 /* Bias balancing toward cpus of our domain */
2436 if (local_group) {
2437 if (idle_cpu(i) && !first_idle_cpu) {
2438 first_idle_cpu = 1;
2439 balance_cpu = i;
2440 }
2441
2442 load = target_load(i, load_idx);
2443 } else {
2444 load = source_load(i, load_idx);
2445 if (load > max_cpu_load)
2446 max_cpu_load = load;
2447 if (min_cpu_load > load)
2448 min_cpu_load = load;
2449 }
2450
2451 sgs->group_load += load;
2452 sgs->sum_nr_running += rq->nr_running;
2453 sgs->sum_weighted_load += weighted_cpuload(i);
2454
2455 }
2456
2457 /*
 2458 * The first idle CPU or the first CPU (busiest) in this sched group
 2459 * is eligible for doing load balancing at this and above
 2460 * domains. In the newly idle case, we will allow all the CPUs
2461 * to do the newly idle load balance.
2462 */
2463 if (idle != CPU_NEWLY_IDLE && local_group &&
2464 balance_cpu != this_cpu) {
2465 *balance = 0;
2466 return;
2467 }
2468
2469 update_group_power(sd, this_cpu);
2470
2471 /* Adjust by relative CPU power of the group */
2472 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2473
2474 /*
2475 * Consider the group unbalanced when the imbalance is larger
2476 * than the average weight of two tasks.
2477 *
2478 * APZ: with cgroup the avg task weight can vary wildly and
2479 * might not be a suitable number - should we keep a
2480 * normalized nr_running number somewhere that negates
2481 * the hierarchy?
2482 */
2483 if (sgs->sum_nr_running)
2484 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2485
2486 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
2487 sgs->group_imb = 1;
2488
2489 sgs->group_capacity =
2490 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2491}
2492
2493/**
 2494 * update_sd_lb_stats - Update the sched_domain's statistics for load balancing.
2495 * @sd: sched_domain whose statistics are to be updated.
2496 * @this_cpu: Cpu for which load balance is currently performed.
2497 * @idle: Idle status of this_cpu
2498 * @sd_idle: Idle status of the sched_domain containing group.
2499 * @cpus: Set of cpus considered for load balancing.
2500 * @balance: Should we balance.
2501 * @sds: variable to hold the statistics for this sched_domain.
2502 */
2503static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2504 enum cpu_idle_type idle, int *sd_idle,
2505 const struct cpumask *cpus, int *balance,
2506 struct sd_lb_stats *sds)
2507{
2508 struct sched_domain *child = sd->child;
2509 struct sched_group *group = sd->groups;
2510 struct sg_lb_stats sgs;
2511 int load_idx, prefer_sibling = 0;
2512
2513 if (child && child->flags & SD_PREFER_SIBLING)
2514 prefer_sibling = 1;
2515
2516 init_sd_power_savings_stats(sd, sds, idle);
2517 load_idx = get_sd_load_idx(sd, idle);
2518
2519 do {
2520 int local_group;
2521
2522 local_group = cpumask_test_cpu(this_cpu,
2523 sched_group_cpus(group));
2524 memset(&sgs, 0, sizeof(sgs));
2525 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
2526 local_group, cpus, balance, &sgs);
2527
2528 if (local_group && !(*balance))
2529 return;
2530
2531 sds->total_load += sgs.group_load;
2532 sds->total_pwr += group->cpu_power;
2533
2534 /*
2535 * In case the child domain prefers tasks go to siblings
2536 * first, lower the group capacity to one so that we'll try
2537 * and move all the excess tasks away.
2538 */
2539 if (prefer_sibling)
2540 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2541
2542 if (local_group) {
2543 sds->this_load = sgs.avg_load;
2544 sds->this = group;
2545 sds->this_nr_running = sgs.sum_nr_running;
2546 sds->this_load_per_task = sgs.sum_weighted_load;
2547 } else if (sgs.avg_load > sds->max_load &&
2548 (sgs.sum_nr_running > sgs.group_capacity ||
2549 sgs.group_imb)) {
2550 sds->max_load = sgs.avg_load;
2551 sds->busiest = group;
2552 sds->busiest_nr_running = sgs.sum_nr_running;
2553 sds->busiest_group_capacity = sgs.group_capacity;
2554 sds->busiest_load_per_task = sgs.sum_weighted_load;
2555 sds->group_imb = sgs.group_imb;
2556 }
2557
2558 update_sd_power_savings_stats(group, sds, local_group, &sgs);
2559 group = group->next;
2560 } while (group != sd->groups);
2561}
2562
2563/**
2564 * fix_small_imbalance - Calculate the minor imbalance that exists
2565 * amongst the groups of a sched_domain, during
2566 * load balancing.
2567 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
2568 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2569 * @imbalance: Variable to store the imbalance.
2570 */
2571static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2572 int this_cpu, unsigned long *imbalance)
2573{
2574 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2575 unsigned int imbn = 2;
2576 unsigned long scaled_busy_load_per_task;
2577
2578 if (sds->this_nr_running) {
2579 sds->this_load_per_task /= sds->this_nr_running;
2580 if (sds->busiest_load_per_task >
2581 sds->this_load_per_task)
2582 imbn = 1;
2583 } else
2584 sds->this_load_per_task =
2585 cpu_avg_load_per_task(this_cpu);
2586
2587 scaled_busy_load_per_task = sds->busiest_load_per_task
2588 * SCHED_LOAD_SCALE;
2589 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2590
2591 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2592 (scaled_busy_load_per_task * imbn)) {
2593 *imbalance = sds->busiest_load_per_task;
2594 return;
2595 }
2596
2597 /*
2598 * OK, we don't have enough imbalance to justify moving tasks,
2599 * however we may be able to increase total CPU power used by
2600 * moving them.
2601 */
2602
2603 pwr_now += sds->busiest->cpu_power *
2604 min(sds->busiest_load_per_task, sds->max_load);
2605 pwr_now += sds->this->cpu_power *
2606 min(sds->this_load_per_task, sds->this_load);
2607 pwr_now /= SCHED_LOAD_SCALE;
2608
2609 /* Amount of load we'd subtract */
2610 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2611 sds->busiest->cpu_power;
2612 if (sds->max_load > tmp)
2613 pwr_move += sds->busiest->cpu_power *
2614 min(sds->busiest_load_per_task, sds->max_load - tmp);
2615
2616 /* Amount of load we'd add */
2617 if (sds->max_load * sds->busiest->cpu_power <
2618 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2619 tmp = (sds->max_load * sds->busiest->cpu_power) /
2620 sds->this->cpu_power;
2621 else
2622 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2623 sds->this->cpu_power;
2624 pwr_move += sds->this->cpu_power *
2625 min(sds->this_load_per_task, sds->this_load + tmp);
2626 pwr_move /= SCHED_LOAD_SCALE;
2627
2628 /* Move if we gain throughput */
2629 if (pwr_move > pwr_now)
2630 *imbalance = sds->busiest_load_per_task;
2631}
2632
2633/**
2634 * calculate_imbalance - Calculate the amount of imbalance present within the
2635 * groups of a given sched_domain during load balance.
2636 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
2637 * @this_cpu: Cpu for which currently load balance is being performed.
2638 * @imbalance: The variable to store the imbalance.
2639 */
2640static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2641 unsigned long *imbalance)
2642{
2643 unsigned long max_pull, load_above_capacity = ~0UL;
2644
2645 sds->busiest_load_per_task /= sds->busiest_nr_running;
2646 if (sds->group_imb) {
2647 sds->busiest_load_per_task =
2648 min(sds->busiest_load_per_task, sds->avg_load);
2649 }
2650
2651 /*
2652 * In the presence of smp nice balancing, certain scenarios can have
 2653 * max load less than avg load (as we skip the groups at or below
 2654 * its cpu_power while calculating max_load).
2655 */
2656 if (sds->max_load < sds->avg_load) {
2657 *imbalance = 0;
2658 return fix_small_imbalance(sds, this_cpu, imbalance);
2659 }
2660
2661 if (!sds->group_imb) {
2662 /*
2663 * Don't want to pull so many tasks that a group would go idle.
2664 */
2665 load_above_capacity = (sds->busiest_nr_running -
2666 sds->busiest_group_capacity);
2667
2668 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
2669
2670 load_above_capacity /= sds->busiest->cpu_power;
2671 }
2672
2673 /*
2674 * We're trying to get all the cpus to the average_load, so we don't
2675 * want to push ourselves above the average load, nor do we wish to
2676 * reduce the max loaded cpu below the average load. At the same time,
2677 * we also don't want to reduce the group load below the group capacity
2678 * (so that we can implement power-savings policies etc). Thus we look
2679 * for the minimum possible imbalance.
2680 * Be careful of negative numbers as they'll appear as very large values
2681 * with unsigned longs.
2682 */
2683 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
2684
2685 /* How much load to actually move to equalise the imbalance */
2686 *imbalance = min(max_pull * sds->busiest->cpu_power,
2687 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
2688 / SCHED_LOAD_SCALE;
2689
2690 /*
2691 * if *imbalance is less than the average load per runnable task
 2692 * there is no guarantee that any tasks will be moved, so we'll have
2693 * a think about bumping its value to force at least one task to be
2694 * moved
2695 */
2696 if (*imbalance < sds->busiest_load_per_task)
2697 return fix_small_imbalance(sds, this_cpu, imbalance);
2698
2699}
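Once the statistics are in hand, the imbalance computation above reduces to two min() operations: take the smaller of "excess load on the busiest group" and "load above its capacity", then cap it by the headroom on the local group. A worked example with assumed per-group numbers, all in SCHED_LOAD_SCALE units and already scaled the way the code scales them:

	#include <stdio.h>

	#define SCHED_LOAD_SCALE 1024UL

	static unsigned long min_ul(unsigned long a, unsigned long b)
	{
		return a < b ? a : b;
	}

	int main(void)
	{
		/* Assumed statistics, as update_sd_lb_stats() might produce them. */
		unsigned long max_load = 3000, avg_load = 2000, this_load = 1200;
		unsigned long busiest_power = 1024, this_power = 1024;
		unsigned long load_above_capacity = 1500;	/* assumed, pre-scaled */

		unsigned long max_pull = min_ul(max_load - avg_load, load_above_capacity);
		unsigned long imbalance = min_ul(max_pull * busiest_power,
						 (avg_load - this_load) * this_power)
					  / SCHED_LOAD_SCALE;

		printf("max_pull = %lu, imbalance = %lu\n", max_pull, imbalance);
		return 0;
	}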
2700/******* find_busiest_group() helpers end here *********************/
2701
2702/**
2703 * find_busiest_group - Returns the busiest group within the sched_domain
2704 * if there is an imbalance. If there isn't an imbalance, and
2705 * the user has opted for power-savings, it returns a group whose
2706 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
2707 * such a group exists.
2708 *
2709 * Also calculates the amount of weighted load which should be moved
2710 * to restore balance.
2711 *
2712 * @sd: The sched_domain whose busiest group is to be returned.
2713 * @this_cpu: The cpu for which load balancing is currently being performed.
2714 * @imbalance: Variable which stores amount of weighted load which should
2715 * be moved to restore balance/put a group to idle.
2716 * @idle: The idle status of this_cpu.
2717 * @sd_idle: The idleness of sd
2718 * @cpus: The set of CPUs under consideration for load-balancing.
2719 * @balance: Pointer to a variable indicating if this_cpu
2720 * is the appropriate cpu to perform load balancing at this_level.
2721 *
2722 * Returns: - the busiest group if imbalance exists.
2723 * - If no imbalance and user has opted for power-savings balance,
2724 * return the least loaded group whose CPUs can be
2725 * put to idle by rebalancing its tasks onto our group.
2726 */
2727static struct sched_group *
2728find_busiest_group(struct sched_domain *sd, int this_cpu,
2729 unsigned long *imbalance, enum cpu_idle_type idle,
2730 int *sd_idle, const struct cpumask *cpus, int *balance)
2731{
2732 struct sd_lb_stats sds;
2733
2734 memset(&sds, 0, sizeof(sds));
2735
2736 /*
 2737 * Compute the various statistics relevant for load balancing at
2738 * this level.
2739 */
2740 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
2741 balance, &sds);
2742
2743 /* Cases where imbalance does not exist from POV of this_cpu */
2744 /* 1) this_cpu is not the appropriate cpu to perform load balancing
2745 * at this level.
2746 * 2) There is no busy sibling group to pull from.
2747 * 3) This group is the busiest group.
 2748 * 4) This group is busier than the average busyness at this
2749 * sched_domain.
2750 * 5) The imbalance is within the specified limit.
2751 */
2752 if (!(*balance))
2753 goto ret;
2754
2755 if (!sds.busiest || sds.busiest_nr_running == 0)
2756 goto out_balanced;
2757
2758 if (sds.this_load >= sds.max_load)
2759 goto out_balanced;
2760
2761 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
2762
2763 if (sds.this_load >= sds.avg_load)
2764 goto out_balanced;
2765
2766 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2767 goto out_balanced;
2768
2769 /* Looks like there is an imbalance. Compute it */
2770 calculate_imbalance(&sds, this_cpu, imbalance);
2771 return sds.busiest;
2772
2773out_balanced:
2774 /*
2775 * There is no obvious imbalance. But check if we can do some balancing
2776 * to save power.
2777 */
2778 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
2779 return sds.busiest;
2780ret:
2781 *imbalance = 0;
2782 return NULL;
2783}
2784
2785/*
2786 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2787 */
2788static struct rq *
2789find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2790 unsigned long imbalance, const struct cpumask *cpus)
2791{
2792 struct rq *busiest = NULL, *rq;
2793 unsigned long max_load = 0;
2794 int i;
2795
2796 for_each_cpu(i, sched_group_cpus(group)) {
2797 unsigned long power = power_of(i);
2798 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2799 unsigned long wl;
2800
2801 if (!cpumask_test_cpu(i, cpus))
2802 continue;
2803
2804 rq = cpu_rq(i);
2805 wl = weighted_cpuload(i);
2806
2807 /*
2808 * When comparing with imbalance, use weighted_cpuload()
2809 * which is not scaled with the cpu power.
2810 */
2811 if (capacity && rq->nr_running == 1 && wl > imbalance)
2812 continue;
2813
2814 /*
2815 * For the load comparisons with the other cpu's, consider
2816 * the weighted_cpuload() scaled with the cpu power, so that
2817 * the load can be moved away from the cpu that is potentially
2818 * running at a lower capacity.
2819 */
2820 wl = (wl * SCHED_LOAD_SCALE) / power;
2821
2822 if (wl > max_load) {
2823 max_load = wl;
2824 busiest = rq;
2825 }
2826 }
2827
2828 return busiest;
2829}
2830
2831/*
2832 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2833 * so long as it is large enough.
2834 */
2835#define MAX_PINNED_INTERVAL 512
2836
2837/* Working cpumask for load_balance and load_balance_newidle. */
2838static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2839
2840static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2841{
2842 if (idle == CPU_NEWLY_IDLE) {
2843 /*
2844 * The only task running in a non-idle cpu can be moved to this
 2845 * cpu in an attempt to completely free up the other CPU
2846 * package.
2847 *
2848 * The package power saving logic comes from
 2849 * find_busiest_group(). If there is no imbalance, then
2850 * f_b_g() will return NULL. However when sched_mc={1,2} then
2851 * f_b_g() will select a group from which a running task may be
2852 * pulled to this cpu in order to make the other package idle.
2853 * If there is no opportunity to make a package idle and if
 2854 * there is no imbalance, then f_b_g() will return NULL and no
2855 * action will be taken in load_balance_newidle().
2856 *
2857 * Under normal task pull operation due to imbalance, there
2858 * will be more than one task in the source run queue and
2859 * move_tasks() will succeed. ld_moved will be true and this
2860 * active balance code will not be triggered.
2861 */
2862 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2863 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2864 return 0;
2865
2866 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2867 return 0;
2868 }
2869
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871}
2872
2873/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance.
2876 */
2877static int load_balance(int this_cpu, struct rq *this_rq,
2878 struct sched_domain *sd, enum cpu_idle_type idle,
2879 int *balance)
2880{
2881 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2882 struct sched_group *group;
2883 unsigned long imbalance;
2884 struct rq *busiest;
2885 unsigned long flags;
2886 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
2887
2888 cpumask_copy(cpus, cpu_active_mask);
2889
2890 /*
2891 * When power savings policy is enabled for the parent domain, idle
2892 * sibling can pick up load irrespective of busy siblings. In this case,
2893 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2894 * portraying it as CPU_NOT_IDLE.
2895 */
2896 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2897 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2898 sd_idle = 1;
2899
2900 schedstat_inc(sd, lb_count[idle]);
2901
2902redo:
2903 update_shares(sd);
2904 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2905 cpus, balance);
2906
2907 if (*balance == 0)
2908 goto out_balanced;
2909
2910 if (!group) {
2911 schedstat_inc(sd, lb_nobusyg[idle]);
2912 goto out_balanced;
2913 }
2914
2915 busiest = find_busiest_queue(group, idle, imbalance, cpus);
2916 if (!busiest) {
2917 schedstat_inc(sd, lb_nobusyq[idle]);
2918 goto out_balanced;
2919 }
2920
2921 BUG_ON(busiest == this_rq);
2922
2923 schedstat_add(sd, lb_imbalance[idle], imbalance);
2924
2925 ld_moved = 0;
2926 if (busiest->nr_running > 1) {
2927 /*
2928 * Attempt to move tasks. If find_busiest_group has found
2929 * an imbalance but busiest->nr_running <= 1, the group is
2930 * still unbalanced. ld_moved simply stays zero, so it is
2931 * correctly treated as an imbalance.
2932 */
2933 local_irq_save(flags);
2934 double_rq_lock(this_rq, busiest);
2935 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2936 imbalance, sd, idle, &all_pinned);
2937 double_rq_unlock(this_rq, busiest);
2938 local_irq_restore(flags);
2939
2940 /*
2941 * some other cpu did the load balance for us.
2942 */
2943 if (ld_moved && this_cpu != smp_processor_id())
2944 resched_cpu(this_cpu);
2945
2946 /* All tasks on this runqueue were pinned by CPU affinity */
2947 if (unlikely(all_pinned)) {
2948 cpumask_clear_cpu(cpu_of(busiest), cpus);
2949 if (!cpumask_empty(cpus))
2950 goto redo;
2951 goto out_balanced;
2952 }
2953 }
2954
2955 if (!ld_moved) {
2956 schedstat_inc(sd, lb_failed[idle]);
2957 sd->nr_balance_failed++;
2958
2959 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags);
2961
2962 /* don't kick the migration_thread, if the curr
2963 * task on busiest cpu can't be moved to this_cpu
2964 */
2965 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) {
2967 raw_spin_unlock_irqrestore(&busiest->lock,
2968 flags);
2969 all_pinned = 1;
2970 goto out_one_pinned;
2971 }
2972
2973 if (!busiest->active_balance) {
2974 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu;
2976 active_balance = 1;
2977 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2979 if (active_balance)
2980 wake_up_process(busiest->migration_thread);
2981
2982 /*
2983 * We've kicked active balancing, reset the failure
2984 * counter.
2985 */
2986 sd->nr_balance_failed = sd->cache_nice_tries+1;
2987 }
2988 } else
2989 sd->nr_balance_failed = 0;
2990
2991 if (likely(!active_balance)) {
2992 /* We were unbalanced, so reset the balancing interval */
2993 sd->balance_interval = sd->min_interval;
2994 } else {
2995 /*
2996 * If we've begun active balancing, start to back off. This
2997 * case may not be covered by the all_pinned logic if there
2998 * is only 1 task on the busy runqueue (because we don't call
2999 * move_tasks).
3000 */
3001 if (sd->balance_interval < sd->max_interval)
3002 sd->balance_interval *= 2;
3003 }
3004
3005 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3006 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3007 ld_moved = -1;
3008
3009 goto out;
3010
3011out_balanced:
3012 schedstat_inc(sd, lb_balanced[idle]);
3013
3014 sd->nr_balance_failed = 0;
3015
3016out_one_pinned:
3017 /* tune up the balancing interval */
3018 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3019 (sd->balance_interval < sd->max_interval))
3020 sd->balance_interval *= 2;
3021
3022 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3023 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3024 ld_moved = -1;
3025 else
3026 ld_moved = 0;
3027out:
3028 if (ld_moved)
3029 update_shares(sd);
3030 return ld_moved;
3031}
3032
3033/*
3034 * idle_balance is called by schedule() if this_cpu is about to become
3035 * idle. Attempts to pull tasks from other CPUs.
3036 */
3037static void idle_balance(int this_cpu, struct rq *this_rq)
3038{
3039 struct sched_domain *sd;
3040 int pulled_task = 0;
3041 unsigned long next_balance = jiffies + HZ;
3042
3043 this_rq->idle_stamp = this_rq->clock;
3044
3045 if (this_rq->avg_idle < sysctl_sched_migration_cost)
3046 return;
3047
3048 /*
3049 * Drop the rq->lock, but keep IRQ/preempt disabled.
3050 */
3051 raw_spin_unlock(&this_rq->lock);
3052
3053 for_each_domain(this_cpu, sd) {
3054 unsigned long interval;
3055 int balance = 1;
3056
3057 if (!(sd->flags & SD_LOAD_BALANCE))
3058 continue;
3059
3060 if (sd->flags & SD_BALANCE_NEWIDLE) {
3061 /* If we've pulled tasks over stop searching: */
3062 pulled_task = load_balance(this_cpu, this_rq,
3063 sd, CPU_NEWLY_IDLE, &balance);
3064 }
3065
3066 interval = msecs_to_jiffies(sd->balance_interval);
3067 if (time_after(next_balance, sd->last_balance + interval))
3068 next_balance = sd->last_balance + interval;
3069 if (pulled_task) {
3070 this_rq->idle_stamp = 0;
3071 break;
3072 }
3073 }
3074
3075 raw_spin_lock(&this_rq->lock);
3076
3077 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3078 /*
3079 * We are going idle. next_balance may be set based on
3080 * a busy processor. So reset next_balance.
3081 */
3082 this_rq->next_balance = next_balance;
3083 }
3084}
3085
3086/*
3087 * active_load_balance is run by migration threads. It pushes running tasks
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
3089 * running on each physical CPU where possible, and avoids physical /
3090 * logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3095{
3096 int target_cpu = busiest_rq->push_cpu;
3097 struct sched_domain *sd;
3098 struct rq *target_rq;
3099
3100 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1)
3102 return;
3103
3104 target_rq = cpu_rq(target_cpu);
3105
3106 /*
3107 * This condition is "impossible", if it occurs
3108 * we need to fix it. Originally reported by
3109 * Bjorn Helgaas on a 128-cpu setup.
3110 */
3111 BUG_ON(busiest_rq == target_rq);
3112
3113 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117
3118 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) {
3120 if ((sd->flags & SD_LOAD_BALANCE) &&
3121 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3122 break;
3123 }
3124
3125 if (likely(sd)) {
3126 schedstat_inc(sd, alb_count);
3127
3128 if (move_one_task(target_rq, target_cpu, busiest_rq,
3129 sd, CPU_IDLE))
3130 schedstat_inc(sd, alb_pushed);
3131 else
3132 schedstat_inc(sd, alb_failed);
3133 }
3134 double_unlock_balance(busiest_rq, target_rq);
3135}
3136
3137#ifdef CONFIG_NO_HZ
3138static struct {
3139 atomic_t load_balancer;
3140 cpumask_var_t cpu_mask;
3141 cpumask_var_t ilb_grp_nohz_mask;
3142} nohz ____cacheline_aligned = {
3143 .load_balancer = ATOMIC_INIT(-1),
3144};
3145
3146int get_nohz_load_balancer(void)
3147{
3148 return atomic_read(&nohz.load_balancer);
3149}
3150
3151#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3152/**
3153 * lowest_flag_domain - Return lowest sched_domain containing flag.
3154 * @cpu: The cpu whose lowest level of sched domain is to
3155 * be returned.
3156 * @flag: The flag to check for the lowest sched_domain
3157 * for the given cpu.
3158 *
3159 * Returns the lowest sched_domain of a cpu which contains the given flag.
3160 */
3161static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3162{
3163 struct sched_domain *sd;
3164
3165 for_each_domain(cpu, sd)
3166 if (sd && (sd->flags & flag))
3167 break;
3168
3169 return sd;
3170}
3171
3172/**
3173 * for_each_flag_domain - Iterates over sched_domains containing the flag.
3174 * @cpu: The cpu whose domains we're iterating over.
3175 * @sd: variable holding the value of the power_savings_sd
3176 * for cpu.
3177 * @flag: The flag to filter the sched_domains to be iterated.
3178 *
3179 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
3180 * set, starting from the lowest sched_domain to the highest.
3181 */
3182#define for_each_flag_domain(cpu, sd, flag) \
3183 for (sd = lowest_flag_domain(cpu, flag); \
3184 (sd && (sd->flags & flag)); sd = sd->parent)
3185
3186/**
3187 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3188 * @ilb_group: group to be checked for semi-idleness
3189 *
3190 * Returns: 1 if the group is semi-idle. 0 otherwise.
3191 *
 3192 * We define a sched_group to be semi-idle if it has at least one idle CPU
 3193 * and at least one non-idle CPU. This helper function checks if the given
3194 * sched_group is semi-idle or not.
3195 */
3196static inline int is_semi_idle_group(struct sched_group *ilb_group)
3197{
3198 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
3199 sched_group_cpus(ilb_group));
3200
3201 /*
 3202 * A sched_group is semi-idle when it has at least one busy cpu
 3203 * and at least one idle cpu.
3204 */
3205 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
3206 return 0;
3207
3208 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
3209 return 0;
3210
3211 return 1;
3212}
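The semi-idle test above is just two cpumask comparisons: the group's intersection with the nohz (tick-stopped) mask must be non-empty but must not cover the whole group. With small bitmasks standing in for cpumask_t, the same check reads:

	#include <stdbool.h>
	#include <stdio.h>

	/* Toy cpumasks as plain words; the kernel uses cpumask_and(),
	 * cpumask_empty() and cpumask_equal() on real cpumask_t values. */
	static bool is_semi_idle(unsigned long group_cpus, unsigned long idle_cpus)
	{
		unsigned long idle_in_group = group_cpus & idle_cpus;

		if (!idle_in_group)			/* no idle CPU in the group */
			return false;
		if (idle_in_group == group_cpus)	/* every CPU in the group is idle */
			return false;
		return true;				/* at least one idle and one busy CPU */
	}

	int main(void)
	{
		printf("group 0x0f, idle 0x03 -> %d\n", is_semi_idle(0x0f, 0x03)); /* 1 */
		printf("group 0x0f, idle 0x0f -> %d\n", is_semi_idle(0x0f, 0x0f)); /* 0 */
		printf("group 0x0f, idle 0x30 -> %d\n", is_semi_idle(0x0f, 0x30)); /* 0 */
		return 0;
	}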
3213/**
3214 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3215 * @cpu: The cpu which is nominating a new idle_load_balancer.
3216 *
 3217 * Returns: The id of the idle load balancer if it exists,
3218 * Else, returns >= nr_cpu_ids.
3219 *
3220 * This algorithm picks the idle load balancer such that it belongs to a
3221 * semi-idle powersavings sched_domain. The idea is to try and avoid
3222 * completely idle packages/cores just for the purpose of idle load balancing
 3223 * when there are other idle CPUs which are better suited for that job.
3224 */
3225static int find_new_ilb(int cpu)
3226{
3227 struct sched_domain *sd;
3228 struct sched_group *ilb_group;
3229
3230 /*
3231 * Have idle load balancer selection from semi-idle packages only
3232 * when power-aware load balancing is enabled
3233 */
3234 if (!(sched_smt_power_savings || sched_mc_power_savings))
3235 goto out_done;
3236
3237 /*
3238 * Optimize for the case when we have no idle CPUs or only one
3239 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3240 */
3241 if (cpumask_weight(nohz.cpu_mask) < 2)
3242 goto out_done;
3243
3244 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3245 ilb_group = sd->groups;
3246
3247 do {
3248 if (is_semi_idle_group(ilb_group))
3249 return cpumask_first(nohz.ilb_grp_nohz_mask);
3250
3251 ilb_group = ilb_group->next;
3252
3253 } while (ilb_group != sd->groups);
3254 }
3255
3256out_done:
3257 return cpumask_first(nohz.cpu_mask);
3258}
3259#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3260static inline int find_new_ilb(int call_cpu)
3261{
3262 return cpumask_first(nohz.cpu_mask);
3263}
3264#endif
3265
3266/*
3267 * This routine will try to nominate the ilb (idle load balancing)
3268 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3269 * load balancing on behalf of all those cpus. If all the cpus in the system
3270 * go into this tickless mode, then there will be no ilb owner (as there is
3271 * no need for one) and all the cpus will sleep till the next wakeup event
3272 * arrives...
3273 *
3274 * For the ilb owner, tick is not stopped. And this tick will be used
3275 * for idle load balancing. ilb owner will still be part of
3276 * nohz.cpu_mask..
3277 *
3278 * While stopping the tick, this cpu will become the ilb owner if there
3279 * is no other owner. And will be the owner till that cpu becomes busy
3280 * or if all cpus in the system stop their ticks at which point
3281 * there is no need for ilb owner.
3282 *
3283 * When the ilb owner becomes busy, it nominates another owner, during the
3284 * next busy scheduler_tick()
3285 */
3286int select_nohz_load_balancer(int stop_tick)
3287{
3288 int cpu = smp_processor_id();
3289
3290 if (stop_tick) {
3291 cpu_rq(cpu)->in_nohz_recently = 1;
3292
3293 if (!cpu_active(cpu)) {
3294 if (atomic_read(&nohz.load_balancer) != cpu)
3295 return 0;
3296
3297 /*
3298 * If we are going offline and still the leader,
3299 * give up!
3300 */
3301 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3302 BUG();
3303
3304 return 0;
3305 }
3306
3307 cpumask_set_cpu(cpu, nohz.cpu_mask);
3308
3309 /* time for ilb owner also to sleep */
3310 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
3311 if (atomic_read(&nohz.load_balancer) == cpu)
3312 atomic_set(&nohz.load_balancer, -1);
3313 return 0;
3314 }
3315
3316 if (atomic_read(&nohz.load_balancer) == -1) {
3317 /* make me the ilb owner */
3318 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3319 return 1;
3320 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3321 int new_ilb;
3322
3323 if (!(sched_smt_power_savings ||
3324 sched_mc_power_savings))
3325 return 1;
3326 /*
3327 * Check to see if there is a more power-efficient
3328 * ilb.
3329 */
3330 new_ilb = find_new_ilb(cpu);
3331 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3332 atomic_set(&nohz.load_balancer, -1);
3333 resched_cpu(new_ilb);
3334 return 0;
3335 }
3336 return 1;
3337 }
3338 } else {
3339 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3340 return 0;
3341
3342 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3343
3344 if (atomic_read(&nohz.load_balancer) == cpu)
3345 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3346 BUG();
3347 }
3348 return 0;
3349}
3350#endif
3351
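select_nohz_load_balancer() above elects and retires the ilb owner with atomic_cmpxchg() on nohz.load_balancer, so exactly one tickless cpu wins the nomination. A small C11 sketch of that claim/release pattern, using stdatomic in place of the kernel's atomic_t (the cpu ids and function names are assumptions for illustration only):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int load_balancer = -1;    /* -1 means: no ilb owner */

/* Mirror of the atomic_cmpxchg(&nohz.load_balancer, -1, cpu) step. */
static int claim_ilb(int cpu)
{
        int expected = -1;
        return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

/* Give up ownership when the cpu goes busy or offline. */
static int release_ilb(int cpu)
{
        int expected = cpu;
        return atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}

int main(void)
{
        printf("cpu2 claims: %d\n", claim_ilb(2));    /* 1: became owner      */
        printf("cpu3 claims: %d\n", claim_ilb(3));    /* 0: cpu2 already owns */
        printf("cpu2 drops:  %d\n", release_ilb(2));  /* 1: ownership cleared */
        return 0;
}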
3352static DEFINE_SPINLOCK(balancing);
3353
3354/*
3355 * It checks each scheduling domain to see if it is due to be balanced,
3356 * and initiates a balancing operation if so.
3357 *
3358 * Balancing parameters are set up in arch_init_sched_domains.
3359 */
3360static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3361{
3362 int balance = 1;
3363 struct rq *rq = cpu_rq(cpu);
3364 unsigned long interval;
3365 struct sched_domain *sd;
3366 /* Earliest time when we have to do rebalance again */
3367 unsigned long next_balance = jiffies + 60*HZ;
3368 int update_next_balance = 0;
3369 int need_serialize;
3370
3371 for_each_domain(cpu, sd) {
3372 if (!(sd->flags & SD_LOAD_BALANCE))
3373 continue;
3374
3375 interval = sd->balance_interval;
3376 if (idle != CPU_IDLE)
3377 interval *= sd->busy_factor;
3378
3379 /* scale ms to jiffies */
3380 interval = msecs_to_jiffies(interval);
3381 if (unlikely(!interval))
3382 interval = 1;
3383 if (interval > HZ*NR_CPUS/10)
3384 interval = HZ*NR_CPUS/10;
3385
3386 need_serialize = sd->flags & SD_SERIALIZE;
3387
3388 if (need_serialize) {
3389 if (!spin_trylock(&balancing))
3390 goto out;
3391 }
3392
3393 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3394 if (load_balance(cpu, rq, sd, idle, &balance)) {
3395 /*
3396 * We've pulled tasks over so either we're no
3397 * longer idle, or one of our SMT siblings is
3398 * not idle.
3399 */
3400 idle = CPU_NOT_IDLE;
3401 }
3402 sd->last_balance = jiffies;
3403 }
3404 if (need_serialize)
3405 spin_unlock(&balancing);
3406out:
3407 if (time_after(next_balance, sd->last_balance + interval)) {
3408 next_balance = sd->last_balance + interval;
3409 update_next_balance = 1;
3410 }
3411
3412 /*
3413 * Stop the load balance at this level. There is another
3414 * CPU in our sched group which is doing load balancing more
3415 * actively.
3416 */
3417 if (!balance)
3418 break;
3419 }
3420
3421 /*
3422 * next_balance will be updated only when there is a need.
3423 * When the cpu is attached to a null domain, for example, it will not be
3424 * updated.
3425 */
3426 if (likely(update_next_balance))
3427 rq->next_balance = next_balance;
3428}
3429
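rebalance_domains() above derives each domain's balancing period from balance_interval, multiplies it by busy_factor when the cpu is not idle, converts the result to jiffies and clamps it. A rough userspace sketch of that arithmetic; the HZ and NR_CPUS values and the rounding inside msecs_to_jiffies() are assumptions, not the kernel's exact definitions:

#include <stdio.h>

#define HZ      250
#define NR_CPUS 8

static unsigned long msecs_to_jiffies(unsigned long ms)
{
        return (ms * HZ + 999) / 1000;           /* crude round-up model */
}

static unsigned long balance_interval(unsigned long base_ms,
                                      unsigned int busy_factor, int idle)
{
        unsigned long interval = base_ms;

        if (!idle)
                interval *= busy_factor;         /* balance less often when busy */
        interval = msecs_to_jiffies(interval);
        if (!interval)
                interval = 1;
        if (interval > HZ * NR_CPUS / 10)
                interval = HZ * NR_CPUS / 10;    /* same upper clamp as above */
        return interval;
}

int main(void)
{
        printf("idle: %lu jiffies\n", balance_interval(64, 32, 1));
        printf("busy: %lu jiffies\n", balance_interval(64, 32, 0));
        return 0;
}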
3430/*
3431 * run_rebalance_domains is triggered when needed from the scheduler tick.
3432 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3433 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3434 */
3435static void run_rebalance_domains(struct softirq_action *h)
3436{
3437 int this_cpu = smp_processor_id();
3438 struct rq *this_rq = cpu_rq(this_cpu);
3439 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3440 CPU_IDLE : CPU_NOT_IDLE;
3441
3442 rebalance_domains(this_cpu, idle);
3443
3444#ifdef CONFIG_NO_HZ
3445 /*
3446 * If this cpu is the owner for idle load balancing, then do the
3447 * balancing on behalf of the other idle cpus whose ticks are
3448 * stopped.
3449 */
3450 if (this_rq->idle_at_tick &&
3451 atomic_read(&nohz.load_balancer) == this_cpu) {
3452 struct rq *rq;
3453 int balance_cpu;
3454
3455 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3456 if (balance_cpu == this_cpu)
3457 continue;
3458
3459 /*
3460 * If this cpu gets work to do, stop the load balancing
3461 * work being done for other cpus. Next load
3462 * balancing owner will pick it up.
3463 */
3464 if (need_resched())
3465 break;
3466
3467 rebalance_domains(balance_cpu, CPU_IDLE);
3468
3469 rq = cpu_rq(balance_cpu);
3470 if (time_after(this_rq->next_balance, rq->next_balance))
3471 this_rq->next_balance = rq->next_balance;
3472 }
3473 }
3474#endif
3475}
3476
3477static inline int on_null_domain(int cpu)
3478{
3479 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
3480}
3481
3482/*
3483 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3484 *
3485 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3486 * idle load balancing owner or decide to stop the periodic load balancing,
3487 * if the whole system is idle.
3488 */
3489static inline void trigger_load_balance(struct rq *rq, int cpu)
3490{
3491#ifdef CONFIG_NO_HZ
3492 /*
3493 * If we were in the nohz mode recently and busy at the current
3494 * scheduler tick, then check if we need to nominate a new idle
3495 * load balancer.
3496 */
3497 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3498 rq->in_nohz_recently = 0;
3499
3500 if (atomic_read(&nohz.load_balancer) == cpu) {
3501 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3502 atomic_set(&nohz.load_balancer, -1);
3503 }
3504
3505 if (atomic_read(&nohz.load_balancer) == -1) {
3506 int ilb = find_new_ilb(cpu);
3507
3508 if (ilb < nr_cpu_ids)
3509 resched_cpu(ilb);
3510 }
3511 }
3512
3513 /*
3514 * If this cpu is idle and doing idle load balancing for all the
3515 * cpus with ticks stopped, is it time for that to stop?
3516 */
3517 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3518 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3519 resched_cpu(cpu);
3520 return;
3521 }
3522
3523 /*
3524 * If this cpu is idle and the idle load balancing is done by
3525 * someone else, then there is no need to raise the SCHED_SOFTIRQ.
3526 */
3527 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3528 cpumask_test_cpu(cpu, nohz.cpu_mask))
3529 return;
3530#endif
3531 /* Don't need to rebalance while attached to NULL domain */
3532 if (time_after_eq(jiffies, rq->next_balance) &&
3533 likely(!on_null_domain(cpu)))
3534 raise_softirq(SCHED_SOFTIRQ);
3535}
1954 3536
1955static void rq_online_fair(struct rq *rq) 3537static void rq_online_fair(struct rq *rq)
1956{ 3538{
@@ -1962,6 +3544,15 @@ static void rq_offline_fair(struct rq *rq)
1962 update_sysctl(); 3544 update_sysctl();
1963} 3545}
1964 3546
3547#else /* CONFIG_SMP */
3548
3549/*
3550 * on UP we do not need to balance between CPUs:
3551 */
3552static inline void idle_balance(int cpu, struct rq *rq)
3553{
3554}
3555
1965#endif /* CONFIG_SMP */ 3556#endif /* CONFIG_SMP */
1966 3557
1967/* 3558/*
@@ -2076,7 +3667,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq)
2076} 3667}
2077#endif 3668#endif
2078 3669
2079unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 3670static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2080{ 3671{
2081 struct sched_entity *se = &task->se; 3672 struct sched_entity *se = &task->se;
2082 unsigned int rr_interval = 0; 3673 unsigned int rr_interval = 0;
@@ -2108,8 +3699,6 @@ static const struct sched_class fair_sched_class = {
2108#ifdef CONFIG_SMP 3699#ifdef CONFIG_SMP
2109 .select_task_rq = select_task_rq_fair, 3700 .select_task_rq = select_task_rq_fair,
2110 3701
2111 .load_balance = load_balance_fair,
2112 .move_one_task = move_one_task_fair,
2113 .rq_online = rq_online_fair, 3702 .rq_online = rq_online_fair,
2114 .rq_offline = rq_offline_fair, 3703 .rq_offline = rq_offline_fair,
2115 3704
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 5f93b570d383..a8a6d8a50947 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
44{ 44{
45} 45}
46 46
47#ifdef CONFIG_SMP
48static unsigned long
49load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
50 unsigned long max_load_move,
51 struct sched_domain *sd, enum cpu_idle_type idle,
52 int *all_pinned, int *this_best_prio)
53{
54 return 0;
55}
56
57static int
58move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
59 struct sched_domain *sd, enum cpu_idle_type idle)
60{
61 return 0;
62}
63#endif
64
65static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
66{ 48{
67} 49}
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
98} 80}
99 81
100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 83{
102 return 0; 84 return 0;
103} 85}
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
119 101
120#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
121 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
122
123 .load_balance = load_balance_idle,
124 .move_one_task = move_one_task_idle,
125#endif 104#endif
126 105
127 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f48328ac216f..b5b920ae2ea7 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
194 return rt_se->my_q; 194 return rt_se->my_q;
195} 195}
196 196
197static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 197static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
198static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 198static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 199
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 int this_cpu = smp_processor_id();
202 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
203 struct sched_rt_entity *rt_se = rt_rq->rt_se; 204 struct sched_rt_entity *rt_se;
205
206 rt_se = rt_rq->tg->rt_se[this_cpu];
204 207
205 if (rt_rq->rt_nr_running) { 208 if (rt_rq->rt_nr_running) {
206 if (rt_se && !on_rt_rq(rt_se)) 209 if (rt_se && !on_rt_rq(rt_se))
207 enqueue_rt_entity(rt_se); 210 enqueue_rt_entity(rt_se, false);
208 if (rt_rq->highest_prio.curr < curr->prio) 211 if (rt_rq->highest_prio.curr < curr->prio)
209 resched_task(curr); 212 resched_task(curr);
210 } 213 }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212 215
213static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
214{ 217{
215 struct sched_rt_entity *rt_se = rt_rq->rt_se; 218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se;
220
221 rt_se = rt_rq->tg->rt_se[this_cpu];
216 222
217 if (rt_se && on_rt_rq(rt_se)) 223 if (rt_se && on_rt_rq(rt_se))
218 dequeue_rt_entity(rt_se); 224 dequeue_rt_entity(rt_se);
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
803 dec_rt_group(rt_se, rt_rq); 809 dec_rt_group(rt_se, rt_rq);
804} 810}
805 811
806static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 812static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
807{ 813{
808 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 814 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
809 struct rt_prio_array *array = &rt_rq->active; 815 struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
819 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
820 return; 826 return;
821 827
822 list_add_tail(&rt_se->run_list, queue); 828 if (head)
829 list_add(&rt_se->run_list, queue);
830 else
831 list_add_tail(&rt_se->run_list, queue);
823 __set_bit(rt_se_prio(rt_se), array->bitmap); 832 __set_bit(rt_se_prio(rt_se), array->bitmap);
824 833
825 inc_rt_tasks(rt_se, rt_rq); 834 inc_rt_tasks(rt_se, rt_rq);
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
856 } 865 }
857} 866}
858 867
859static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 868static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
860{ 869{
861 dequeue_rt_stack(rt_se); 870 dequeue_rt_stack(rt_se);
862 for_each_sched_rt_entity(rt_se) 871 for_each_sched_rt_entity(rt_se)
863 __enqueue_rt_entity(rt_se); 872 __enqueue_rt_entity(rt_se, head);
864} 873}
865 874
866static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 875static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
871 struct rt_rq *rt_rq = group_rt_rq(rt_se); 880 struct rt_rq *rt_rq = group_rt_rq(rt_se);
872 881
873 if (rt_rq && rt_rq->rt_nr_running) 882 if (rt_rq && rt_rq->rt_nr_running)
874 __enqueue_rt_entity(rt_se); 883 __enqueue_rt_entity(rt_se, false);
875 } 884 }
876} 885}
877 886
878/* 887/*
879 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
880 */ 889 */
881static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
882{ 892{
883 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
884 894
885 if (wakeup) 895 if (wakeup)
886 rt_se->timeout = 0; 896 rt_se->timeout = 0;
887 897
888 enqueue_rt_entity(rt_se); 898 enqueue_rt_entity(rt_se, head);
889 899
890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
891 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
@@ -1136,7 +1146,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1136 if (next && next->prio < idx) 1146 if (next && next->prio < idx)
1137 continue; 1147 continue;
1138 list_for_each_entry(rt_se, array->queue + idx, run_list) { 1148 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1139 struct task_struct *p = rt_task_of(rt_se); 1149 struct task_struct *p;
1150
1151 if (!rt_entity_is_task(rt_se))
1152 continue;
1153
1154 p = rt_task_of(rt_se);
1140 if (pick_rt_task(rq, p, cpu)) { 1155 if (pick_rt_task(rq, p, cpu)) {
1141 next = p; 1156 next = p;
1142 break; 1157 break;
@@ -1481,24 +1496,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1481 push_rt_tasks(rq); 1496 push_rt_tasks(rq);
1482} 1497}
1483 1498
1484static unsigned long
1485load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1486 unsigned long max_load_move,
1487 struct sched_domain *sd, enum cpu_idle_type idle,
1488 int *all_pinned, int *this_best_prio)
1489{
1490 /* don't touch RT tasks */
1491 return 0;
1492}
1493
1494static int
1495move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1496 struct sched_domain *sd, enum cpu_idle_type idle)
1497{
1498 /* don't touch RT tasks */
1499 return 0;
1500}
1501
1502static void set_cpus_allowed_rt(struct task_struct *p, 1499static void set_cpus_allowed_rt(struct task_struct *p,
1503 const struct cpumask *new_mask) 1500 const struct cpumask *new_mask)
1504{ 1501{
@@ -1670,8 +1667,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1670 if (!p->signal) 1667 if (!p->signal)
1671 return; 1668 return;
1672 1669
1673 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; 1670 /* max may change after cur was read, this will be fixed next tick */
1674 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; 1671 soft = task_rlimit(p, RLIMIT_RTTIME);
1672 hard = task_rlimit_max(p, RLIMIT_RTTIME);
1675 1673
1676 if (soft != RLIM_INFINITY) { 1674 if (soft != RLIM_INFINITY) {
1677 unsigned long next; 1675 unsigned long next;
@@ -1721,7 +1719,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1719 dequeue_pushable_task(rq, p);
1722} 1720}
1723 1721
1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 1722static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1723{
1726 /* 1724 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1725 * Time slice is 0 for SCHED_FIFO tasks
@@ -1746,8 +1744,6 @@ static const struct sched_class rt_sched_class = {
1746#ifdef CONFIG_SMP 1744#ifdef CONFIG_SMP
1747 .select_task_rq = select_task_rq_rt, 1745 .select_task_rq = select_task_rq_rt,
1748 1746
1749 .load_balance = load_balance_rt,
1750 .move_one_task = move_one_task_rt,
1751 .set_cpus_allowed = set_cpus_allowed_rt, 1747 .set_cpus_allowed = set_cpus_allowed_rt,
1752 .rq_online = rq_online_rt, 1748 .rq_online = rq_online_rt,
1753 .rq_offline = rq_offline_rt, 1749 .rq_offline = rq_offline_rt,
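The sched_rt.c hunks above thread a head flag down to __enqueue_rt_entity() so an entity can be requeued at the front of its priority list (list_add()) rather than at the tail (list_add_tail()). A toy userspace model of the head-versus-tail difference, with a plain array standing in for the kernel list API (entirely illustrative):

#include <stdio.h>

#define NQ 8

static int queue[NQ];
static int count;

static void add_tail(int task)
{
        queue[count++] = task;                  /* list_add_tail() analogue */
}

static void add_head(int task)
{
        for (int i = count; i > 0; i--)         /* list_add() analogue      */
                queue[i] = queue[i - 1];
        queue[0] = task;
        count++;
}

int main(void)
{
        add_tail(1);            /* ordinary wakeups go to the back  */
        add_tail(2);
        add_head(3);            /* e.g. a preempted task goes first */

        for (int i = 0; i < count; i++)
                printf("%d ", queue[i]);        /* prints: 3 1 2 */
        printf("\n");
        return 0;
}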
diff --git a/kernel/signal.c b/kernel/signal.c
index 934ae5e687b9..dbd7fe073c55 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -159,6 +159,10 @@ void recalc_sigpending(void)
159 159
160/* Given the mask, find the first available signal that should be serviced. */ 160/* Given the mask, find the first available signal that should be serviced. */
161 161
162#define SYNCHRONOUS_MASK \
163 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
164 sigmask(SIGTRAP) | sigmask(SIGFPE))
165
162int next_signal(struct sigpending *pending, sigset_t *mask) 166int next_signal(struct sigpending *pending, sigset_t *mask)
163{ 167{
164 unsigned long i, *s, *m, x; 168 unsigned long i, *s, *m, x;
@@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
166 170
167 s = pending->signal.sig; 171 s = pending->signal.sig;
168 m = mask->sig; 172 m = mask->sig;
173
174 /*
175 * Handle the first word specially: it contains the
176 * synchronous signals that need to be dequeued first.
177 */
178 x = *s &~ *m;
179 if (x) {
180 if (x & SYNCHRONOUS_MASK)
181 x &= SYNCHRONOUS_MASK;
182 sig = ffz(~x) + 1;
183 return sig;
184 }
185
169 switch (_NSIG_WORDS) { 186 switch (_NSIG_WORDS) {
170 default: 187 default:
171 for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) 188 for (i = 1; i < _NSIG_WORDS; ++i) {
172 if ((x = *s &~ *m) != 0) { 189 x = *++s &~ *++m;
173 sig = ffz(~x) + i*_NSIG_BPW + 1; 190 if (!x)
174 break; 191 continue;
175 } 192 sig = ffz(~x) + i*_NSIG_BPW + 1;
193 break;
194 }
176 break; 195 break;
177 196
178 case 2: if ((x = s[0] &~ m[0]) != 0) 197 case 2:
179 sig = 1; 198 x = s[1] &~ m[1];
180 else if ((x = s[1] &~ m[1]) != 0) 199 if (!x)
181 sig = _NSIG_BPW + 1;
182 else
183 break; 200 break;
184 sig += ffz(~x); 201 sig = ffz(~x) + _NSIG_BPW + 1;
185 break; 202 break;
186 203
187 case 1: if ((x = *s &~ *m) != 0) 204 case 1:
188 sig = ffz(~x) + 1; 205 /* Nothing to do */
189 break; 206 break;
190 } 207 }
191 208
@@ -228,7 +245,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
228 245
229 if (override_rlimit || 246 if (override_rlimit ||
230 atomic_read(&user->sigpending) <= 247 atomic_read(&user->sigpending) <=
231 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { 248 task_rlimit(t, RLIMIT_SIGPENDING)) {
232 q = kmem_cache_alloc(sigqueue_cachep, flags); 249 q = kmem_cache_alloc(sigqueue_cachep, flags);
233 } else { 250 } else {
234 print_dropped_signal(sig); 251 print_dropped_signal(sig);
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 7494bbf5a270..7d3f4fa9ef4f 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -637,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
637 goto cancelled; 637 goto cancelled;
638 638
639 /* the timer holds a reference whilst it is pending */ 639 /* the timer holds a reference whilst it is pending */
640 ret = work->ops->get_ref(work); 640 ret = slow_work_get_ref(work);
641 if (ret < 0) 641 if (ret < 0)
642 goto cant_get_ref; 642 goto cant_get_ref;
643 643
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
index 321f3c59d732..a29ebd1ef41d 100644
--- a/kernel/slow-work.h
+++ b/kernel/slow-work.h
@@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
43 */ 43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid) 44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{ 45{
46#ifdef CONFIG_SLOW_WORK_PROC 46#ifdef CONFIG_SLOW_WORK_DEBUG
47 slow_work_pids[id] = pid; 47 slow_work_pids[id] = pid;
48#endif 48#endif
49} 49}
50 50
51static inline void slow_work_mark_time(struct slow_work *work) 51static inline void slow_work_mark_time(struct slow_work *work)
52{ 52{
53#ifdef CONFIG_SLOW_WORK_PROC 53#ifdef CONFIG_SLOW_WORK_DEBUG
54 work->mark = CURRENT_TIME; 54 work->mark = CURRENT_TIME;
55#endif 55#endif
56} 56}
57 57
58static inline void slow_work_begin_exec(int id, struct slow_work *work) 58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{ 59{
60#ifdef CONFIG_SLOW_WORK_PROC 60#ifdef CONFIG_SLOW_WORK_DEBUG
61 slow_work_execs[id] = work; 61 slow_work_execs[id] = work;
62#endif 62#endif
63} 63}
64 64
65static inline void slow_work_end_exec(int id, struct slow_work *work) 65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{ 66{
67#ifdef CONFIG_SLOW_WORK_PROC 67#ifdef CONFIG_SLOW_WORK_DEBUG
68 write_lock(&slow_work_execs_lock); 68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL; 69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock); 70 write_unlock(&slow_work_execs_lock);
diff --git a/kernel/smp.c b/kernel/smp.c
index f10408422444..3fc697336183 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -9,11 +9,10 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/gfp.h>
12#include <linux/smp.h> 13#include <linux/smp.h>
13#include <linux/cpu.h> 14#include <linux/cpu.h>
14 15
15static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16
17static struct { 16static struct {
18 struct list_head queue; 17 struct list_head queue;
19 raw_spinlock_t lock; 18 raw_spinlock_t lock;
@@ -33,12 +32,14 @@ struct call_function_data {
33 cpumask_var_t cpumask; 32 cpumask_var_t cpumask;
34}; 33};
35 34
35static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
36
36struct call_single_queue { 37struct call_single_queue {
37 struct list_head list; 38 struct list_head list;
38 raw_spinlock_t lock; 39 raw_spinlock_t lock;
39}; 40};
40 41
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 42static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
42 43
43static int 44static int
44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 45hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -256,7 +257,7 @@ void generic_smp_call_function_single_interrupt(void)
256 } 257 }
257} 258}
258 259
259static DEFINE_PER_CPU(struct call_single_data, csd_data); 260static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
260 261
261/* 262/*
262 * smp_call_function_single - Run a function on a specific CPU 263 * smp_call_function_single - Run a function on a specific CPU
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a09502e2ef75..7c1a67ef0274 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill);
500 */ 500 */
501 501
502/* 502/*
503 * The trampoline is called when the hrtimer expires. If this is 503 * The trampoline is called when the hrtimer expires. It schedules a tasklet
504 * called from the hrtimer interrupt then we schedule the tasklet as 504 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
505 * the timer callback function expects to run in softirq context. If 505 * hrtimer callback, but from softirq context.
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */ 506 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 507static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{ 508{
511 struct tasklet_hrtimer *ttimer = 509 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer); 510 container_of(timer, struct tasklet_hrtimer, timer);
513 511
514 if (hrtimer_is_hres_active(timer)) { 512 tasklet_hi_schedule(&ttimer->tasklet);
515 tasklet_hi_schedule(&ttimer->tasklet); 513 return HRTIMER_NORESTART;
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519} 514}
520 515
521/* 516/*
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d22579087e27..4b493f67dcb5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock);
25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ 25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ 26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
28 29
29static int __read_mostly did_panic; 30static int __read_mostly did_panic;
30int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
@@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void)
79} 80}
80EXPORT_SYMBOL(touch_softlockup_watchdog); 81EXPORT_SYMBOL(touch_softlockup_watchdog);
81 82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
82void touch_all_softlockup_watchdogs(void) 89void touch_all_softlockup_watchdogs(void)
83{ 90{
84 int cpu; 91 int cpu;
@@ -118,6 +125,14 @@ void softlockup_tick(void)
118 } 125 }
119 126
120 if (touch_ts == 0) { 127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
121 __touch_softlockup_watchdog(); 136 __touch_softlockup_watchdog();
122 return; 137 return;
123 } 138 }
@@ -140,11 +155,11 @@ void softlockup_tick(void)
140 * Wake up the high-prio watchdog task twice per 155 * Wake up the high-prio watchdog task twice per
141 * threshold timespan. 156 * threshold timespan.
142 */ 157 */
143 if (now > touch_ts + softlockup_thresh/2) 158 if (time_after(now - softlockup_thresh/2, touch_ts))
144 wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); 159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
145 160
146 /* Warn about unreasonable delays: */ 161 /* Warn about unreasonable delays: */
147 if (now <= (touch_ts + softlockup_thresh)) 162 if (time_before_eq(now - softlockup_thresh, touch_ts))
148 return; 163 return;
149 164
150 per_cpu(softlockup_print_ts, this_cpu) = touch_ts; 165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
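The softlockup hunk above moves the threshold checks over to the jiffies-style wrap-safe comparisons. A tiny sketch of why time_after() keeps giving the right answer across a counter wrap (the values are chosen only to force the wrap):

#include <stdio.h>
#include <limits.h>

#define time_after(a, b)  ((long)((b) - (a)) < 0)

int main(void)
{
        unsigned long touch_ts = ULONG_MAX - 15;  /* just before the counter wraps  */
        unsigned long now      = 16;              /* 32 ticks later, after the wrap */

        /* A plain "now > touch_ts" would be false here; time_after() is not. */
        printf("wrapped but later: %d\n", time_after(now, touch_ts));  /* prints 1 */
        return 0;
}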
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 818d7d9aa03c..2980da3fd509 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -30,10 +30,33 @@
30#include <linux/preempt.h> 30#include <linux/preempt.h>
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/slab.h>
34#include <linux/smp.h> 33#include <linux/smp.h>
35#include <linux/srcu.h> 34#include <linux/srcu.h>
36 35
36static int init_srcu_struct_fields(struct srcu_struct *sp)
37{
38 sp->completed = 0;
39 mutex_init(&sp->mutex);
40 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
41 return sp->per_cpu_ref ? 0 : -ENOMEM;
42}
43
44#ifdef CONFIG_DEBUG_LOCK_ALLOC
45
46int __init_srcu_struct(struct srcu_struct *sp, const char *name,
47 struct lock_class_key *key)
48{
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 /* Don't re-initialize a lock while it is held. */
51 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
52 lockdep_init_map(&sp->dep_map, name, key, 0);
53#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
54 return init_srcu_struct_fields(sp);
55}
56EXPORT_SYMBOL_GPL(__init_srcu_struct);
57
58#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
59
37/** 60/**
38 * init_srcu_struct - initialize a sleep-RCU structure 61 * init_srcu_struct - initialize a sleep-RCU structure
39 * @sp: structure to initialize. 62 * @sp: structure to initialize.
@@ -44,13 +67,12 @@
44 */ 67 */
45int init_srcu_struct(struct srcu_struct *sp) 68int init_srcu_struct(struct srcu_struct *sp)
46{ 69{
47 sp->completed = 0; 70 return init_srcu_struct_fields(sp);
48 mutex_init(&sp->mutex);
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 71}
52EXPORT_SYMBOL_GPL(init_srcu_struct); 72EXPORT_SYMBOL_GPL(init_srcu_struct);
53 73
74#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
75
54/* 76/*
55 * srcu_readers_active_idx -- returns approximate number of readers 77 * srcu_readers_active_idx -- returns approximate number of readers
56 * active on the specified rank of per-CPU counters. 78 * active on the specified rank of per-CPU counters.
@@ -100,15 +122,12 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
100} 122}
101EXPORT_SYMBOL_GPL(cleanup_srcu_struct); 123EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
102 124
103/** 125/*
104 * srcu_read_lock - register a new reader for an SRCU-protected structure.
105 * @sp: srcu_struct in which to register the new reader.
106 *
107 * Counts the new reader in the appropriate per-CPU element of the 126 * Counts the new reader in the appropriate per-CPU element of the
108 * srcu_struct. Must be called from process context. 127 * srcu_struct. Must be called from process context.
109 * Returns an index that must be passed to the matching srcu_read_unlock(). 128 * Returns an index that must be passed to the matching srcu_read_unlock().
110 */ 129 */
111int srcu_read_lock(struct srcu_struct *sp) 130int __srcu_read_lock(struct srcu_struct *sp)
112{ 131{
113 int idx; 132 int idx;
114 133
@@ -120,31 +139,27 @@ int srcu_read_lock(struct srcu_struct *sp)
120 preempt_enable(); 139 preempt_enable();
121 return idx; 140 return idx;
122} 141}
123EXPORT_SYMBOL_GPL(srcu_read_lock); 142EXPORT_SYMBOL_GPL(__srcu_read_lock);
124 143
125/** 144/*
126 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
127 * @sp: srcu_struct in which to unregister the old reader.
128 * @idx: return value from corresponding srcu_read_lock().
129 *
130 * Removes the count for the old reader from the appropriate per-CPU 145 * Removes the count for the old reader from the appropriate per-CPU
131 * element of the srcu_struct. Note that this may well be a different 146 * element of the srcu_struct. Note that this may well be a different
132 * CPU than that which was incremented by the corresponding srcu_read_lock(). 147 * CPU than that which was incremented by the corresponding srcu_read_lock().
133 * Must be called from process context. 148 * Must be called from process context.
134 */ 149 */
135void srcu_read_unlock(struct srcu_struct *sp, int idx) 150void __srcu_read_unlock(struct srcu_struct *sp, int idx)
136{ 151{
137 preempt_disable(); 152 preempt_disable();
138 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 153 srcu_barrier(); /* ensure compiler won't misorder critical section. */
139 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 154 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
140 preempt_enable(); 155 preempt_enable();
141} 156}
142EXPORT_SYMBOL_GPL(srcu_read_unlock); 157EXPORT_SYMBOL_GPL(__srcu_read_unlock);
143 158
144/* 159/*
145 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 160 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
146 */ 161 */
147void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 162static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
148{ 163{
149 int idx; 164 int idx;
150 165
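The srcu.c changes above split the read side into __srcu_read_lock()/__srcu_read_unlock() fast paths that count readers against the currently active index. A single-threaded toy model of that two-index counting; per-CPU distribution, barriers and the grace-period flip are omitted and all names are illustrative:

#include <stdio.h>

struct toy_srcu {
        int completed;          /* low bit selects the active index */
        int c[2];               /* readers counted per index        */
};

static int toy_read_lock(struct toy_srcu *sp)
{
        int idx = sp->completed & 0x1;
        sp->c[idx]++;
        return idx;             /* caller hands this back to unlock */
}

static void toy_read_unlock(struct toy_srcu *sp, int idx)
{
        sp->c[idx]--;
}

int main(void)
{
        struct toy_srcu sp = { 0, { 0, 0 } };
        int idx = toy_read_lock(&sp);

        printf("readers on idx %d: %d\n", idx, sp.c[idx]);  /* 1 */
        toy_read_unlock(&sp, idx);
        printf("readers on idx %d: %d\n", idx, sp.c[idx]);  /* 0 */
        return 0;
}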
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 912823e2a11b..9bb9fb1bd79c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -45,7 +45,7 @@ static int refcount;
45static struct workqueue_struct *stop_machine_wq; 45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle; 46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus; 47static const struct cpumask *active_cpus;
48static void *stop_machine_work; 48static void __percpu *stop_machine_work;
49 49
50static void set_state(enum stopmachine_state newstate) 50static void set_state(enum stopmachine_state newstate)
51{ 51{
diff --git a/kernel/sys.c b/kernel/sys.c
index 26a6b73a6b85..7cb426a58965 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,8 +33,10 @@
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/personality.h>
36#include <linux/ptrace.h> 37#include <linux/ptrace.h>
37#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/gfp.h>
38 40
39#include <linux/compat.h> 41#include <linux/compat.h>
40#include <linux/syscalls.h> 42#include <linux/syscalls.h>
@@ -222,6 +224,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
222 if (which > PRIO_USER || which < PRIO_PROCESS) 224 if (which > PRIO_USER || which < PRIO_PROCESS)
223 return -EINVAL; 225 return -EINVAL;
224 226
227 rcu_read_lock();
225 read_lock(&tasklist_lock); 228 read_lock(&tasklist_lock);
226 switch (which) { 229 switch (which) {
227 case PRIO_PROCESS: 230 case PRIO_PROCESS:
@@ -267,6 +270,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
267 } 270 }
268out_unlock: 271out_unlock:
269 read_unlock(&tasklist_lock); 272 read_unlock(&tasklist_lock);
273 rcu_read_unlock();
270 274
271 return retval; 275 return retval;
272} 276}
@@ -569,13 +573,7 @@ static int set_user(struct cred *new)
569 if (!new_user) 573 if (!new_user)
570 return -EAGAIN; 574 return -EAGAIN;
571 575
572 if (!task_can_switch_user(new_user, current)) { 576 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
573 free_uid(new_user);
574 return -EINVAL;
575 }
576
577 if (atomic_read(&new_user->processes) >=
578 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
579 new_user != INIT_USER) { 577 new_user != INIT_USER) {
580 free_uid(new_user); 578 free_uid(new_user);
581 return -EAGAIN; 579 return -EAGAIN;
@@ -1118,6 +1116,15 @@ out:
1118 1116
1119DECLARE_RWSEM(uts_sem); 1117DECLARE_RWSEM(uts_sem);
1120 1118
1119#ifdef COMPAT_UTS_MACHINE
1120#define override_architecture(name) \
1121 (personality(current->personality) == PER_LINUX32 && \
1122 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
1123 sizeof(COMPAT_UTS_MACHINE)))
1124#else
1125#define override_architecture(name) 0
1126#endif
1127
1121SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1128SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1122{ 1129{
1123 int errno = 0; 1130 int errno = 0;
@@ -1126,9 +1133,66 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1126 if (copy_to_user(name, utsname(), sizeof *name)) 1133 if (copy_to_user(name, utsname(), sizeof *name))
1127 errno = -EFAULT; 1134 errno = -EFAULT;
1128 up_read(&uts_sem); 1135 up_read(&uts_sem);
1136
1137 if (!errno && override_architecture(name))
1138 errno = -EFAULT;
1129 return errno; 1139 return errno;
1130} 1140}
1131 1141
1142#ifdef __ARCH_WANT_SYS_OLD_UNAME
1143/*
1144 * Old cruft
1145 */
1146SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
1147{
1148 int error = 0;
1149
1150 if (!name)
1151 return -EFAULT;
1152
1153 down_read(&uts_sem);
1154 if (copy_to_user(name, utsname(), sizeof(*name)))
1155 error = -EFAULT;
1156 up_read(&uts_sem);
1157
1158 if (!error && override_architecture(name))
1159 error = -EFAULT;
1160 return error;
1161}
1162
1163SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
1164{
1165 int error;
1166
1167 if (!name)
1168 return -EFAULT;
1169 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
1170 return -EFAULT;
1171
1172 down_read(&uts_sem);
1173 error = __copy_to_user(&name->sysname, &utsname()->sysname,
1174 __OLD_UTS_LEN);
1175 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
1176 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
1177 __OLD_UTS_LEN);
1178 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
1179 error |= __copy_to_user(&name->release, &utsname()->release,
1180 __OLD_UTS_LEN);
1181 error |= __put_user(0, name->release + __OLD_UTS_LEN);
1182 error |= __copy_to_user(&name->version, &utsname()->version,
1183 __OLD_UTS_LEN);
1184 error |= __put_user(0, name->version + __OLD_UTS_LEN);
1185 error |= __copy_to_user(&name->machine, &utsname()->machine,
1186 __OLD_UTS_LEN);
1187 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
1188 up_read(&uts_sem);
1189
1190 if (!error && override_architecture(name))
1191 error = -EFAULT;
1192 return error ? -EFAULT : 0;
1193}
1194#endif
1195
1132SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) 1196SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1133{ 1197{
1134 int errno; 1198 int errno;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 695384f12a7d..70f2ea758ffe 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -126,6 +126,7 @@ cond_syscall(sys_setreuid16);
126cond_syscall(sys_setuid16); 126cond_syscall(sys_setuid16);
127cond_syscall(sys_vm86old); 127cond_syscall(sys_vm86old);
128cond_syscall(sys_vm86); 128cond_syscall(sys_vm86);
129cond_syscall(sys_ipc);
129cond_syscall(compat_sys_ipc); 130cond_syscall(compat_sys_ipc);
130cond_syscall(compat_sys_sysctl); 131cond_syscall(compat_sys_sysctl);
131cond_syscall(sys_flock); 132cond_syscall(sys_flock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ac72c9e6bd9b..a38af430f0d8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h>
26#include <linux/proc_fs.h> 27#include <linux/proc_fs.h>
27#include <linux/security.h> 28#include <linux/security.h>
28#include <linux/ctype.h> 29#include <linux/ctype.h>
@@ -50,6 +51,7 @@
50#include <linux/ftrace.h> 51#include <linux/ftrace.h>
51#include <linux/slow-work.h> 52#include <linux/slow-work.h>
52#include <linux/perf_event.h> 53#include <linux/perf_event.h>
54#include <linux/kprobes.h>
53 55
54#include <asm/uaccess.h> 56#include <asm/uaccess.h>
55#include <asm/processor.h> 57#include <asm/processor.h>
@@ -59,6 +61,18 @@
59#include <asm/stacktrace.h> 61#include <asm/stacktrace.h>
60#include <asm/io.h> 62#include <asm/io.h>
61#endif 63#endif
64#ifdef CONFIG_BSD_PROCESS_ACCT
65#include <linux/acct.h>
66#endif
67#ifdef CONFIG_RT_MUTEXES
68#include <linux/rtmutex.h>
69#endif
70#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
71#include <linux/lockdep.h>
72#endif
73#ifdef CONFIG_CHR_DEV_SG
74#include <scsi/sg.h>
75#endif
62 76
63#ifdef CONFIG_NMI_WATCHDOG 77#ifdef CONFIG_NMI_WATCHDOG
64#include <linux/nmi.h> 78#include <linux/nmi.h>
@@ -68,8 +82,6 @@
68#if defined(CONFIG_SYSCTL) 82#if defined(CONFIG_SYSCTL)
69 83
70/* External variables not in a header file. */ 84/* External variables not in a header file. */
71extern int C_A_D;
72extern int print_fatal_signals;
73extern int sysctl_overcommit_memory; 85extern int sysctl_overcommit_memory;
74extern int sysctl_overcommit_ratio; 86extern int sysctl_overcommit_ratio;
75extern int sysctl_panic_on_oom; 87extern int sysctl_panic_on_oom;
@@ -91,9 +103,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
91#ifndef CONFIG_MMU 103#ifndef CONFIG_MMU
92extern int sysctl_nr_trim_pages; 104extern int sysctl_nr_trim_pages;
93#endif 105#endif
94#ifdef CONFIG_RCU_TORTURE_TEST
95extern int rcutorture_runnable;
96#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
97#ifdef CONFIG_BLOCK 106#ifdef CONFIG_BLOCK
98extern int blk_iopoll_enabled; 107extern int blk_iopoll_enabled;
99#endif 108#endif
@@ -123,14 +132,6 @@ static int min_percpu_pagelist_fract = 8;
123 132
124static int ngroups_max = NGROUPS_MAX; 133static int ngroups_max = NGROUPS_MAX;
125 134
126#ifdef CONFIG_MODULES
127extern char modprobe_path[];
128extern int modules_disabled;
129#endif
130#ifdef CONFIG_CHR_DEV_SG
131extern int sg_big_buff;
132#endif
133
134#ifdef CONFIG_SPARC 135#ifdef CONFIG_SPARC
135#include <asm/system.h> 136#include <asm/system.h>
136#endif 137#endif
@@ -152,10 +153,6 @@ extern int sysctl_userprocess_debug;
152extern int spin_retry; 153extern int spin_retry;
153#endif 154#endif
154 155
155#ifdef CONFIG_BSD_PROCESS_ACCT
156extern int acct_parm[];
157#endif
158
159#ifdef CONFIG_IA64 156#ifdef CONFIG_IA64
160extern int no_unaligned_warning; 157extern int no_unaligned_warning;
161extern int unaligned_dump_stack; 158extern int unaligned_dump_stack;
@@ -163,10 +160,6 @@ extern int unaligned_dump_stack;
163 160
164extern struct ratelimit_state printk_ratelimit_state; 161extern struct ratelimit_state printk_ratelimit_state;
165 162
166#ifdef CONFIG_RT_MUTEXES
167extern int max_lock_depth;
168#endif
169
170#ifdef CONFIG_PROC_SYSCTL 163#ifdef CONFIG_PROC_SYSCTL
171static int proc_do_cad_pid(struct ctl_table *table, int write, 164static int proc_do_cad_pid(struct ctl_table *table, int write,
172 void __user *buffer, size_t *lenp, loff_t *ppos); 165 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -205,9 +198,6 @@ extern struct ctl_table epoll_table[];
205int sysctl_legacy_va_layout; 198int sysctl_legacy_va_layout;
206#endif 199#endif
207 200
208extern int prove_locking;
209extern int lock_stat;
210
211/* The default sysctl tables: */ 201/* The default sysctl tables: */
212 202
213static struct ctl_table root_table[] = { 203static struct ctl_table root_table[] = {
@@ -1454,7 +1444,7 @@ static struct ctl_table fs_table[] = {
1454}; 1444};
1455 1445
1456static struct ctl_table debug_table[] = { 1446static struct ctl_table debug_table[] = {
1457#if defined(CONFIG_X86) || defined(CONFIG_PPC) 1447#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC)
1458 { 1448 {
1459 .procname = "exception-trace", 1449 .procname = "exception-trace",
1460 .data = &show_unhandled_signals, 1450 .data = &show_unhandled_signals,
@@ -1463,6 +1453,17 @@ static struct ctl_table debug_table[] = {
1463 .proc_handler = proc_dointvec 1453 .proc_handler = proc_dointvec
1464 }, 1454 },
1465#endif 1455#endif
1456#if defined(CONFIG_OPTPROBES)
1457 {
1458 .procname = "kprobes-optimization",
1459 .data = &sysctl_kprobes_optimization,
1460 .maxlen = sizeof(int),
1461 .mode = 0644,
1462 .proc_handler = proc_kprobes_optimization_handler,
1463 .extra1 = &zero,
1464 .extra2 = &one,
1465 },
1466#endif
1466 { } 1467 { }
1467}; 1468};
1468 1469
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 8f5d16e0707a..59030570f5ca 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -13,6 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/netdevice.h> 15#include <linux/netdevice.h>
16#include <linux/slab.h>
16 17
17#ifdef CONFIG_SYSCTL_SYSCALL 18#ifdef CONFIG_SYSCTL_SYSCALL
18 19
@@ -1331,7 +1332,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1331 ssize_t result; 1332 ssize_t result;
1332 char *pathname; 1333 char *pathname;
1333 int flags; 1334 int flags;
1334 int acc_mode, fmode; 1335 int acc_mode;
1335 1336
1336 pathname = sysctl_getname(name, nlen, &table); 1337 pathname = sysctl_getname(name, nlen, &table);
1337 result = PTR_ERR(pathname); 1338 result = PTR_ERR(pathname);
@@ -1342,15 +1343,12 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1342 if (oldval && oldlen && newval && newlen) { 1343 if (oldval && oldlen && newval && newlen) {
1343 flags = O_RDWR; 1344 flags = O_RDWR;
1344 acc_mode = MAY_READ | MAY_WRITE; 1345 acc_mode = MAY_READ | MAY_WRITE;
1345 fmode = FMODE_READ | FMODE_WRITE;
1346 } else if (newval && newlen) { 1346 } else if (newval && newlen) {
1347 flags = O_WRONLY; 1347 flags = O_WRONLY;
1348 acc_mode = MAY_WRITE; 1348 acc_mode = MAY_WRITE;
1349 fmode = FMODE_WRITE;
1350 } else if (oldval && oldlen) { 1349 } else if (oldval && oldlen) {
1351 flags = O_RDONLY; 1350 flags = O_RDONLY;
1352 acc_mode = MAY_READ; 1351 acc_mode = MAY_READ;
1353 fmode = FMODE_READ;
1354 } else { 1352 } else {
1355 result = 0; 1353 result = 0;
1356 goto out_putname; 1354 goto out_putname;
@@ -1361,7 +1359,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1361 if (result) 1359 if (result)
1362 goto out_putname; 1360 goto out_putname;
1363 1361
1364 result = may_open(&nd.path, acc_mode, fmode); 1362 result = may_open(&nd.path, acc_mode, flags);
1365 if (result) 1363 if (result)
1366 goto out_putpath; 1364 goto out_putpath;
1367 1365
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea8384d3caa7..11281d5792bd 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -22,6 +22,7 @@
22#include <linux/delayacct.h> 22#include <linux/delayacct.h>
23#include <linux/cpumask.h> 23#include <linux/cpumask.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/slab.h>
25#include <linux/cgroupstats.h> 26#include <linux/cgroupstats.h>
26#include <linux/cgroup.h> 27#include <linux/cgroup.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
@@ -46,15 +47,13 @@ static struct genl_family family = {
46 .maxattr = TASKSTATS_CMD_ATTR_MAX, 47 .maxattr = TASKSTATS_CMD_ATTR_MAX,
47}; 48};
48 49
49static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] 50static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
50__read_mostly = {
51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
55 55
56static struct nla_policy 56static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
57cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 57 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
59}; 58};
60 59
diff --git a/kernel/time.c b/kernel/time.c
index 804798005d19..656dccfe1cbb 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -35,7 +35,6 @@
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h>
39#include <linux/math64.h> 38#include <linux/math64.h>
40#include <linux/ptrace.h> 39#include <linux/ptrace.h>
41 40
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e85c23404d34..1f5dde637457 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void)
343{ 343{
344 unsigned long flags; 344 unsigned long flags;
345 345
346 spin_lock_irqsave(&watchdog_lock, flags); 346 /*
347 * We use trylock here to avoid a potential deadlock when
348 * kgdb calls this code after the kernel has been stopped with
349 * watchdog_lock held. When watchdog_lock is held we just
350 * return and accept that the watchdog might trigger and mark
351 * the monitored clock source (usually TSC) unstable.
352 *
353 * This does not affect the other caller clocksource_resume()
354 * because at this point the kernel is UP, interrupts are
355 * disabled and nothing can hold watchdog_lock.
356 */
357 if (!spin_trylock_irqsave(&watchdog_lock, flags))
358 return;
347 clocksource_reset_watchdog(); 359 clocksource_reset_watchdog();
348 spin_unlock_irqrestore(&watchdog_lock, flags); 360 spin_unlock_irqrestore(&watchdog_lock, flags);
349} 361}
@@ -441,6 +453,18 @@ static inline int clocksource_watchdog_kthread(void *data) { return 0; }
441#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 453#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
442 454
443/** 455/**
456 * clocksource_suspend - suspend the clocksource(s)
457 */
458void clocksource_suspend(void)
459{
460 struct clocksource *cs;
461
462 list_for_each_entry_reverse(cs, &clocksource_list, list)
463 if (cs->suspend)
464 cs->suspend(cs);
465}
466
467/**
444 * clocksource_resume - resume the clocksource(s) 468 * clocksource_resume - resume the clocksource(s)
445 */ 469 */
446void clocksource_resume(void) 470void clocksource_resume(void)
@@ -449,7 +473,7 @@ void clocksource_resume(void)
449 473
450 list_for_each_entry(cs, &clocksource_list, list) 474 list_for_each_entry(cs, &clocksource_list, list)
451 if (cs->resume) 475 if (cs->resume)
452 cs->resume(); 476 cs->resume(cs);
453 477
454 clocksource_resume_watchdog(); 478 clocksource_resume_watchdog();
455} 479}
@@ -458,8 +482,8 @@ void clocksource_resume(void)
458 * clocksource_touch_watchdog - Update watchdog 482 * clocksource_touch_watchdog - Update watchdog
459 * 483 *
460 * Update the watchdog after exception contexts such as kgdb so as not 484 * Update the watchdog after exception contexts such as kgdb so as not
461 * to incorrectly trip the watchdog. 485 * to incorrectly trip the watchdog. This might fail when the kernel
462 * 486 * was stopped in code which holds watchdog_lock.
463 */ 487 */
464void clocksource_touch_watchdog(void) 488void clocksource_touch_watchdog(void)
465{ 489{
@@ -568,6 +592,10 @@ static inline void clocksource_select(void) { }
568 */ 592 */
569static int __init clocksource_done_booting(void) 593static int __init clocksource_done_booting(void)
570{ 594{
595 mutex_lock(&clocksource_mutex);
596 curr_clocksource = clocksource_default_clock();
597 mutex_unlock(&clocksource_mutex);
598
571 finished_booting = 1; 599 finished_booting = 1;
572 600
573 /* 601 /*
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4800f933910e..7c0f180d6e9d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -58,10 +58,10 @@ static s64 time_offset;
58static long time_constant = 2; 58static long time_constant = 2;
59 59
60/* maximum error (usecs): */ 60/* maximum error (usecs): */
61long time_maxerror = NTP_PHASE_LIMIT; 61static long time_maxerror = NTP_PHASE_LIMIT;
62 62
63/* estimated error (usecs): */ 63/* estimated error (usecs): */
64long time_esterror = NTP_PHASE_LIMIT; 64static long time_esterror = NTP_PHASE_LIMIT;
65 65
66/* frequency offset (scaled nsecs/secs): */ 66/* frequency offset (scaled nsecs/secs): */
67static s64 time_freq; 67static s64 time_freq;
@@ -142,11 +142,11 @@ static void ntp_update_offset(long offset)
142 * Select how the frequency is to be controlled 142 * Select how the frequency is to be controlled
143 * and in which mode (PLL or FLL). 143 * and in which mode (PLL or FLL).
144 */ 144 */
145 secs = xtime.tv_sec - time_reftime; 145 secs = get_seconds() - time_reftime;
146 if (unlikely(time_status & STA_FREQHOLD)) 146 if (unlikely(time_status & STA_FREQHOLD))
147 secs = 0; 147 secs = 0;
148 148
149 time_reftime = xtime.tv_sec; 149 time_reftime = get_seconds();
150 150
151 offset64 = offset; 151 offset64 = offset;
152 freq_adj = (offset64 * secs) << 152 freq_adj = (offset64 * secs) <<
@@ -368,7 +368,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
368 * reference time to current time. 368 * reference time to current time.
369 */ 369 */
370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) 370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
371 time_reftime = xtime.tv_sec; 371 time_reftime = get_seconds();
372 372
373 /* only set allowed bits */ 373 /* only set allowed bits */
374 time_status &= STA_RONLY; 374 time_status &= STA_RONLY;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 0a8a213016f0..aada0e52680a 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -22,6 +22,29 @@
22 22
23#include "tick-internal.h" 23#include "tick-internal.h"
24 24
25/* Limit min_delta to a jiffie */
26#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
27
28static int tick_increase_min_delta(struct clock_event_device *dev)
29{
30 /* Nothing to do if we already reached the limit */
31 if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
32 return -ETIME;
33
34 if (dev->min_delta_ns < 5000)
35 dev->min_delta_ns = 5000;
36 else
37 dev->min_delta_ns += dev->min_delta_ns >> 1;
38
39 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
40 dev->min_delta_ns = MIN_DELTA_LIMIT;
41
42 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
43 dev->name ? dev->name : "?",
44 (unsigned long long) dev->min_delta_ns);
45 return 0;
46}
47
25/** 48/**
26 * tick_program_event internal worker function 49 * tick_program_event internal worker function
27 */ 50 */
@@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
37 if (!ret || !force) 60 if (!ret || !force)
38 return ret; 61 return ret;
39 62
63 dev->retries++;
40 /* 64 /*
41 * We tried 2 times to program the device with the given 65 * We tried 3 times to program the device with the given
42 * min_delta_ns. If that's not working then we double it 66 * min_delta_ns. If that's not working then we increase it
43 * and emit a warning. 67 * and emit a warning.
44 */ 68 */
45 if (++i > 2) { 69 if (++i > 2) {
46 /* Increase the min. delta and try again */ 70 /* Increase the min. delta and try again */
47 if (!dev->min_delta_ns) 71 if (tick_increase_min_delta(dev)) {
48 dev->min_delta_ns = 5000; 72 /*
49 else 73 * Get out of the loop if min_delta_ns
50 dev->min_delta_ns += dev->min_delta_ns >> 1; 74 * hit the limit already. That's
51 75 * better than staying here forever.
52 printk(KERN_WARNING 76 *
53 "CE: %s increasing min_delta_ns to %llu nsec\n", 77 * We clear next_event so we have a
54 dev->name ? dev->name : "?", 78 * chance that the box survives.
55 (unsigned long long) dev->min_delta_ns << 1); 79 */
56 80 printk(KERN_WARNING
81 "CE: Reprogramming failure. Giving up\n");
82 dev->next_event.tv64 = KTIME_MAX;
83 return -ETIME;
84 }
57 i = 0; 85 i = 0;
58 } 86 }
59 87
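tick_increase_min_delta() above backs the programmable minimum off in 50% steps, starting at 5000 ns and capped at one jiffy. A standalone sketch that prints the resulting escalation sequence; the HZ value is an assumption, not taken from the patch:

#include <stdio.h>

#define NSEC_PER_SEC    1000000000ULL
#define HZ              250
#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)

int main(void)
{
        unsigned long long min_delta_ns = 0;

        while (min_delta_ns < MIN_DELTA_LIMIT) {
                if (min_delta_ns < 5000)
                        min_delta_ns = 5000;                /* first escalation   */
                else
                        min_delta_ns += min_delta_ns >> 1;  /* grow by 50%        */
                if (min_delta_ns > MIN_DELTA_LIMIT)
                        min_delta_ns = MIN_DELTA_LIMIT;     /* clamp to one jiffy */
                printf("min_delta_ns = %llu\n", min_delta_ns);
        }
        return 0;
}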
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 12f5c55090be..ac38fbb176cc 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/timecompare.h> 20#include <linux/timecompare.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/slab.h>
22#include <linux/math64.h> 23#include <linux/math64.h>
23 24
24/* 25/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7faaa32fbf4f..39f6177fafac 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -622,6 +622,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
622 write_sequnlock_irqrestore(&xtime_lock, flags); 622 write_sequnlock_irqrestore(&xtime_lock, flags);
623 623
624 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 624 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
625 clocksource_suspend();
625 626
626 return 0; 627 return 0;
627} 628}
@@ -817,7 +818,8 @@ void update_wall_time(void)
817 shift = min(shift, maxshift); 818 shift = min(shift, maxshift);
818 while (offset >= timekeeper.cycle_interval) { 819 while (offset >= timekeeper.cycle_interval) {
819 offset = logarithmic_accumulation(offset, shift); 820 offset = logarithmic_accumulation(offset, shift);
820 shift--; 821 if(offset < timekeeper.cycle_interval<<shift)
822 shift--;
821 } 823 }
822 824
823 /* correct the clock when NTP error is too big */ 825 /* correct the clock when NTP error is too big */
@@ -880,6 +882,7 @@ void getboottime(struct timespec *ts)
880 882
881 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 883 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
882} 884}
885EXPORT_SYMBOL_GPL(getboottime);
883 886
884/** 887/**
885 * monotonic_to_bootbased - Convert the monotonic time to boot based. 888 * monotonic_to_bootbased - Convert the monotonic time to boot based.
@@ -889,6 +892,7 @@ void monotonic_to_bootbased(struct timespec *ts)
889{ 892{
890 *ts = timespec_add_safe(*ts, total_sleep_time); 893 *ts = timespec_add_safe(*ts, total_sleep_time);
891} 894}
895EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
892 896
893unsigned long get_seconds(void) 897unsigned long get_seconds(void)
894{ 898{
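The one-line change to update_wall_time() above only lowers `shift` once the remaining offset no longer covers a whole `cycle_interval << shift` chunk, so a large backlog (for instance after a long suspend) keeps being consumed in the biggest chunks available instead of degenerating into many small steps. A toy model of that loop; accumulate() and its arguments are invented names, and logarithmic_accumulation() is reduced to a plain subtraction:

#include <stdio.h>

static unsigned long long accumulate(unsigned long long offset,
				     unsigned long long cycle_interval,
				     int shift)
{
	while (offset >= cycle_interval) {
		unsigned long long chunk = cycle_interval << shift;

		if (offset >= chunk)
			offset -= chunk;	/* stands in for logarithmic_accumulation() */
		if (offset < chunk && shift > 0)
			shift--;		/* the guarded decrement from the hunk above */
	}
	return offset;
}

int main(void)
{
	/* 10000 cycle_intervals of backlog are consumed in a handful of passes. */
	printf("left over: %llu\n", accumulate(1000000, 100, 8));
	return 0;
}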
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index bdfb8dd1050c..1a4a7dd78777 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -228,6 +228,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
228 SEQ_printf(m, " event_handler: "); 228 SEQ_printf(m, " event_handler: ");
229 print_name_offset(m, dev->event_handler); 229 print_name_offset(m, dev->event_handler);
230 SEQ_printf(m, "\n"); 230 SEQ_printf(m, "\n");
231 SEQ_printf(m, " retries: %lu\n", dev->retries);
231} 232}
232 233
233static void timer_list_show_tickdevices(struct seq_file *m) 234static void timer_list_show_tickdevices(struct seq_file *m)
@@ -257,7 +258,7 @@ static int timer_list_show(struct seq_file *m, void *v)
257 u64 now = ktime_to_ns(ktime_get()); 258 u64 now = ktime_to_ns(ktime_get());
258 int cpu; 259 int cpu;
259 260
260 SEQ_printf(m, "Timer List Version: v0.5\n"); 261 SEQ_printf(m, "Timer List Version: v0.6\n");
261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 262 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 263 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
263 264
diff --git a/kernel/timer.c b/kernel/timer.c
index c61a7949387f..aeb6a54f2771 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_event.h> 40#include <linux/perf_event.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/slab.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44#include <asm/unistd.h> 45#include <asm/unistd.h>
@@ -880,6 +881,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
880 if (base->running_timer == timer) 881 if (base->running_timer == timer)
881 goto out; 882 goto out;
882 883
884 timer_stats_timer_clear_start_info(timer);
883 ret = 0; 885 ret = 0;
884 if (timer_pending(timer)) { 886 if (timer_pending(timer)) {
885 detach_timer(timer, 1); 887 detach_timer(timer, 1);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 6c22d8a2f289..13e13d428cd3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -27,9 +27,7 @@ config HAVE_FUNCTION_GRAPH_TRACER
27config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
28 bool 28 bool
29 help 29 help
30 An arch may pass in a unique value (frame pointer) to both the 30 See Documentation/trace/ftrace-design.txt
31 entering and exiting of a function. On exit, the value is compared
32 and if it does not match, then it will panic the kernel.
33 31
34config HAVE_FUNCTION_TRACE_MCOUNT_TEST 32config HAVE_FUNCTION_TRACE_MCOUNT_TEST
35 bool 33 bool
@@ -330,15 +328,6 @@ config BRANCH_TRACER
330 328
331 Say N if unsure. 329 Say N if unsure.
332 330
333config POWER_TRACER
334 bool "Trace power consumption behavior"
335 depends on X86
336 select GENERIC_TRACER
337 help
338 This tracer helps developers to analyze and optimize the kernel's
339 power management decisions, specifically the C-state and P-state
340 behavior.
341
342config KSYM_TRACER 331config KSYM_TRACER
343 bool "Trace read and write access on kernel memory locations" 332 bool "Trace read and write access on kernel memory locations"
344 depends on HAVE_HW_BREAKPOINT 333 depends on HAVE_HW_BREAKPOINT
@@ -451,7 +440,7 @@ config BLK_DEV_IO_TRACE
451 440
452config KPROBE_EVENT 441config KPROBE_EVENT
453 depends on KPROBES 442 depends on KPROBES
454 depends on X86 443 depends on HAVE_REGS_AND_STACK_ACCESS_API
455 bool "Enable kprobes-based dynamic events" 444 bool "Enable kprobes-based dynamic events"
456 select TRACING 445 select TRACING
457 default y 446 default y
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d00c6fe23f54..78edc6490038 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events.o
52obj-$(CONFIG_EVENT_TRACING) += trace_export.o 52obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54ifeq ($(CONFIG_PERF_EVENTS),y) 54ifeq ($(CONFIG_PERF_EVENTS),y)
55obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o 55obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
56endif 56endif
57obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 57obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
58obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 58obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d9d6206e0b14..b3bc91a3f510 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -21,6 +21,7 @@
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h>
24#include <linux/debugfs.h> 25#include <linux/debugfs.h>
25#include <linux/smp_lock.h> 26#include <linux/smp_lock.h>
26#include <linux/time.h> 27#include <linux/time.h>
@@ -540,9 +541,10 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
540 if (ret) 541 if (ret)
541 return ret; 542 return ret;
542 543
543 if (copy_to_user(arg, &buts, sizeof(buts))) 544 if (copy_to_user(arg, &buts, sizeof(buts))) {
545 blk_trace_remove(q);
544 return -EFAULT; 546 return -EFAULT;
545 547 }
546 return 0; 548 return 0;
547} 549}
548EXPORT_SYMBOL_GPL(blk_trace_setup); 550EXPORT_SYMBOL_GPL(blk_trace_setup);
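The blk_trace_setup() fix above tears the freshly created trace back down when the settings cannot be copied back to user space, instead of returning -EFAULT and leaving an orphaned trace attached to the queue. The general shape of that error path as a standalone sketch; do_setup, copy_out and do_teardown are invented stand-ins for do_blk_trace_setup(), copy_to_user() and blk_trace_remove():

#include <errno.h>
#include <stdio.h>

struct cfg { int id; };

static int  do_setup(struct cfg *c)    { c->id = 42; return 0; }
static void do_teardown(struct cfg *c) { c->id = 0; }
static int  copy_out(struct cfg *c)    { (void)c; return 1; /* simulate a fault */ }

/* Undo the completed setup step when a later step fails, so no state leaks. */
static int setup_and_report(struct cfg *c)
{
	int ret = do_setup(c);
	if (ret)
		return ret;
	if (copy_out(c)) {
		do_teardown(c);
		return -EFAULT;
	}
	return 0;
}

int main(void)
{
	struct cfg c = { 0 };

	printf("setup_and_report: %d, id after failure: %d\n",
	       setup_and_report(&c), c.id);
	return 0;
}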
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1904797f4a8a..2404b59b3097 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -24,9 +24,11 @@
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/ftrace.h> 25#include <linux/ftrace.h>
26#include <linux/sysctl.h> 26#include <linux/sysctl.h>
27#include <linux/slab.h>
27#include <linux/ctype.h> 28#include <linux/ctype.h>
28#include <linux/list.h> 29#include <linux/list.h>
29#include <linux/hash.h> 30#include <linux/hash.h>
31#include <linux/rcupdate.h>
30 32
31#include <trace/events/sched.h> 33#include <trace/events/sched.h>
32 34
@@ -84,22 +86,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
84ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 86ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
85ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 87ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
86 88
87#ifdef CONFIG_FUNCTION_GRAPH_TRACER 89/*
88static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); 90 * Traverse the ftrace_list, invoking all entries. The reason that we
89#endif 91 * can use rcu_dereference_raw() is that elements removed from this list
90 92 * are simply leaked, so there is no need to interact with a grace-period
93 * mechanism. The rcu_dereference_raw() calls are needed to handle
94 * concurrent insertions into the ftrace_list.
95 *
96 * Silly Alpha and silly pointer-speculation compiler optimizations!
97 */
91static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 98static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
92{ 99{
93 struct ftrace_ops *op = ftrace_list; 100 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/
94
95 /* in case someone actually ports this to alpha! */
96 read_barrier_depends();
97 101
98 while (op != &ftrace_list_end) { 102 while (op != &ftrace_list_end) {
99 /* silly alpha */
100 read_barrier_depends();
101 op->func(ip, parent_ip); 103 op->func(ip, parent_ip);
102 op = op->next; 104 op = rcu_dereference_raw(op->next); /*see above*/
103 }; 105 };
104} 106}
105 107
@@ -154,8 +156,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
154 * the ops->next pointer is valid before another CPU sees 156 * the ops->next pointer is valid before another CPU sees
155 * the ops pointer included into the ftrace_list. 157 * the ops pointer included into the ftrace_list.
156 */ 158 */
157 smp_wmb(); 159 rcu_assign_pointer(ftrace_list, ops);
158 ftrace_list = ops;
159 160
160 if (ftrace_enabled) { 161 if (ftrace_enabled) {
161 ftrace_func_t func; 162 ftrace_func_t func;
@@ -2276,6 +2277,8 @@ __setup("ftrace_filter=", set_ftrace_filter);
2276 2277
2277#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2278#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2278static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; 2279static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2280static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
2281
2279static int __init set_graph_function(char *str) 2282static int __init set_graph_function(char *str)
2280{ 2283{
2281 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 2284 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -2402,6 +2405,7 @@ static const struct file_operations ftrace_notrace_fops = {
2402static DEFINE_MUTEX(graph_lock); 2405static DEFINE_MUTEX(graph_lock);
2403 2406
2404int ftrace_graph_count; 2407int ftrace_graph_count;
2408int ftrace_graph_filter_enabled;
2405unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2409unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2406 2410
2407static void * 2411static void *
@@ -2424,7 +2428,7 @@ static void *g_start(struct seq_file *m, loff_t *pos)
2424 mutex_lock(&graph_lock); 2428 mutex_lock(&graph_lock);
2425 2429
2426 /* Nothing, tell g_show to print all functions are enabled */ 2430 /* Nothing, tell g_show to print all functions are enabled */
2427 if (!ftrace_graph_count && !*pos) 2431 if (!ftrace_graph_filter_enabled && !*pos)
2428 return (void *)1; 2432 return (void *)1;
2429 2433
2430 return __g_next(m, pos); 2434 return __g_next(m, pos);
@@ -2470,6 +2474,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2470 mutex_lock(&graph_lock); 2474 mutex_lock(&graph_lock);
2471 if ((file->f_mode & FMODE_WRITE) && 2475 if ((file->f_mode & FMODE_WRITE) &&
2472 (file->f_flags & O_TRUNC)) { 2476 (file->f_flags & O_TRUNC)) {
2477 ftrace_graph_filter_enabled = 0;
2473 ftrace_graph_count = 0; 2478 ftrace_graph_count = 0;
2474 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2479 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2475 } 2480 }
@@ -2495,7 +2500,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2495 struct dyn_ftrace *rec; 2500 struct dyn_ftrace *rec;
2496 struct ftrace_page *pg; 2501 struct ftrace_page *pg;
2497 int search_len; 2502 int search_len;
2498 int found = 0; 2503 int fail = 1;
2499 int type, not; 2504 int type, not;
2500 char *search; 2505 char *search;
2501 bool exists; 2506 bool exists;
@@ -2506,37 +2511,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2506 2511
2507 /* decode regex */ 2512 /* decode regex */
2508 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 2513 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2509 if (not) 2514 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
2510 return -EINVAL; 2515 return -EBUSY;
2511 2516
2512 search_len = strlen(search); 2517 search_len = strlen(search);
2513 2518
2514 mutex_lock(&ftrace_lock); 2519 mutex_lock(&ftrace_lock);
2515 do_for_each_ftrace_rec(pg, rec) { 2520 do_for_each_ftrace_rec(pg, rec) {
2516 2521
2517 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2518 break;
2519
2520 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 2522 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
2521 continue; 2523 continue;
2522 2524
2523 if (ftrace_match_record(rec, search, search_len, type)) { 2525 if (ftrace_match_record(rec, search, search_len, type)) {
2524 /* ensure it is not already in the array */ 2526 /* if it is in the array */
2525 exists = false; 2527 exists = false;
2526 for (i = 0; i < *idx; i++) 2528 for (i = 0; i < *idx; i++) {
2527 if (array[i] == rec->ip) { 2529 if (array[i] == rec->ip) {
2528 exists = true; 2530 exists = true;
2529 break; 2531 break;
2530 } 2532 }
2531 if (!exists) 2533 }
2532 array[(*idx)++] = rec->ip; 2534
2533 found = 1; 2535 if (!not) {
2536 fail = 0;
2537 if (!exists) {
2538 array[(*idx)++] = rec->ip;
2539 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2540 goto out;
2541 }
2542 } else {
2543 if (exists) {
2544 array[i] = array[--(*idx)];
2545 array[*idx] = 0;
2546 fail = 0;
2547 }
2548 }
2534 } 2549 }
2535 } while_for_each_ftrace_rec(); 2550 } while_for_each_ftrace_rec();
2536 2551out:
2537 mutex_unlock(&ftrace_lock); 2552 mutex_unlock(&ftrace_lock);
2538 2553
2539 return found ? 0 : -EINVAL; 2554 if (fail)
2555 return -EINVAL;
2556
2557 ftrace_graph_filter_enabled = 1;
2558 return 0;
2540} 2559}
2541 2560
2542static ssize_t 2561static ssize_t
@@ -2546,16 +2565,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2546 struct trace_parser parser; 2565 struct trace_parser parser;
2547 ssize_t read, ret; 2566 ssize_t read, ret;
2548 2567
2549 if (!cnt || cnt < 0) 2568 if (!cnt)
2550 return 0; 2569 return 0;
2551 2570
2552 mutex_lock(&graph_lock); 2571 mutex_lock(&graph_lock);
2553 2572
2554 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2555 ret = -EBUSY;
2556 goto out_unlock;
2557 }
2558
2559 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { 2573 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2560 ret = -ENOMEM; 2574 ret = -ENOMEM;
2561 goto out_unlock; 2575 goto out_unlock;
@@ -3340,6 +3354,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3340{ 3354{
3341 /* Make sure we do not use the parent ret_stack */ 3355 /* Make sure we do not use the parent ret_stack */
3342 t->ret_stack = NULL; 3356 t->ret_stack = NULL;
3357 t->curr_ret_stack = -1;
3343 3358
3344 if (ftrace_graph_active) { 3359 if (ftrace_graph_active) {
3345 struct ftrace_ret_stack *ret_stack; 3360 struct ftrace_ret_stack *ret_stack;
@@ -3349,7 +3364,6 @@ void ftrace_graph_init_task(struct task_struct *t)
3349 GFP_KERNEL); 3364 GFP_KERNEL);
3350 if (!ret_stack) 3365 if (!ret_stack)
3351 return; 3366 return;
3352 t->curr_ret_stack = -1;
3353 atomic_set(&t->tracing_graph_pause, 0); 3367 atomic_set(&t->tracing_graph_pause, 0);
3354 atomic_set(&t->trace_overrun, 0); 3368 atomic_set(&t->trace_overrun, 0);
3355 t->ftrace_timestamp = 0; 3369 t->ftrace_timestamp = 0;
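The ftrace.c changes above replace the hand-rolled smp_wmb()/read_barrier_depends() pairs with rcu_assign_pointer()/rcu_dereference_raw(), which provide the same publish/subscribe ordering for the ftrace_list walk plus the usual RCU annotations; entries are never freed, so no grace period is needed. A userspace analogy of that ordering using C11 atomics; this only illustrates the idea and is not the kernel RCU API:

#include <stdatomic.h>
#include <stdio.h>

struct op {
	void (*func)(void);
	struct op *next;
};

static struct op end_marker;			/* plays the role of ftrace_list_end */
static _Atomic(struct op *) list_head;		/* plays the role of ftrace_list */

/* Publisher: fully initialise the node, then release-store the head,
 * which is the ordering rcu_assign_pointer() provides. */
static void publish(struct op *op, void (*f)(void))
{
	op->func = f;
	op->next = atomic_load_explicit(&list_head, memory_order_relaxed);
	atomic_store_explicit(&list_head, op, memory_order_release);
}

/* Reader: acquire-load the head and walk the list without blocking.
 * rcu_dereference_raw() gives the cheaper dependency ordering that even
 * Alpha needs; this demo is single-threaded, so plain next loads suffice. */
static void call_all(void)
{
	struct op *op = atomic_load_explicit(&list_head, memory_order_acquire);

	while (op != &end_marker) {
		op->func();
		op = op->next;
	}
}

static void hello(void) { puts("hello"); }

int main(void)
{
	struct op a;

	atomic_init(&list_head, &end_marker);
	publish(&a, hello);
	call_all();
	return 0;
}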
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 9f4f565b01e6..a22582a06161 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -9,7 +9,6 @@
9#include <linux/workqueue.h> 9#include <linux/workqueue.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/slab.h>
13 12
14#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
15#include <trace/events/power.h> 14#include <trace/events/power.h>
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index edefe3b2801b..41ca394feb22 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -14,12 +14,14 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/percpu.h> 15#include <linux/percpu.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/hash.h> 19#include <linux/hash.h>
19#include <linux/list.h> 20#include <linux/list.h>
20#include <linux/cpu.h> 21#include <linux/cpu.h>
21#include <linux/fs.h> 22#include <linux/fs.h>
22 23
24#include <asm/local.h>
23#include "trace.h" 25#include "trace.h"
24 26
25/* 27/*
@@ -206,6 +208,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
206#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
207#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ 209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
208 210
211#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
212# define RB_FORCE_8BYTE_ALIGNMENT 0
213# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
214#else
215# define RB_FORCE_8BYTE_ALIGNMENT 1
216# define RB_ARCH_ALIGNMENT 8U
217#endif
218
209/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 219/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
210#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 220#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
211 221
@@ -464,6 +474,8 @@ struct ring_buffer_iter {
464 struct ring_buffer_per_cpu *cpu_buffer; 474 struct ring_buffer_per_cpu *cpu_buffer;
465 unsigned long head; 475 unsigned long head;
466 struct buffer_page *head_page; 476 struct buffer_page *head_page;
477 struct buffer_page *cache_reader_page;
478 unsigned long cache_read;
467 u64 read_stamp; 479 u64 read_stamp;
468}; 480};
469 481
@@ -1198,18 +1210,19 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1198 1210
1199 for (i = 0; i < nr_pages; i++) { 1211 for (i = 0; i < nr_pages; i++) {
1200 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1212 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1201 return; 1213 goto out;
1202 p = cpu_buffer->pages->next; 1214 p = cpu_buffer->pages->next;
1203 bpage = list_entry(p, struct buffer_page, list); 1215 bpage = list_entry(p, struct buffer_page, list);
1204 list_del_init(&bpage->list); 1216 list_del_init(&bpage->list);
1205 free_buffer_page(bpage); 1217 free_buffer_page(bpage);
1206 } 1218 }
1207 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1219 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1208 return; 1220 goto out;
1209 1221
1210 rb_reset_cpu(cpu_buffer); 1222 rb_reset_cpu(cpu_buffer);
1211 rb_check_pages(cpu_buffer); 1223 rb_check_pages(cpu_buffer);
1212 1224
1225out:
1213 spin_unlock_irq(&cpu_buffer->reader_lock); 1226 spin_unlock_irq(&cpu_buffer->reader_lock);
1214} 1227}
1215 1228
@@ -1226,7 +1239,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1226 1239
1227 for (i = 0; i < nr_pages; i++) { 1240 for (i = 0; i < nr_pages; i++) {
1228 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1241 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
1229 return; 1242 goto out;
1230 p = pages->next; 1243 p = pages->next;
1231 bpage = list_entry(p, struct buffer_page, list); 1244 bpage = list_entry(p, struct buffer_page, list);
1232 list_del_init(&bpage->list); 1245 list_del_init(&bpage->list);
@@ -1235,6 +1248,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1235 rb_reset_cpu(cpu_buffer); 1248 rb_reset_cpu(cpu_buffer);
1236 rb_check_pages(cpu_buffer); 1249 rb_check_pages(cpu_buffer);
1237 1250
1251out:
1238 spin_unlock_irq(&cpu_buffer->reader_lock); 1252 spin_unlock_irq(&cpu_buffer->reader_lock);
1239} 1253}
1240 1254
@@ -1544,7 +1558,7 @@ rb_update_event(struct ring_buffer_event *event,
1544 1558
1545 case 0: 1559 case 0:
1546 length -= RB_EVNT_HDR_SIZE; 1560 length -= RB_EVNT_HDR_SIZE;
1547 if (length > RB_MAX_SMALL_DATA) 1561 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1548 event->array[0] = length; 1562 event->array[0] = length;
1549 else 1563 else
1550 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1564 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
@@ -1719,11 +1733,11 @@ static unsigned rb_calculate_event_length(unsigned length)
1719 if (!length) 1733 if (!length)
1720 length = 1; 1734 length = 1;
1721 1735
1722 if (length > RB_MAX_SMALL_DATA) 1736 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1723 length += sizeof(event.array[0]); 1737 length += sizeof(event.array[0]);
1724 1738
1725 length += RB_EVNT_HDR_SIZE; 1739 length += RB_EVNT_HDR_SIZE;
1726 length = ALIGN(length, RB_ALIGNMENT); 1740 length = ALIGN(length, RB_ARCH_ALIGNMENT);
1727 1741
1728 return length; 1742 return length;
1729} 1743}
@@ -2230,12 +2244,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2230 if (ring_buffer_flags != RB_BUFFERS_ON) 2244 if (ring_buffer_flags != RB_BUFFERS_ON)
2231 return NULL; 2245 return NULL;
2232 2246
2233 if (atomic_read(&buffer->record_disabled))
2234 return NULL;
2235
2236 /* If we are tracing schedule, we don't want to recurse */ 2247 /* If we are tracing schedule, we don't want to recurse */
2237 resched = ftrace_preempt_disable(); 2248 resched = ftrace_preempt_disable();
2238 2249
2250 if (atomic_read(&buffer->record_disabled))
2251 goto out_nocheck;
2252
2239 if (trace_recursive_lock()) 2253 if (trace_recursive_lock())
2240 goto out_nocheck; 2254 goto out_nocheck;
2241 2255
@@ -2467,11 +2481,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
2467 if (ring_buffer_flags != RB_BUFFERS_ON) 2481 if (ring_buffer_flags != RB_BUFFERS_ON)
2468 return -EBUSY; 2482 return -EBUSY;
2469 2483
2470 if (atomic_read(&buffer->record_disabled))
2471 return -EBUSY;
2472
2473 resched = ftrace_preempt_disable(); 2484 resched = ftrace_preempt_disable();
2474 2485
2486 if (atomic_read(&buffer->record_disabled))
2487 goto out;
2488
2475 cpu = raw_smp_processor_id(); 2489 cpu = raw_smp_processor_id();
2476 2490
2477 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2491 if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -2539,7 +2553,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
2539 * @buffer: The ring buffer to enable writes 2553 * @buffer: The ring buffer to enable writes
2540 * 2554 *
2541 * Note, multiple disables will need the same number of enables 2555 * Note, multiple disables will need the same number of enables
2542 * to truely enable the writing (much like preempt_disable). 2556 * to truly enable the writing (much like preempt_disable).
2543 */ 2557 */
2544void ring_buffer_record_enable(struct ring_buffer *buffer) 2558void ring_buffer_record_enable(struct ring_buffer *buffer)
2545{ 2559{
@@ -2575,7 +2589,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
2575 * @cpu: The CPU to enable. 2589 * @cpu: The CPU to enable.
2576 * 2590 *
2577 * Note, multiple disables will need the same number of enables 2591 * Note, multiple disables will need the same number of enables
2578 * to truely enable the writing (much like preempt_disable). 2592 * to truly enable the writing (much like preempt_disable).
2579 */ 2593 */
2580void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) 2594void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2581{ 2595{
@@ -2716,6 +2730,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2716 iter->read_stamp = cpu_buffer->read_stamp; 2730 iter->read_stamp = cpu_buffer->read_stamp;
2717 else 2731 else
2718 iter->read_stamp = iter->head_page->page->time_stamp; 2732 iter->read_stamp = iter->head_page->page->time_stamp;
2733 iter->cache_reader_page = cpu_buffer->reader_page;
2734 iter->cache_read = cpu_buffer->read;
2719} 2735}
2720 2736
2721/** 2737/**
@@ -3060,13 +3076,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3060 struct ring_buffer_event *event; 3076 struct ring_buffer_event *event;
3061 int nr_loops = 0; 3077 int nr_loops = 0;
3062 3078
3063 if (ring_buffer_iter_empty(iter))
3064 return NULL;
3065
3066 cpu_buffer = iter->cpu_buffer; 3079 cpu_buffer = iter->cpu_buffer;
3067 buffer = cpu_buffer->buffer; 3080 buffer = cpu_buffer->buffer;
3068 3081
3082 /*
3083 * Check if someone performed a consuming read to
3084 * the buffer. A consuming read invalidates the iterator
3085 * and we need to reset the iterator in this case.
3086 */
3087 if (unlikely(iter->cache_read != cpu_buffer->read ||
3088 iter->cache_reader_page != cpu_buffer->reader_page))
3089 rb_iter_reset(iter);
3090
3069 again: 3091 again:
3092 if (ring_buffer_iter_empty(iter))
3093 return NULL;
3094
3070 /* 3095 /*
3071 * We repeat when a timestamp is encountered. 3096 * We repeat when a timestamp is encountered.
3072 * We can get multiple timestamps by nested interrupts or also 3097 * We can get multiple timestamps by nested interrupts or also
@@ -3081,6 +3106,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3081 if (rb_per_cpu_empty(cpu_buffer)) 3106 if (rb_per_cpu_empty(cpu_buffer))
3082 return NULL; 3107 return NULL;
3083 3108
3109 if (iter->head >= local_read(&iter->head_page->page->commit)) {
3110 rb_inc_iter(iter);
3111 goto again;
3112 }
3113
3084 event = rb_iter_head_event(iter); 3114 event = rb_iter_head_event(iter);
3085 3115
3086 switch (event->type_len) { 3116 switch (event->type_len) {
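rb_iter_peek() above now snapshots the reader page and the read count when the iterator is (re)initialised, and transparently resets itself whenever a consuming read has changed either value, since such a read invalidates the iterator's notion of where the head is. The same "cached generation" idiom in standalone form; struct buf, struct iter and read_gen are invented for illustration, not the ring-buffer API:

#include <stdio.h>

struct buf {
	unsigned long read_gen;		/* bumped by every consuming read */
	int data[8];
	int head;
};

struct iter {
	struct buf *b;
	int pos;
	unsigned long cache_gen;	/* snapshot of b->read_gen at reset time */
};

static void iter_reset(struct iter *it)
{
	it->pos = 0;
	it->cache_gen = it->b->read_gen;
}

static int iter_peek(struct iter *it)
{
	/* A consuming read invalidated our position: start over. */
	if (it->cache_gen != it->b->read_gen)
		iter_reset(it);

	return it->pos < it->b->head ? it->b->data[it->pos] : -1;
}

int main(void)
{
	struct buf b = { .head = 2, .data = { 10, 20 } };
	struct iter it = { .b = &b };

	iter_reset(&it);
	it.pos = 1;			/* iterator had advanced to the second entry */
	b.read_gen++;			/* someone performed a consuming read */
	printf("%d\n", iter_peek(&it));	/* prints 10: the iterator re-synced itself */
	return 0;
}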
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index b2477caf09c2..df74c7982255 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -8,6 +8,7 @@
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <asm/local.h>
11 12
12struct rb_page { 13struct rb_page {
13 u64 ts; 14 u64 ts;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0df1b0f2cb9e..44f916a04065 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -32,10 +32,11 @@
32#include <linux/splice.h> 32#include <linux/splice.h>
33#include <linux/kdebug.h> 33#include <linux/kdebug.h>
34#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/rwsem.h>
36#include <linux/slab.h>
35#include <linux/ctype.h> 37#include <linux/ctype.h>
36#include <linux/init.h> 38#include <linux/init.h>
37#include <linux/poll.h> 39#include <linux/poll.h>
38#include <linux/gfp.h>
39#include <linux/fs.h> 40#include <linux/fs.h>
40 41
41#include "trace.h" 42#include "trace.h"
@@ -91,20 +92,17 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled);
91static inline void ftrace_disable_cpu(void) 92static inline void ftrace_disable_cpu(void)
92{ 93{
93 preempt_disable(); 94 preempt_disable();
94 __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); 95 __this_cpu_inc(ftrace_cpu_disabled);
95} 96}
96 97
97static inline void ftrace_enable_cpu(void) 98static inline void ftrace_enable_cpu(void)
98{ 99{
99 __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); 100 __this_cpu_dec(ftrace_cpu_disabled);
100 preempt_enable(); 101 preempt_enable();
101} 102}
102 103
103static cpumask_var_t __read_mostly tracing_buffer_mask; 104static cpumask_var_t __read_mostly tracing_buffer_mask;
104 105
105/* Define which cpu buffers are currently read in trace_pipe */
106static cpumask_var_t tracing_reader_cpumask;
107
108#define for_each_tracing_cpu(cpu) \ 106#define for_each_tracing_cpu(cpu) \
109 for_each_cpu(cpu, tracing_buffer_mask) 107 for_each_cpu(cpu, tracing_buffer_mask)
110 108
@@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly;
243 241
244/* 242/*
245 * trace_types_lock is used to protect the trace_types list. 243 * trace_types_lock is used to protect the trace_types list.
246 * This lock is also used to keep user access serialized.
247 * Accesses from userspace will grab this lock while userspace
248 * activities happen inside the kernel.
249 */ 244 */
250static DEFINE_MUTEX(trace_types_lock); 245static DEFINE_MUTEX(trace_types_lock);
251 246
247/*
248 * Serialize access to the ring buffer.
249 *
250 * The ring buffer serializes readers, but that is only low-level protection.
251 * The validity of events (as returned by ring_buffer_peek() etc.) is not
252 * protected by the ring buffer.
253 *
254 * The content of events may become garbage if we allow other processes to
255 * consume these events concurrently:
256 * A) the page holding the consumed events may become a normal page
257 * (not a reader page) in the ring buffer, and this page will be rewritten
258 * by the events producer.
259 * B) the page holding the consumed events may become a page for splice_read,
260 * and this page will be returned to the system.
261 *
262 * These primitives allow multiple processes to access different cpu ring
263 * buffers concurrently.
264 *
265 * These primitives don't distinguish read-only from read-consume access.
266 * Multiple read-only accesses are also serialized.

267 */
268
269#ifdef CONFIG_SMP
270static DECLARE_RWSEM(all_cpu_access_lock);
271static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
272
273static inline void trace_access_lock(int cpu)
274{
275 if (cpu == TRACE_PIPE_ALL_CPU) {
276 /* gain it for accessing the whole ring buffer. */
277 down_write(&all_cpu_access_lock);
278 } else {
279 /* gain it for accessing a cpu ring buffer. */
280
281 /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
282 down_read(&all_cpu_access_lock);
283
284 /* Secondly block other access to this @cpu ring buffer. */
285 mutex_lock(&per_cpu(cpu_access_lock, cpu));
286 }
287}
288
289static inline void trace_access_unlock(int cpu)
290{
291 if (cpu == TRACE_PIPE_ALL_CPU) {
292 up_write(&all_cpu_access_lock);
293 } else {
294 mutex_unlock(&per_cpu(cpu_access_lock, cpu));
295 up_read(&all_cpu_access_lock);
296 }
297}
298
299static inline void trace_access_lock_init(void)
300{
301 int cpu;
302
303 for_each_possible_cpu(cpu)
304 mutex_init(&per_cpu(cpu_access_lock, cpu));
305}
306
307#else
308
309static DEFINE_MUTEX(access_lock);
310
311static inline void trace_access_lock(int cpu)
312{
313 (void)cpu;
314 mutex_lock(&access_lock);
315}
316
317static inline void trace_access_unlock(int cpu)
318{
319 (void)cpu;
320 mutex_unlock(&access_lock);
321}
322
323static inline void trace_access_lock_init(void)
324{
325}
326
327#endif
328
252/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 329/* trace_wait is a waitqueue for tasks blocked on trace_poll */
253static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 330static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
254 331
@@ -297,6 +374,21 @@ static int __init set_buf_size(char *str)
297} 374}
298__setup("trace_buf_size=", set_buf_size); 375__setup("trace_buf_size=", set_buf_size);
299 376
377static int __init set_tracing_thresh(char *str)
378{
379	unsigned long threshold;
380	int ret;
381
382	if (!str)
383		return 0;
384	ret = strict_strtoul(str, 0, &threshold);
385	if (ret < 0)
386		return 0;
387	tracing_thresh = threshold * 1000;
388 return 1;
389}
390__setup("tracing_thresh=", set_tracing_thresh);
391
300unsigned long nsecs_to_usecs(unsigned long nsecs) 392unsigned long nsecs_to_usecs(unsigned long nsecs)
301{ 393{
302 return nsecs / 1000; 394 return nsecs / 1000;
@@ -502,9 +594,10 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
502static arch_spinlock_t ftrace_max_lock = 594static arch_spinlock_t ftrace_max_lock =
503 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 595 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
504 596
597unsigned long __read_mostly tracing_thresh;
598
505#ifdef CONFIG_TRACER_MAX_TRACE 599#ifdef CONFIG_TRACER_MAX_TRACE
506unsigned long __read_mostly tracing_max_latency; 600unsigned long __read_mostly tracing_max_latency;
507unsigned long __read_mostly tracing_thresh;
508 601
509/* 602/*
510 * Copy the new maximum trace into the separate maximum-trace 603 * Copy the new maximum trace into the separate maximum-trace
@@ -515,7 +608,7 @@ static void
515__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 608__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
516{ 609{
517 struct trace_array_cpu *data = tr->data[cpu]; 610 struct trace_array_cpu *data = tr->data[cpu];
518 struct trace_array_cpu *max_data = tr->data[cpu]; 611 struct trace_array_cpu *max_data;
519 612
520 max_tr.cpu = cpu; 613 max_tr.cpu = cpu;
521 max_tr.time_start = data->preempt_timestamp; 614 max_tr.time_start = data->preempt_timestamp;
@@ -525,7 +618,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
525 max_data->critical_start = data->critical_start; 618 max_data->critical_start = data->critical_start;
526 max_data->critical_end = data->critical_end; 619 max_data->critical_end = data->critical_end;
527 620
528 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 621 memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
529 max_data->pid = tsk->pid; 622 max_data->pid = tsk->pid;
530 max_data->uid = task_uid(tsk); 623 max_data->uid = task_uid(tsk);
531 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; 624 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
@@ -747,10 +840,10 @@ out:
747 mutex_unlock(&trace_types_lock); 840 mutex_unlock(&trace_types_lock);
748} 841}
749 842
750static void __tracing_reset(struct trace_array *tr, int cpu) 843static void __tracing_reset(struct ring_buffer *buffer, int cpu)
751{ 844{
752 ftrace_disable_cpu(); 845 ftrace_disable_cpu();
753 ring_buffer_reset_cpu(tr->buffer, cpu); 846 ring_buffer_reset_cpu(buffer, cpu);
754 ftrace_enable_cpu(); 847 ftrace_enable_cpu();
755} 848}
756 849
@@ -762,7 +855,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
762 855
763 /* Make sure all commits have finished */ 856 /* Make sure all commits have finished */
764 synchronize_sched(); 857 synchronize_sched();
765 __tracing_reset(tr, cpu); 858 __tracing_reset(buffer, cpu);
766 859
767 ring_buffer_record_enable(buffer); 860 ring_buffer_record_enable(buffer);
768} 861}
@@ -780,7 +873,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
780 tr->time_start = ftrace_now(tr->cpu); 873 tr->time_start = ftrace_now(tr->cpu);
781 874
782 for_each_online_cpu(cpu) 875 for_each_online_cpu(cpu)
783 __tracing_reset(tr, cpu); 876 __tracing_reset(buffer, cpu);
784 877
785 ring_buffer_record_enable(buffer); 878 ring_buffer_record_enable(buffer);
786} 879}
@@ -857,6 +950,8 @@ void tracing_start(void)
857 goto out; 950 goto out;
858 } 951 }
859 952
953 /* Prevent the buffers from switching */
954 arch_spin_lock(&ftrace_max_lock);
860 955
861 buffer = global_trace.buffer; 956 buffer = global_trace.buffer;
862 if (buffer) 957 if (buffer)
@@ -866,6 +961,8 @@ void tracing_start(void)
866 if (buffer) 961 if (buffer)
867 ring_buffer_record_enable(buffer); 962 ring_buffer_record_enable(buffer);
868 963
964 arch_spin_unlock(&ftrace_max_lock);
965
869 ftrace_start(); 966 ftrace_start();
870 out: 967 out:
871 spin_unlock_irqrestore(&tracing_start_lock, flags); 968 spin_unlock_irqrestore(&tracing_start_lock, flags);
@@ -887,6 +984,9 @@ void tracing_stop(void)
887 if (trace_stop_count++) 984 if (trace_stop_count++)
888 goto out; 985 goto out;
889 986
987 /* Prevent the buffers from switching */
988 arch_spin_lock(&ftrace_max_lock);
989
890 buffer = global_trace.buffer; 990 buffer = global_trace.buffer;
891 if (buffer) 991 if (buffer)
892 ring_buffer_record_disable(buffer); 992 ring_buffer_record_disable(buffer);
@@ -895,6 +995,8 @@ void tracing_stop(void)
895 if (buffer) 995 if (buffer)
896 ring_buffer_record_disable(buffer); 996 ring_buffer_record_disable(buffer);
897 997
998 arch_spin_unlock(&ftrace_max_lock);
999
898 out: 1000 out:
899 spin_unlock_irqrestore(&tracing_start_lock, flags); 1001 spin_unlock_irqrestore(&tracing_start_lock, flags);
900} 1002}
@@ -951,6 +1053,11 @@ void trace_find_cmdline(int pid, char comm[])
951 return; 1053 return;
952 } 1054 }
953 1055
1056 if (WARN_ON_ONCE(pid < 0)) {
1057 strcpy(comm, "<XXX>");
1058 return;
1059 }
1060
954 if (pid > PID_MAX_DEFAULT) { 1061 if (pid > PID_MAX_DEFAULT) {
955 strcpy(comm, "<...>"); 1062 strcpy(comm, "<...>");
956 return; 1063 return;
@@ -1084,7 +1191,7 @@ trace_function(struct trace_array *tr,
1084 struct ftrace_entry *entry; 1191 struct ftrace_entry *entry;
1085 1192
1086 /* If we are reading the ring buffer, don't trace */ 1193 /* If we are reading the ring buffer, don't trace */
1087 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 1194 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
1088 return; 1195 return;
1089 1196
1090 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1197 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1177,6 +1284,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1177 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1284 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1178 return; 1285 return;
1179 1286
1287 /*
1288 * NMIs can not handle page faults, even with fix ups.
1289 * The save user stack can (and often does) fault.
1290 */
1291 if (unlikely(in_nmi()))
1292 return;
1293
1180 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1294 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1181 sizeof(*entry), flags, pc); 1295 sizeof(*entry), flags, pc);
1182 if (!event) 1296 if (!event)
@@ -1315,8 +1429,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1315 entry->fmt = fmt; 1429 entry->fmt = fmt;
1316 1430
1317 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1431 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1318 if (!filter_check_discard(call, entry, buffer, event)) 1432 if (!filter_check_discard(call, entry, buffer, event)) {
1319 ring_buffer_unlock_commit(buffer, event); 1433 ring_buffer_unlock_commit(buffer, event);
1434 ftrace_trace_stack(buffer, flags, 6, pc);
1435 }
1320 1436
1321out_unlock: 1437out_unlock:
1322 arch_spin_unlock(&trace_buf_lock); 1438 arch_spin_unlock(&trace_buf_lock);
@@ -1389,8 +1505,10 @@ int trace_array_vprintk(struct trace_array *tr,
1389 1505
1390 memcpy(&entry->buf, trace_buf, len); 1506 memcpy(&entry->buf, trace_buf, len);
1391 entry->buf[len] = '\0'; 1507 entry->buf[len] = '\0';
1392 if (!filter_check_discard(call, entry, buffer, event)) 1508 if (!filter_check_discard(call, entry, buffer, event)) {
1393 ring_buffer_unlock_commit(buffer, event); 1509 ring_buffer_unlock_commit(buffer, event);
1510 ftrace_trace_stack(buffer, irq_flags, 6, pc);
1511 }
1394 1512
1395 out_unlock: 1513 out_unlock:
1396 arch_spin_unlock(&trace_buf_lock); 1514 arch_spin_unlock(&trace_buf_lock);
@@ -1580,12 +1698,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1580} 1698}
1581 1699
1582/* 1700/*
1583 * No necessary locking here. The worst thing which can
1584 * happen is loosing events consumed at the same time
1585 * by a trace_pipe reader.
1586 * Other than that, we don't risk to crash the ring buffer
1587 * because it serializes the readers.
1588 *
1589 * The current tracer is copied to avoid a global locking 1701 * The current tracer is copied to avoid a global locking
1590 * all around. 1702 * all around.
1591 */ 1703 */
@@ -1623,6 +1735,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1623 1735
1624 ftrace_enable_cpu(); 1736 ftrace_enable_cpu();
1625 1737
1738 iter->leftover = 0;
1626 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1739 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1627 ; 1740 ;
1628 1741
@@ -1640,12 +1753,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1640 } 1753 }
1641 1754
1642 trace_event_read_lock(); 1755 trace_event_read_lock();
1756 trace_access_lock(cpu_file);
1643 return p; 1757 return p;
1644} 1758}
1645 1759
1646static void s_stop(struct seq_file *m, void *p) 1760static void s_stop(struct seq_file *m, void *p)
1647{ 1761{
1762 struct trace_iterator *iter = m->private;
1763
1648 atomic_dec(&trace_record_cmdline_disabled); 1764 atomic_dec(&trace_record_cmdline_disabled);
1765 trace_access_unlock(iter->cpu_file);
1649 trace_event_read_unlock(); 1766 trace_event_read_unlock();
1650} 1767}
1651 1768
@@ -2836,22 +2953,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2836 2953
2837 mutex_lock(&trace_types_lock); 2954 mutex_lock(&trace_types_lock);
2838 2955
2839 /* We only allow one reader per cpu */
2840 if (cpu_file == TRACE_PIPE_ALL_CPU) {
2841 if (!cpumask_empty(tracing_reader_cpumask)) {
2842 ret = -EBUSY;
2843 goto out;
2844 }
2845 cpumask_setall(tracing_reader_cpumask);
2846 } else {
2847 if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
2848 cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
2849 else {
2850 ret = -EBUSY;
2851 goto out;
2852 }
2853 }
2854
2855 /* create a buffer to store the information to pass to userspace */ 2956 /* create a buffer to store the information to pass to userspace */
2856 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2957 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2857 if (!iter) { 2958 if (!iter) {
@@ -2907,12 +3008,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2907 3008
2908 mutex_lock(&trace_types_lock); 3009 mutex_lock(&trace_types_lock);
2909 3010
2910 if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
2911 cpumask_clear(tracing_reader_cpumask);
2912 else
2913 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2914
2915
2916 if (iter->trace->pipe_close) 3011 if (iter->trace->pipe_close)
2917 iter->trace->pipe_close(iter); 3012 iter->trace->pipe_close(iter);
2918 3013
@@ -3074,6 +3169,7 @@ waitagain:
3074 iter->pos = -1; 3169 iter->pos = -1;
3075 3170
3076 trace_event_read_lock(); 3171 trace_event_read_lock();
3172 trace_access_lock(iter->cpu_file);
3077 while (find_next_entry_inc(iter) != NULL) { 3173 while (find_next_entry_inc(iter) != NULL) {
3078 enum print_line_t ret; 3174 enum print_line_t ret;
3079 int len = iter->seq.len; 3175 int len = iter->seq.len;
@@ -3090,6 +3186,7 @@ waitagain:
3090 if (iter->seq.len >= cnt) 3186 if (iter->seq.len >= cnt)
3091 break; 3187 break;
3092 } 3188 }
3189 trace_access_unlock(iter->cpu_file);
3093 trace_event_read_unlock(); 3190 trace_event_read_unlock();
3094 3191
3095 /* Now copy what we have to the user */ 3192 /* Now copy what we have to the user */
@@ -3215,6 +3312,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3215 } 3312 }
3216 3313
3217 trace_event_read_lock(); 3314 trace_event_read_lock();
3315 trace_access_lock(iter->cpu_file);
3218 3316
3219 /* Fill as many pages as possible. */ 3317 /* Fill as many pages as possible. */
3220 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3318 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
@@ -3238,6 +3336,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3238 trace_seq_init(&iter->seq); 3336 trace_seq_init(&iter->seq);
3239 } 3337 }
3240 3338
3339 trace_access_unlock(iter->cpu_file);
3241 trace_event_read_unlock(); 3340 trace_event_read_unlock();
3242 mutex_unlock(&iter->mutex); 3341 mutex_unlock(&iter->mutex);
3243 3342
@@ -3539,10 +3638,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3539 3638
3540 info->read = 0; 3639 info->read = 0;
3541 3640
3641 trace_access_lock(info->cpu);
3542 ret = ring_buffer_read_page(info->tr->buffer, 3642 ret = ring_buffer_read_page(info->tr->buffer,
3543 &info->spare, 3643 &info->spare,
3544 count, 3644 count,
3545 info->cpu, 0); 3645 info->cpu, 0);
3646 trace_access_unlock(info->cpu);
3546 if (ret < 0) 3647 if (ret < 0)
3547 return 0; 3648 return 0;
3548 3649
@@ -3670,6 +3771,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3670 len &= PAGE_MASK; 3771 len &= PAGE_MASK;
3671 } 3772 }
3672 3773
3774 trace_access_lock(info->cpu);
3673 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3775 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3674 3776
3675 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { 3777 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
@@ -3717,6 +3819,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3717 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3819 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3718 } 3820 }
3719 3821
3822 trace_access_unlock(info->cpu);
3720 spd.nr_pages = i; 3823 spd.nr_pages = i;
3721 3824
3722 /* did we read anything? */ 3825 /* did we read anything? */
@@ -4153,6 +4256,8 @@ static __init int tracer_init_debugfs(void)
4153 struct dentry *d_tracer; 4256 struct dentry *d_tracer;
4154 int cpu; 4257 int cpu;
4155 4258
4259 trace_access_lock_init();
4260
4156 d_tracer = tracing_init_dentry(); 4261 d_tracer = tracing_init_dentry();
4157 4262
4158 trace_create_file("tracing_enabled", 0644, d_tracer, 4263 trace_create_file("tracing_enabled", 0644, d_tracer,
@@ -4176,10 +4281,10 @@ static __init int tracer_init_debugfs(void)
4176#ifdef CONFIG_TRACER_MAX_TRACE 4281#ifdef CONFIG_TRACER_MAX_TRACE
4177 trace_create_file("tracing_max_latency", 0644, d_tracer, 4282 trace_create_file("tracing_max_latency", 0644, d_tracer,
4178 &tracing_max_latency, &tracing_max_lat_fops); 4283 &tracing_max_latency, &tracing_max_lat_fops);
4284#endif
4179 4285
4180 trace_create_file("tracing_thresh", 0644, d_tracer, 4286 trace_create_file("tracing_thresh", 0644, d_tracer,
4181 &tracing_thresh, &tracing_max_lat_fops); 4287 &tracing_thresh, &tracing_max_lat_fops);
4182#endif
4183 4288
4184 trace_create_file("README", 0444, d_tracer, 4289 trace_create_file("README", 0444, d_tracer,
4185 NULL, &tracing_readme_fops); 4290 NULL, &tracing_readme_fops);
@@ -4387,9 +4492,6 @@ __init static int tracer_alloc_buffers(void)
4387 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4492 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4388 goto out_free_buffer_mask; 4493 goto out_free_buffer_mask;
4389 4494
4390 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4391 goto out_free_tracing_cpumask;
4392
4393 /* To save memory, keep the ring buffer size to its minimum */ 4495 /* To save memory, keep the ring buffer size to its minimum */
4394 if (ring_buffer_expanded) 4496 if (ring_buffer_expanded)
4395 ring_buf_size = trace_buf_size; 4497 ring_buf_size = trace_buf_size;
@@ -4447,8 +4549,6 @@ __init static int tracer_alloc_buffers(void)
4447 return 0; 4549 return 0;
4448 4550
4449out_free_cpumask: 4551out_free_cpumask:
4450 free_cpumask_var(tracing_reader_cpumask);
4451out_free_tracing_cpumask:
4452 free_cpumask_var(tracing_cpumask); 4552 free_cpumask_var(tracing_cpumask);
4453out_free_buffer_mask: 4553out_free_buffer_mask:
4454 free_cpumask_var(tracing_buffer_mask); 4554 free_cpumask_var(tracing_buffer_mask);
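The trace_access_lock() scheme added above lets any number of per-cpu trace_pipe readers run in parallel while a TRACE_PIPE_ALL_CPU reader excludes them all: per-cpu readers take the rwsem shared plus their cpu's mutex, the all-cpu reader takes the rwsem exclusive. A userspace analogy of that two-level pattern with POSIX primitives; NCPU, access_lock() and the pthread rwlock are illustrative choices, not the kernel code:

#include <pthread.h>

#define NCPU 4

static pthread_rwlock_t all_cpu_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t cpu_lock[NCPU] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* cpu >= 0: serialize only against readers of that cpu.
 * cpu < 0 (the "all cpus" case): exclude every other reader. */
static void access_lock(int cpu)
{
	if (cpu < 0) {
		pthread_rwlock_wrlock(&all_cpu_lock);
	} else {
		pthread_rwlock_rdlock(&all_cpu_lock);	/* holds off "all cpus" readers */
		pthread_mutex_lock(&cpu_lock[cpu]);	/* holds off readers of this cpu */
	}
}

static void access_unlock(int cpu)
{
	if (cpu < 0) {
		pthread_rwlock_unlock(&all_cpu_lock);
	} else {
		pthread_mutex_unlock(&cpu_lock[cpu]);
		pthread_rwlock_unlock(&all_cpu_lock);
	}
}

int main(void)
{
	access_lock(1);
	access_unlock(1);
	access_lock(-1);
	access_unlock(-1);
	return 0;
}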
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4df6a77eb196..2825ef2c0b15 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -396,9 +396,10 @@ extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
396 396
397extern unsigned long nsecs_to_usecs(unsigned long nsecs); 397extern unsigned long nsecs_to_usecs(unsigned long nsecs);
398 398
399extern unsigned long tracing_thresh;
400
399#ifdef CONFIG_TRACER_MAX_TRACE 401#ifdef CONFIG_TRACER_MAX_TRACE
400extern unsigned long tracing_max_latency; 402extern unsigned long tracing_max_latency;
401extern unsigned long tracing_thresh;
402 403
403void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 404void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
404void update_max_tr_single(struct trace_array *tr, 405void update_max_tr_single(struct trace_array *tr,
@@ -497,6 +498,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
497#ifdef CONFIG_DYNAMIC_FTRACE 498#ifdef CONFIG_DYNAMIC_FTRACE
498/* TODO: make this variable */ 499/* TODO: make this variable */
499#define FTRACE_GRAPH_MAX_FUNCS 32 500#define FTRACE_GRAPH_MAX_FUNCS 32
501extern int ftrace_graph_filter_enabled;
500extern int ftrace_graph_count; 502extern int ftrace_graph_count;
501extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; 503extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
502 504
@@ -504,7 +506,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
504{ 506{
505 int i; 507 int i;
506 508
507 if (!ftrace_graph_count || test_tsk_trace_graph(current)) 509 if (!ftrace_graph_filter_enabled)
508 return 1; 510 return 1;
509 511
510 for (i = 0; i < ftrace_graph_count; i++) { 512 for (i = 0; i < ftrace_graph_count; i++) {
@@ -549,7 +551,7 @@ static inline int ftrace_trace_task(struct task_struct *task)
549 * struct trace_parser - servers for reading the user input separated by spaces 551 * struct trace_parser - servers for reading the user input separated by spaces
550 * @cont: set if the input is not complete - no final space char was found 552 * @cont: set if the input is not complete - no final space char was found
551 * @buffer: holds the parsed user input 553 * @buffer: holds the parsed user input
552 * @idx: user input lenght 554 * @idx: user input length
553 * @size: buffer size 555 * @size: buffer size
554 */ 556 */
555struct trace_parser { 557struct trace_parser {
@@ -791,7 +793,8 @@ extern const char *__stop___trace_bprintk_fmt[];
791 793
792#undef FTRACE_ENTRY 794#undef FTRACE_ENTRY
793#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ 795#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
794 extern struct ftrace_event_call event_##call; 796 extern struct ftrace_event_call \
797 __attribute__((__aligned__(4))) event_##call;
795#undef FTRACE_ENTRY_DUP 798#undef FTRACE_ENTRY_DUP
796#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ 799#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
797 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 800 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4a194f08f88c..b9bc4d470177 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
307 return -1; 307 return -1;
308 if (percent_a > percent_b) 308 if (percent_a > percent_b)
309 return 1; 309 return 1;
310 else 310
311 return 0; 311 if (a->incorrect < b->incorrect)
312 return -1;
313 if (a->incorrect > b->incorrect)
314 return 1;
315
316 /*
317 * Since the above shows worse (incorrect) cases
318 * first, we continue that by showing best (correct)
319 * cases last.
320 */
321 if (a->correct > b->correct)
322 return -1;
323 if (a->correct < b->correct)
324 return 1;
325
326 return 0;
312} 327}
313 328
314static struct tracer_stat annotated_branch_stats = { 329static struct tracer_stat annotated_branch_stats = {
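annotated_branch_stat_cmp() above grows two tie-breakers, the absolute incorrect count and then the correct count, so entries with identical incorrect percentages still sort in a stable and meaningful order instead of falling back to insertion order. A standalone example of the same chained-comparator shape; struct stat_entry and its fields are illustrative, not the branch-profiling structures:

#include <stdio.h>
#include <stdlib.h>

struct stat_entry { long percent; long incorrect; long correct; };

/* Compare by the most significant key first and fall through to the
 * next key only on a tie, like the patched comparator. */
static int stat_cmp(const void *pa, const void *pb)
{
	const struct stat_entry *a = pa, *b = pb;

	if (a->percent != b->percent)
		return a->percent < b->percent ? -1 : 1;
	if (a->incorrect != b->incorrect)
		return a->incorrect < b->incorrect ? -1 : 1;
	if (a->correct != b->correct)		/* final tie-break, reversed */
		return a->correct > b->correct ? -1 : 1;
	return 0;
}

int main(void)
{
	struct stat_entry e[] = { { 50, 10, 10 }, { 50, 10, 99 }, { 90, 1, 1 } };

	qsort(e, 3, sizeof(e[0]), stat_cmp);
	for (int i = 0; i < 3; i++)
		printf("%ld %ld %ld\n", e[i].percent, e[i].incorrect, e[i].correct);
	return 0;
}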
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 84a3a7ba072a..9d589d8dcd1a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -13,6 +13,7 @@
13 * Tracer plugins will chose a default from these clocks. 13 * Tracer plugins will chose a default from these clocks.
14 */ 14 */
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/irqflags.h>
16#include <linux/hardirq.h> 17#include <linux/hardirq.h>
17#include <linux/module.h> 18#include <linux/module.h>
18#include <linux/percpu.h> 19#include <linux/percpu.h>
@@ -83,7 +84,7 @@ u64 notrace trace_clock_global(void)
83 int this_cpu; 84 int this_cpu;
84 u64 now; 85 u64 now;
85 86
86 raw_local_irq_save(flags); 87 local_irq_save(flags);
87 88
88 this_cpu = raw_smp_processor_id(); 89 this_cpu = raw_smp_processor_id();
89 now = cpu_clock(this_cpu); 90 now = cpu_clock(this_cpu);
@@ -109,7 +110,7 @@ u64 notrace trace_clock_global(void)
109 arch_spin_unlock(&trace_clock_struct.lock); 110 arch_spin_unlock(&trace_clock_struct.lock);
110 111
111 out: 112 out:
112 raw_local_irq_restore(flags); 113 local_irq_restore(flags);
113 114
114 return now; 115 return now;
115} 116}
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_perf.c
index f0d693005075..0565bb42566f 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_perf.c
@@ -1,32 +1,41 @@
1/* 1/*
2 * trace event based perf counter profiling 2 * trace event based perf event profiling/tracing
3 * 3 *
4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> 4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
5 * 5 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
13EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
14
15EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
12 16
13static char *perf_trace_buf; 17static char *perf_trace_buf;
14static char *perf_trace_buf_nmi; 18static char *perf_trace_buf_nmi;
15 19
16typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; 20/*
21 * Force it to be aligned to unsigned long to avoid misaligned access
22 * surprises
23 */
24typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
25 perf_trace_t;
17 26
18/* Count the events in use (per event id, not per instance) */ 27/* Count the events in use (per event id, not per instance) */
19static int total_profile_count; 28static int total_ref_count;
20 29
21static int ftrace_profile_enable_event(struct ftrace_event_call *event) 30static int perf_trace_event_enable(struct ftrace_event_call *event)
22{ 31{
23 char *buf; 32 char *buf;
24 int ret = -ENOMEM; 33 int ret = -ENOMEM;
25 34
26 if (event->profile_count++ > 0) 35 if (event->perf_refcount++ > 0)
27 return 0; 36 return 0;
28 37
29 if (!total_profile_count) { 38 if (!total_ref_count) {
30 buf = (char *)alloc_percpu(perf_trace_t); 39 buf = (char *)alloc_percpu(perf_trace_t);
31 if (!buf) 40 if (!buf)
32 goto fail_buf; 41 goto fail_buf;
@@ -40,35 +49,35 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
40 rcu_assign_pointer(perf_trace_buf_nmi, buf); 49 rcu_assign_pointer(perf_trace_buf_nmi, buf);
41 } 50 }
42 51
43 ret = event->profile_enable(event); 52 ret = event->perf_event_enable(event);
44 if (!ret) { 53 if (!ret) {
45 total_profile_count++; 54 total_ref_count++;
46 return 0; 55 return 0;
47 } 56 }
48 57
49fail_buf_nmi: 58fail_buf_nmi:
50 if (!total_profile_count) { 59 if (!total_ref_count) {
51 free_percpu(perf_trace_buf_nmi); 60 free_percpu(perf_trace_buf_nmi);
52 free_percpu(perf_trace_buf); 61 free_percpu(perf_trace_buf);
53 perf_trace_buf_nmi = NULL; 62 perf_trace_buf_nmi = NULL;
54 perf_trace_buf = NULL; 63 perf_trace_buf = NULL;
55 } 64 }
56fail_buf: 65fail_buf:
57 event->profile_count--; 66 event->perf_refcount--;
58 67
59 return ret; 68 return ret;
60} 69}
61 70
62int ftrace_profile_enable(int event_id) 71int perf_trace_enable(int event_id)
63{ 72{
64 struct ftrace_event_call *event; 73 struct ftrace_event_call *event;
65 int ret = -EINVAL; 74 int ret = -EINVAL;
66 75
67 mutex_lock(&event_mutex); 76 mutex_lock(&event_mutex);
68 list_for_each_entry(event, &ftrace_events, list) { 77 list_for_each_entry(event, &ftrace_events, list) {
69 if (event->id == event_id && event->profile_enable && 78 if (event->id == event_id && event->perf_event_enable &&
70 try_module_get(event->mod)) { 79 try_module_get(event->mod)) {
71 ret = ftrace_profile_enable_event(event); 80 ret = perf_trace_event_enable(event);
72 break; 81 break;
73 } 82 }
74 } 83 }
@@ -77,16 +86,16 @@ int ftrace_profile_enable(int event_id)
77 return ret; 86 return ret;
78} 87}
79 88
80static void ftrace_profile_disable_event(struct ftrace_event_call *event) 89static void perf_trace_event_disable(struct ftrace_event_call *event)
81{ 90{
82 char *buf, *nmi_buf; 91 char *buf, *nmi_buf;
83 92
84 if (--event->profile_count > 0) 93 if (--event->perf_refcount > 0)
85 return; 94 return;
86 95
87 event->profile_disable(event); 96 event->perf_event_disable(event);
88 97
89 if (!--total_profile_count) { 98 if (!--total_ref_count) {
90 buf = perf_trace_buf; 99 buf = perf_trace_buf;
91 rcu_assign_pointer(perf_trace_buf, NULL); 100 rcu_assign_pointer(perf_trace_buf, NULL);
92 101
@@ -104,14 +113,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
104 } 113 }
105} 114}
106 115
107void ftrace_profile_disable(int event_id) 116void perf_trace_disable(int event_id)
108{ 117{
109 struct ftrace_event_call *event; 118 struct ftrace_event_call *event;
110 119
111 mutex_lock(&event_mutex); 120 mutex_lock(&event_mutex);
112 list_for_each_entry(event, &ftrace_events, list) { 121 list_for_each_entry(event, &ftrace_events, list) {
113 if (event->id == event_id) { 122 if (event->id == event_id) {
114 ftrace_profile_disable_event(event); 123 perf_trace_event_disable(event);
115 module_put(event->mod); 124 module_put(event->mod);
116 break; 125 break;
117 } 126 }
@@ -119,13 +128,15 @@ void ftrace_profile_disable(int event_id)
119 mutex_unlock(&event_mutex); 128 mutex_unlock(&event_mutex);
120} 129}
121 130
122__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, 131__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
123 int *rctxp, unsigned long *irq_flags) 132 int *rctxp, unsigned long *irq_flags)
124{ 133{
125 struct trace_entry *entry; 134 struct trace_entry *entry;
126 char *trace_buf, *raw_data; 135 char *trace_buf, *raw_data;
127 int pc, cpu; 136 int pc, cpu;
128 137
138 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
139
129 pc = preempt_count(); 140 pc = preempt_count();
130 141
131 /* Protect the per cpu buffer, begin the rcu read side */ 142 /* Protect the per cpu buffer, begin the rcu read side */
@@ -138,9 +149,9 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
138 cpu = smp_processor_id(); 149 cpu = smp_processor_id();
139 150
140 if (in_nmi()) 151 if (in_nmi())
141 trace_buf = rcu_dereference(perf_trace_buf_nmi); 152 trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
142 else 153 else
143 trace_buf = rcu_dereference(perf_trace_buf); 154 trace_buf = rcu_dereference_sched(perf_trace_buf);
144 155
145 if (!trace_buf) 156 if (!trace_buf)
146 goto err; 157 goto err;
@@ -148,7 +159,7 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
148 raw_data = per_cpu_ptr(trace_buf, cpu); 159 raw_data = per_cpu_ptr(trace_buf, cpu);
149 160
150 /* zero the dead bytes from align to not leak stack to user */ 161 /* zero the dead bytes from align to not leak stack to user */
151 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 162 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
152 163
153 entry = (struct trace_entry *)raw_data; 164 entry = (struct trace_entry *)raw_data;
154 tracing_generic_entry_update(entry, *irq_flags, pc); 165 tracing_generic_entry_update(entry, *irq_flags, pc);
@@ -161,4 +172,4 @@ err_recursion:
161 local_irq_restore(*irq_flags); 172 local_irq_restore(*irq_flags);
162 return NULL; 173 return NULL;
163} 174}
164EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare); 175EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
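The hunk above renames the profile machinery to perf_trace_* but keeps its allocation scheme: a single total_ref_count guards two shared per-cpu buffers (one for normal context, one for NMI) that are allocated by the first event enabling perf tracing and freed by the last one disabling it. Below is a minimal userspace C sketch of that refcounting pattern only; it is not part of the patch, and calloc/free stand in for alloc_percpu()/free_percpu().

#include <stdio.h>
#include <stdlib.h>

static char *trace_buf;		/* stands in for perf_trace_buf */
static char *trace_buf_nmi;	/* stands in for perf_trace_buf_nmi */
static int total_ref_count;

/* first caller allocates both shared buffers, later callers only take a ref */
static int sketch_enable(void)
{
	if (!total_ref_count) {
		trace_buf = calloc(1, 8192);
		if (!trace_buf)
			return -1;
		trace_buf_nmi = calloc(1, 8192);
		if (!trace_buf_nmi) {
			free(trace_buf);
			trace_buf = NULL;
			return -1;
		}
	}
	total_ref_count++;
	return 0;
}

/* last caller to drop its reference releases both buffers */
static void sketch_disable(void)
{
	if (--total_ref_count)
		return;
	free(trace_buf_nmi);
	free(trace_buf);
	trace_buf = trace_buf_nmi = NULL;
}

int main(void)
{
	sketch_enable();
	sketch_enable();
	sketch_disable();
	sketch_disable();	/* buffers are freed here */
	printf("ref count is now %d\n", total_ref_count);
	return 0;
}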
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 189b09baf4fb..c697c7043349 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,6 +15,7 @@
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/slab.h>
18#include <linux/delay.h> 19#include <linux/delay.h>
19 20
20#include <asm/setup.h> 21#include <asm/setup.h>
@@ -60,10 +61,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
60 return 0; 61 return 0;
61 62
62err: 63err:
63 if (field) { 64 if (field)
64 kfree(field->name); 65 kfree(field->name);
65 kfree(field->type);
66 }
67 kfree(field); 66 kfree(field);
68 67
69 return -ENOMEM; 68 return -ENOMEM;
@@ -520,41 +519,16 @@ out:
520 return ret; 519 return ret;
521} 520}
522 521
523extern char *__bad_type_size(void);
524
525#undef FIELD
526#define FIELD(type, name) \
527 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
528 #type, "common_" #name, offsetof(typeof(field), name), \
529 sizeof(field.name), is_signed_type(type)
530
531static int trace_write_header(struct trace_seq *s)
532{
533 struct trace_entry field;
534
535 /* struct trace_entry */
536 return trace_seq_printf(s,
537 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
538 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
539 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
540 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
541 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
542 "\n",
543 FIELD(unsigned short, type),
544 FIELD(unsigned char, flags),
545 FIELD(unsigned char, preempt_count),
546 FIELD(int, pid),
547 FIELD(int, lock_depth));
548}
549
550static ssize_t 522static ssize_t
551event_format_read(struct file *filp, char __user *ubuf, size_t cnt, 523event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
552 loff_t *ppos) 524 loff_t *ppos)
553{ 525{
554 struct ftrace_event_call *call = filp->private_data; 526 struct ftrace_event_call *call = filp->private_data;
527 struct ftrace_event_field *field;
555 struct trace_seq *s; 528 struct trace_seq *s;
529 int common_field_count = 5;
556 char *buf; 530 char *buf;
557 int r; 531 int r = 0;
558 532
559 if (*ppos) 533 if (*ppos)
560 return 0; 534 return 0;
@@ -565,14 +539,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
565 539
566 trace_seq_init(s); 540 trace_seq_init(s);
567 541
568 /* If any of the first writes fail, so will the show_format. */
569
570 trace_seq_printf(s, "name: %s\n", call->name); 542 trace_seq_printf(s, "name: %s\n", call->name);
571 trace_seq_printf(s, "ID: %d\n", call->id); 543 trace_seq_printf(s, "ID: %d\n", call->id);
572 trace_seq_printf(s, "format:\n"); 544 trace_seq_printf(s, "format:\n");
573 trace_write_header(s);
574 545
575 r = call->show_format(call, s); 546 list_for_each_entry_reverse(field, &call->fields, link) {
547 /*
548 * Smartly shows the array type(except dynamic array).
549 * Normal:
550 * field:TYPE VAR
551 * If TYPE := TYPE[LEN], it is shown:
552 * field:TYPE VAR[LEN]
553 */
554 const char *array_descriptor = strchr(field->type, '[');
555
556 if (!strncmp(field->type, "__data_loc", 10))
557 array_descriptor = NULL;
558
559 if (!array_descriptor) {
560 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
561 "\tsize:%u;\tsigned:%d;\n",
562 field->type, field->name, field->offset,
563 field->size, !!field->is_signed);
564 } else {
565 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
566 "\tsize:%u;\tsigned:%d;\n",
567 (int)(array_descriptor - field->type),
568 field->type, field->name,
569 array_descriptor, field->offset,
570 field->size, !!field->is_signed);
571 }
572
573 if (--common_field_count == 0)
574 r = trace_seq_printf(s, "\n");
575
576 if (!r)
577 break;
578 }
579
580 if (r)
581 r = trace_seq_printf(s, "\nprint fmt: %s\n",
582 call->print_fmt);
583
576 if (!r) { 584 if (!r) {
577 /* 585 /*
578 * ug! The format output is bigger than a PAGE!! 586 * ug! The format output is bigger than a PAGE!!
@@ -931,7 +939,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
931 trace_create_file("enable", 0644, call->dir, call, 939 trace_create_file("enable", 0644, call->dir, call,
932 enable); 940 enable);
933 941
934 if (call->id && call->profile_enable) 942 if (call->id && call->perf_event_enable)
935 trace_create_file("id", 0444, call->dir, call, 943 trace_create_file("id", 0444, call->dir, call,
936 id); 944 id);
937 945
@@ -948,10 +956,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
948 filter); 956 filter);
949 } 957 }
950 958
951 /* A trace may not want to export its format */
952 if (!call->show_format)
953 return 0;
954
955 trace_create_file("format", 0444, call->dir, call, 959 trace_create_file("format", 0444, call->dir, call,
956 format); 960 format);
957 961
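With this change event_format_read() builds the format file by walking call->fields instead of calling a per-event show_format() callback, and it detects fixed-size array fields by looking for '[' in the stored type string (skipping __data_loc dynamic arrays) so that a "TYPE[LEN]" type is printed as "TYPE VAR[LEN]". The self-contained sketch below shows just that formatting decision; it is illustrative only, and the field names and offsets are invented.

#include <stdio.h>
#include <string.h>

static void print_field(const char *type, const char *name,
			unsigned int offset, unsigned int size, int is_signed)
{
	const char *array_descriptor = strchr(type, '[');

	/* dynamic arrays keep their __data_loc type string untouched */
	if (!strncmp(type, "__data_loc", 10))
		array_descriptor = NULL;

	if (!array_descriptor)
		printf("\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
		       type, name, offset, size, !!is_signed);
	else
		printf("\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
		       (int)(array_descriptor - type), type, name,
		       array_descriptor, offset, size, !!is_signed);
}

int main(void)
{
	print_field("int", "pid", 8, 4, 1);
	print_field("char[16]", "comm", 12, 16, 0); /* -> field:char comm[16] */
	return 0;
}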
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 4615f62a04f1..88c0b6dbd7fe 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -22,6 +22,7 @@
22#include <linux/ctype.h> 22#include <linux/ctype.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/perf_event.h> 24#include <linux/perf_event.h>
25#include <linux/slab.h>
25 26
26#include "trace.h" 27#include "trace.h"
27#include "trace_output.h" 28#include "trace_output.h"
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d4fa5dc1ee4e..e091f64ba6ce 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \
62 62
63#include "trace_entries.h" 63#include "trace_entries.h"
64 64
65
66#undef __field
67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
70 offsetof(typeof(field), item), \
71 sizeof(field.item), is_signed_type(type)); \
72 if (!ret) \
73 return 0;
74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item), \
81 is_signed_type(type)); \
82 if (!ret) \
83 return 0;
84
85#undef __array
86#define __array(type, item, len) \
87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
89 offsetof(typeof(field), item), \
90 sizeof(field.item), is_signed_type(type)); \
91 if (!ret) \
92 return 0;
93
94#undef __array_desc
95#define __array_desc(type, container, item, len) \
96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
98 offsetof(typeof(field), container.item), \
99 sizeof(field.container.item), \
100 is_signed_type(type)); \
101 if (!ret) \
102 return 0;
103
104#undef __dynamic_array
105#define __dynamic_array(type, item) \
106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
110 if (!ret) \
111 return 0;
112
113#undef F_printk
114#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
115
116#undef __entry
117#define __entry REC
118
119#undef FTRACE_ENTRY
120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
121static int \
122ftrace_format_##name(struct ftrace_event_call *unused, \
123 struct trace_seq *s) \
124{ \
125 struct struct_name field __attribute__((unused)); \
126 int ret = 0; \
127 \
128 tstruct; \
129 \
130 trace_seq_printf(s, "\nprint fmt: " print); \
131 \
132 return ret; \
133}
134
135#include "trace_entries.h"
136
137#undef __field 65#undef __field
138#define __field(type, item) \ 66#define __field(type, item) \
139 ret = trace_define_field(event_call, #type, #item, \ 67 ret = trace_define_field(event_call, #type, #item, \
@@ -175,7 +103,12 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
175 return ret; 103 return ret;
176 104
177#undef __dynamic_array 105#undef __dynamic_array
178#define __dynamic_array(type, item) 106#define __dynamic_array(type, item) \
107 ret = trace_define_field(event_call, #type, #item, \
108 offsetof(typeof(field), item), \
109 0, is_signed_type(type), FILTER_OTHER);\
110 if (ret) \
111 return ret;
179 112
180#undef FTRACE_ENTRY 113#undef FTRACE_ENTRY
181#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 114#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
@@ -198,6 +131,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
198 return 0; 131 return 0;
199} 132}
200 133
134#undef __entry
135#define __entry REC
136
201#undef __field 137#undef __field
202#define __field(type, item) 138#define __field(type, item)
203 139
@@ -213,6 +149,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
213#undef __dynamic_array 149#undef __dynamic_array
214#define __dynamic_array(type, item) 150#define __dynamic_array(type, item)
215 151
152#undef F_printk
153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154
216#undef FTRACE_ENTRY 155#undef FTRACE_ENTRY
217#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 156#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
218 \ 157 \
@@ -223,7 +162,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
223 .id = type, \ 162 .id = type, \
224 .system = __stringify(TRACE_SYSTEM), \ 163 .system = __stringify(TRACE_SYSTEM), \
225 .raw_init = ftrace_raw_init_event, \ 164 .raw_init = ftrace_raw_init_event, \
226 .show_format = ftrace_format_##call, \ 165 .print_fmt = print, \
227 .define_fields = ftrace_define_fields_##call, \ 166 .define_fields = ftrace_define_fields_##call, \
228}; \ 167}; \
229 168
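With the generated ftrace_format_##name() helpers removed, each F-event now carries a ready-made print_fmt string produced by the new F_printk(), which stringifies the format and its arguments at compile time. The tiny standalone example below illustrates that stringification trick using the same GNU-style variadic macro shape as the patch; the "func" field is invented for the example and the code is not part of the patch.

#include <stdio.h>

#define __stringify_1(x...)	#x
#define __stringify(x...)	__stringify_1(x)

/* same shape as the definitions introduced above */
#define __entry			REC
#define F_printk(fmt, args...)	#fmt ", " __stringify(args)

int main(void)
{
	/* REC->func is never evaluated, only turned into text */
	const char *print_fmt = F_printk("func=%lx", __entry->func);

	puts(print_fmt);	/* prints: "func=%lx", REC->func */
	return 0;
}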
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b1342c5d37cf..9aed1a5cf553 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -9,6 +9,7 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/slab.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13 14
14#include "trace.h" 15#include "trace.h"
@@ -18,6 +19,7 @@ struct fgraph_cpu_data {
18 pid_t last_pid; 19 pid_t last_pid;
19 int depth; 20 int depth;
20 int ignore; 21 int ignore;
22 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
21}; 23};
22 24
23struct fgraph_data { 25struct fgraph_data {
@@ -187,7 +189,7 @@ static int __trace_graph_entry(struct trace_array *tr,
187 struct ring_buffer *buffer = tr->buffer; 189 struct ring_buffer *buffer = tr->buffer;
188 struct ftrace_graph_ent_entry *entry; 190 struct ftrace_graph_ent_entry *entry;
189 191
190 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 192 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
191 return 0; 193 return 0;
192 194
193 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 195 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -212,13 +214,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
212 int cpu; 214 int cpu;
213 int pc; 215 int pc;
214 216
215 if (unlikely(!tr))
216 return 0;
217
218 if (!ftrace_trace_task(current)) 217 if (!ftrace_trace_task(current))
219 return 0; 218 return 0;
220 219
221 if (!ftrace_graph_addr(trace->func)) 220 /* trace it when it is-nested-in or is a function enabled. */
221 if (!(trace->depth || ftrace_graph_addr(trace->func)))
222 return 0; 222 return 0;
223 223
224 local_irq_save(flags); 224 local_irq_save(flags);
@@ -231,9 +231,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
231 } else { 231 } else {
232 ret = 0; 232 ret = 0;
233 } 233 }
234 /* Only do the atomic if it is not already set */
235 if (!test_tsk_trace_graph(current))
236 set_tsk_trace_graph(current);
237 234
238 atomic_dec(&data->disabled); 235 atomic_dec(&data->disabled);
239 local_irq_restore(flags); 236 local_irq_restore(flags);
@@ -241,6 +238,14 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
241 return ret; 238 return ret;
242} 239}
243 240
241int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
242{
243 if (tracing_thresh)
244 return 1;
245 else
246 return trace_graph_entry(trace);
247}
248
244static void __trace_graph_return(struct trace_array *tr, 249static void __trace_graph_return(struct trace_array *tr,
245 struct ftrace_graph_ret *trace, 250 struct ftrace_graph_ret *trace,
246 unsigned long flags, 251 unsigned long flags,
@@ -251,7 +256,7 @@ static void __trace_graph_return(struct trace_array *tr,
251 struct ring_buffer *buffer = tr->buffer; 256 struct ring_buffer *buffer = tr->buffer;
252 struct ftrace_graph_ret_entry *entry; 257 struct ftrace_graph_ret_entry *entry;
253 258
254 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 259 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
255 return; 260 return;
256 261
257 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 262 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -281,19 +286,39 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
281 pc = preempt_count(); 286 pc = preempt_count();
282 __trace_graph_return(tr, trace, flags, pc); 287 __trace_graph_return(tr, trace, flags, pc);
283 } 288 }
284 if (!trace->depth)
285 clear_tsk_trace_graph(current);
286 atomic_dec(&data->disabled); 289 atomic_dec(&data->disabled);
287 local_irq_restore(flags); 290 local_irq_restore(flags);
288} 291}
289 292
293void set_graph_array(struct trace_array *tr)
294{
295 graph_array = tr;
296
297 /* Make graph_array visible before we start tracing */
298
299 smp_mb();
300}
301
302void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
303{
304 if (tracing_thresh &&
305 (trace->rettime - trace->calltime < tracing_thresh))
306 return;
307 else
308 trace_graph_return(trace);
309}
310
290static int graph_trace_init(struct trace_array *tr) 311static int graph_trace_init(struct trace_array *tr)
291{ 312{
292 int ret; 313 int ret;
293 314
294 graph_array = tr; 315 set_graph_array(tr);
295 ret = register_ftrace_graph(&trace_graph_return, 316 if (tracing_thresh)
296 &trace_graph_entry); 317 ret = register_ftrace_graph(&trace_graph_thresh_return,
318 &trace_graph_thresh_entry);
319 else
320 ret = register_ftrace_graph(&trace_graph_return,
321 &trace_graph_entry);
297 if (ret) 322 if (ret)
298 return ret; 323 return ret;
299 tracing_start_cmdline_record(); 324 tracing_start_cmdline_record();
@@ -301,11 +326,6 @@ static int graph_trace_init(struct trace_array *tr)
301 return 0; 326 return 0;
302} 327}
303 328
304void set_graph_array(struct trace_array *tr)
305{
306 graph_array = tr;
307}
308
309static void graph_trace_reset(struct trace_array *tr) 329static void graph_trace_reset(struct trace_array *tr)
310{ 330{
311 tracing_stop_cmdline_record(); 331 tracing_stop_cmdline_record();
@@ -673,15 +693,21 @@ print_graph_entry_leaf(struct trace_iterator *iter,
673 duration = graph_ret->rettime - graph_ret->calltime; 693 duration = graph_ret->rettime - graph_ret->calltime;
674 694
675 if (data) { 695 if (data) {
696 struct fgraph_cpu_data *cpu_data;
676 int cpu = iter->cpu; 697 int cpu = iter->cpu;
677 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 698
699 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
678 700
679 /* 701 /*
680 * Comments display at + 1 to depth. Since 702 * Comments display at + 1 to depth. Since
681 * this is a leaf function, keep the comments 703 * this is a leaf function, keep the comments
682 * equal to this depth. 704 * equal to this depth.
683 */ 705 */
684 *depth = call->depth - 1; 706 cpu_data->depth = call->depth - 1;
707
708 /* No need to keep this function around for this depth */
709 if (call->depth < FTRACE_RETFUNC_DEPTH)
710 cpu_data->enter_funcs[call->depth] = 0;
685 } 711 }
686 712
687 /* Overhead */ 713 /* Overhead */
@@ -721,10 +747,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
721 int i; 747 int i;
722 748
723 if (data) { 749 if (data) {
750 struct fgraph_cpu_data *cpu_data;
724 int cpu = iter->cpu; 751 int cpu = iter->cpu;
725 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
726 752
727 *depth = call->depth; 753 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
754 cpu_data->depth = call->depth;
755
756 /* Save this function pointer to see if the exit matches */
757 if (call->depth < FTRACE_RETFUNC_DEPTH)
758 cpu_data->enter_funcs[call->depth] = call->func;
728 } 759 }
729 760
730 /* No overhead */ 761 /* No overhead */
@@ -854,19 +885,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
854 struct fgraph_data *data = iter->private; 885 struct fgraph_data *data = iter->private;
855 pid_t pid = ent->pid; 886 pid_t pid = ent->pid;
856 int cpu = iter->cpu; 887 int cpu = iter->cpu;
888 int func_match = 1;
857 int ret; 889 int ret;
858 int i; 890 int i;
859 891
860 if (data) { 892 if (data) {
893 struct fgraph_cpu_data *cpu_data;
861 int cpu = iter->cpu; 894 int cpu = iter->cpu;
862 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 895
896 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
863 897
864 /* 898 /*
865 * Comments display at + 1 to depth. This is the 899 * Comments display at + 1 to depth. This is the
866 * return from a function, we now want the comments 900 * return from a function, we now want the comments
867 * to display at the same level of the bracket. 901 * to display at the same level of the bracket.
868 */ 902 */
869 *depth = trace->depth - 1; 903 cpu_data->depth = trace->depth - 1;
904
905 if (trace->depth < FTRACE_RETFUNC_DEPTH) {
906 if (cpu_data->enter_funcs[trace->depth] != trace->func)
907 func_match = 0;
908 cpu_data->enter_funcs[trace->depth] = 0;
909 }
870 } 910 }
871 911
872 if (print_graph_prologue(iter, s, 0, 0)) 912 if (print_graph_prologue(iter, s, 0, 0))
@@ -891,9 +931,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
891 return TRACE_TYPE_PARTIAL_LINE; 931 return TRACE_TYPE_PARTIAL_LINE;
892 } 932 }
893 933
894 ret = trace_seq_printf(s, "}\n"); 934 /*
895 if (!ret) 935 * If the return function does not have a matching entry,
896 return TRACE_TYPE_PARTIAL_LINE; 936 * then the entry was lost. Instead of just printing
937 * the '}' and letting the user guess what function this
938 * belongs to, write out the function name.
939 */
940 if (func_match) {
941 ret = trace_seq_printf(s, "}\n");
942 if (!ret)
943 return TRACE_TYPE_PARTIAL_LINE;
944 } else {
945 ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
946 if (!ret)
947 return TRACE_TYPE_PARTIAL_LINE;
948 }
897 949
898 /* Overrun */ 950 /* Overrun */
899 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 951 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
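The function-graph changes above record, per CPU and per depth, which function was entered, so that print_graph_return() can tell whether the closing brace it is about to emit matches the recorded entry; if the entry event was lost, the brace is annotated with the function (via %ps in the real code). The userspace sketch below shows only the depth-indexed matching and prints a raw address instead of resolving a symbol; it is not part of the patch.

#include <stdio.h>

#define RETFUNC_DEPTH 50

static unsigned long enter_funcs[RETFUNC_DEPTH];

/* entry hook: remember which function was entered at this depth */
static void on_entry(int depth, unsigned long func)
{
	if (depth < RETFUNC_DEPTH)
		enter_funcs[depth] = func;
}

/* return hook: print a bare brace only when the recorded entry matches */
static void on_return(int depth, unsigned long func)
{
	int func_match = 1;

	if (depth < RETFUNC_DEPTH) {
		if (enter_funcs[depth] != func)
			func_match = 0;	/* the entry event was lost */
		enter_funcs[depth] = 0;
	}

	if (func_match)
		printf("}\n");
	else
		printf("} /* %#lx */\n", func);
}

int main(void)
{
	on_entry(0, 0x1000);
	on_return(0, 0x1000);	/* matching entry: plain "}" */
	on_return(0, 0x2000);	/* lost entry: annotated brace */
	return 0;
}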
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6178abf3637e..1251e367bae9 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -635,12 +635,12 @@ static int create_trace_probe(int argc, char **argv)
635 event = strchr(group, '/') + 1; 635 event = strchr(group, '/') + 1;
636 event[-1] = '\0'; 636 event[-1] = '\0';
637 if (strlen(group) == 0) { 637 if (strlen(group) == 0) {
638 pr_info("Group name is not specifiled\n"); 638 pr_info("Group name is not specified\n");
639 return -EINVAL; 639 return -EINVAL;
640 } 640 }
641 } 641 }
642 if (strlen(event) == 0) { 642 if (strlen(event) == 0) {
643 pr_info("Event name is not specifiled\n"); 643 pr_info("Event name is not specified\n");
644 return -EINVAL; 644 return -EINVAL;
645 } 645 }
646 } 646 }
@@ -673,7 +673,7 @@ static int create_trace_probe(int argc, char **argv)
673 return -EINVAL; 673 return -EINVAL;
674 } 674 }
675 /* an address specified */ 675 /* an address specified */
676 ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); 676 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
677 if (ret) { 677 if (ret) {
678 pr_info("Failed to parse address.\n"); 678 pr_info("Failed to parse address.\n");
679 return ret; 679 return ret;
@@ -1155,86 +1155,66 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1155 return 0; 1155 return 0;
1156} 1156}
1157 1157
1158static int __probe_event_show_format(struct trace_seq *s, 1158static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1159 struct trace_probe *tp, const char *fmt,
1160 const char *arg)
1161{ 1159{
1162 int i; 1160 int i;
1161 int pos = 0;
1163 1162
1164 /* Show format */ 1163 const char *fmt, *arg;
1165 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1166 return 0;
1167 1164
1168 for (i = 0; i < tp->nr_args; i++) 1165 if (!probe_is_return(tp)) {
1169 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) 1166 fmt = "(%lx)";
1170 return 0; 1167 arg = "REC->" FIELD_STRING_IP;
1168 } else {
1169 fmt = "(%lx <- %lx)";
1170 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
1171 }
1171 1172
1172 if (!trace_seq_printf(s, "\", %s", arg)) 1173 /* When len=0, we just calculate the needed length */
1173 return 0; 1174#define LEN_OR_ZERO (len ? len - pos : 0)
1174 1175
1175 for (i = 0; i < tp->nr_args; i++) 1176 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1176 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1177 return 0;
1178 1177
1179 return trace_seq_puts(s, "\n"); 1178 for (i = 0; i < tp->nr_args; i++) {
1180} 1179 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx",
1180 tp->args[i].name);
1181 }
1181 1182
1182#undef SHOW_FIELD 1183 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1183#define SHOW_FIELD(type, item, name) \
1184 do { \
1185 ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \
1186 "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\
1187 (unsigned int)offsetof(typeof(field), item),\
1188 (unsigned int)sizeof(type), \
1189 is_signed_type(type)); \
1190 if (!ret) \
1191 return 0; \
1192 } while (0)
1193 1184
1194static int kprobe_event_show_format(struct ftrace_event_call *call, 1185 for (i = 0; i < tp->nr_args; i++) {
1195 struct trace_seq *s) 1186 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1196{ 1187 tp->args[i].name);
1197 struct kprobe_trace_entry field __attribute__((unused)); 1188 }
1198 int ret, i;
1199 struct trace_probe *tp = (struct trace_probe *)call->data;
1200
1201 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP);
1202 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1203 1189
1204 /* Show fields */ 1190#undef LEN_OR_ZERO
1205 for (i = 0; i < tp->nr_args; i++)
1206 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1207 trace_seq_puts(s, "\n");
1208 1191
1209 return __probe_event_show_format(s, tp, "(%lx)", 1192 /* return the length of print_fmt */
1210 "REC->" FIELD_STRING_IP); 1193 return pos;
1211} 1194}
1212 1195
1213static int kretprobe_event_show_format(struct ftrace_event_call *call, 1196static int set_print_fmt(struct trace_probe *tp)
1214 struct trace_seq *s)
1215{ 1197{
1216 struct kretprobe_trace_entry field __attribute__((unused)); 1198 int len;
1217 int ret, i; 1199 char *print_fmt;
1218 struct trace_probe *tp = (struct trace_probe *)call->data;
1219 1200
1220 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); 1201 /* First: called with 0 length to calculate the needed length */
1221 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); 1202 len = __set_print_fmt(tp, NULL, 0);
1222 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); 1203 print_fmt = kmalloc(len + 1, GFP_KERNEL);
1204 if (!print_fmt)
1205 return -ENOMEM;
1223 1206
1224 /* Show fields */ 1207 /* Second: actually write the @print_fmt */
1225 for (i = 0; i < tp->nr_args; i++) 1208 __set_print_fmt(tp, print_fmt, len + 1);
1226 SHOW_FIELD(unsigned long, args[i], tp->args[i].name); 1209 tp->call.print_fmt = print_fmt;
1227 trace_seq_puts(s, "\n");
1228 1210
1229 return __probe_event_show_format(s, tp, "(%lx <- %lx)", 1211 return 0;
1230 "REC->" FIELD_STRING_FUNC
1231 ", REC->" FIELD_STRING_RETIP);
1232} 1212}
1233 1213
1234#ifdef CONFIG_PERF_EVENTS 1214#ifdef CONFIG_PERF_EVENTS
1235 1215
1236/* Kprobe profile handler */ 1216/* Kprobe profile handler */
1237static __kprobes void kprobe_profile_func(struct kprobe *kp, 1217static __kprobes void kprobe_perf_func(struct kprobe *kp,
1238 struct pt_regs *regs) 1218 struct pt_regs *regs)
1239{ 1219{
1240 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1220 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
@@ -1247,11 +1227,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp,
1247 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1227 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1248 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1228 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1249 size -= sizeof(u32); 1229 size -= sizeof(u32);
1250 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1230 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1251 "profile buffer not large enough")) 1231 "profile buffer not large enough"))
1252 return; 1232 return;
1253 1233
1254 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); 1234 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
1255 if (!entry) 1235 if (!entry)
1256 return; 1236 return;
1257 1237
@@ -1260,11 +1240,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp,
1260 for (i = 0; i < tp->nr_args; i++) 1240 for (i = 0; i < tp->nr_args; i++)
1261 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1241 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1262 1242
1263 ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags); 1243 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
1264} 1244}
1265 1245
1266/* Kretprobe profile handler */ 1246/* Kretprobe profile handler */
1267static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, 1247static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1268 struct pt_regs *regs) 1248 struct pt_regs *regs)
1269{ 1249{
1270 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1250 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1277,11 +1257,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
1277 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1257 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1278 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1258 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1279 size -= sizeof(u32); 1259 size -= sizeof(u32);
1280 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1260 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1281 "profile buffer not large enough")) 1261 "profile buffer not large enough"))
1282 return; 1262 return;
1283 1263
1284 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); 1264 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
1285 if (!entry) 1265 if (!entry)
1286 return; 1266 return;
1287 1267
@@ -1291,10 +1271,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
1291 for (i = 0; i < tp->nr_args; i++) 1271 for (i = 0; i < tp->nr_args; i++)
1292 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1272 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1293 1273
1294 ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags); 1274 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
1275 irq_flags, regs);
1295} 1276}
1296 1277
1297static int probe_profile_enable(struct ftrace_event_call *call) 1278static int probe_perf_enable(struct ftrace_event_call *call)
1298{ 1279{
1299 struct trace_probe *tp = (struct trace_probe *)call->data; 1280 struct trace_probe *tp = (struct trace_probe *)call->data;
1300 1281
@@ -1306,7 +1287,7 @@ static int probe_profile_enable(struct ftrace_event_call *call)
1306 return enable_kprobe(&tp->rp.kp); 1287 return enable_kprobe(&tp->rp.kp);
1307} 1288}
1308 1289
1309static void probe_profile_disable(struct ftrace_event_call *call) 1290static void probe_perf_disable(struct ftrace_event_call *call)
1310{ 1291{
1311 struct trace_probe *tp = (struct trace_probe *)call->data; 1292 struct trace_probe *tp = (struct trace_probe *)call->data;
1312 1293
@@ -1331,7 +1312,7 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1331 kprobe_trace_func(kp, regs); 1312 kprobe_trace_func(kp, regs);
1332#ifdef CONFIG_PERF_EVENTS 1313#ifdef CONFIG_PERF_EVENTS
1333 if (tp->flags & TP_FLAG_PROFILE) 1314 if (tp->flags & TP_FLAG_PROFILE)
1334 kprobe_profile_func(kp, regs); 1315 kprobe_perf_func(kp, regs);
1335#endif 1316#endif
1336 return 0; /* We don't tweek kernel, so just return 0 */ 1317 return 0; /* We don't tweek kernel, so just return 0 */
1337} 1318}
@@ -1345,7 +1326,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1345 kretprobe_trace_func(ri, regs); 1326 kretprobe_trace_func(ri, regs);
1346#ifdef CONFIG_PERF_EVENTS 1327#ifdef CONFIG_PERF_EVENTS
1347 if (tp->flags & TP_FLAG_PROFILE) 1328 if (tp->flags & TP_FLAG_PROFILE)
1348 kretprobe_profile_func(ri, regs); 1329 kretprobe_perf_func(ri, regs);
1349#endif 1330#endif
1350 return 0; /* We don't tweek kernel, so just return 0 */ 1331 return 0; /* We don't tweek kernel, so just return 0 */
1351} 1332}
@@ -1359,30 +1340,33 @@ static int register_probe_event(struct trace_probe *tp)
1359 if (probe_is_return(tp)) { 1340 if (probe_is_return(tp)) {
1360 tp->event.trace = print_kretprobe_event; 1341 tp->event.trace = print_kretprobe_event;
1361 call->raw_init = probe_event_raw_init; 1342 call->raw_init = probe_event_raw_init;
1362 call->show_format = kretprobe_event_show_format;
1363 call->define_fields = kretprobe_event_define_fields; 1343 call->define_fields = kretprobe_event_define_fields;
1364 } else { 1344 } else {
1365 tp->event.trace = print_kprobe_event; 1345 tp->event.trace = print_kprobe_event;
1366 call->raw_init = probe_event_raw_init; 1346 call->raw_init = probe_event_raw_init;
1367 call->show_format = kprobe_event_show_format;
1368 call->define_fields = kprobe_event_define_fields; 1347 call->define_fields = kprobe_event_define_fields;
1369 } 1348 }
1349 if (set_print_fmt(tp) < 0)
1350 return -ENOMEM;
1370 call->event = &tp->event; 1351 call->event = &tp->event;
1371 call->id = register_ftrace_event(&tp->event); 1352 call->id = register_ftrace_event(&tp->event);
1372 if (!call->id) 1353 if (!call->id) {
1354 kfree(call->print_fmt);
1373 return -ENODEV; 1355 return -ENODEV;
1356 }
1374 call->enabled = 0; 1357 call->enabled = 0;
1375 call->regfunc = probe_event_enable; 1358 call->regfunc = probe_event_enable;
1376 call->unregfunc = probe_event_disable; 1359 call->unregfunc = probe_event_disable;
1377 1360
1378#ifdef CONFIG_PERF_EVENTS 1361#ifdef CONFIG_PERF_EVENTS
1379 call->profile_enable = probe_profile_enable; 1362 call->perf_event_enable = probe_perf_enable;
1380 call->profile_disable = probe_profile_disable; 1363 call->perf_event_disable = probe_perf_disable;
1381#endif 1364#endif
1382 call->data = tp; 1365 call->data = tp;
1383 ret = trace_add_event_call(call); 1366 ret = trace_add_event_call(call);
1384 if (ret) { 1367 if (ret) {
1385 pr_info("Failed to register kprobe event: %s\n", call->name); 1368 pr_info("Failed to register kprobe event: %s\n", call->name);
1369 kfree(call->print_fmt);
1386 unregister_ftrace_event(&tp->event); 1370 unregister_ftrace_event(&tp->event);
1387 } 1371 }
1388 return ret; 1372 return ret;
@@ -1392,6 +1376,7 @@ static void unregister_probe_event(struct trace_probe *tp)
1392{ 1376{
1393 /* tp->event is unregistered in trace_remove_event_call() */ 1377 /* tp->event is unregistered in trace_remove_event_call() */
1394 trace_remove_event_call(&tp->call); 1378 trace_remove_event_call(&tp->call);
1379 kfree(tp->call.print_fmt);
1395} 1380}
1396 1381
1397/* Make a debugfs interface for controling probe points */ 1382/* Make a debugfs interface for controling probe points */
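set_print_fmt()/__set_print_fmt() above replace the old show_format callbacks with a precomputed string, built with a two-pass snprintf idiom: the first call passes len == 0 and only measures, the second writes into a buffer of exactly the measured size. The standalone sketch below shows the idiom with invented argument names; it is illustrative only, not the kernel code.

#include <stdio.h>
#include <stdlib.h>

static int build_print_fmt(char *buf, int len, const char *names[], int nr)
{
	int i, pos = 0;

/* when len == 0 nothing is written; only the needed length is accumulated */
#define LEN_OR_ZERO (len ? len - pos : 0)

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"(%%lx)");
	for (i = 0; i < nr; i++)
		pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", names[i]);
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\", REC->ip");
	for (i = 0; i < nr; i++)
		pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", names[i]);

#undef LEN_OR_ZERO

	return pos;	/* length of the final string, NUL not included */
}

int main(void)
{
	const char *names[] = { "arg1", "arg2" };
	int len = build_print_fmt(NULL, 0, names, 2);	/* pass 1: measure */
	char *print_fmt = malloc(len + 1);

	if (!print_fmt)
		return 1;
	build_print_fmt(print_fmt, len + 1, names, 2);	/* pass 2: write */
	puts(print_fmt);
	free(print_fmt);
	return 0;
}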
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 94103cdcf9d8..d59cd6879477 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -23,6 +23,7 @@
23#include <linux/debugfs.h> 23#include <linux/debugfs.h>
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/slab.h>
26#include <linux/fs.h> 27#include <linux/fs.h>
27 28
28#include "trace_output.h" 29#include "trace_output.h"
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0acd834659ed..017fa376505d 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,7 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/mmiotrace.h> 10#include <linux/mmiotrace.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/slab.h>
12#include <linux/time.h> 13#include <linux/time.h>
13 14
14#include <asm/atomic.h> 15#include <asm/atomic.h>
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 280fea470d67..81003b4d617f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -3,6 +3,7 @@
3#include <linux/stringify.h> 3#include <linux/stringify.h>
4#include <linux/kthread.h> 4#include <linux/kthread.h>
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/slab.h>
6 7
7static inline int trace_valid_entry(struct trace_entry *entry) 8static inline int trace_valid_entry(struct trace_entry *entry)
8{ 9{
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 678a5120ee30..f4bc9b27de5f 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
157 unsigned long val, flags; 157 unsigned long val, flags;
158 char buf[64]; 158 char buf[64];
159 int ret; 159 int ret;
160 int cpu;
160 161
161 if (count >= sizeof(buf)) 162 if (count >= sizeof(buf))
162 return -EINVAL; 163 return -EINVAL;
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
171 return ret; 172 return ret;
172 173
173 local_irq_save(flags); 174 local_irq_save(flags);
175
176 /*
177 * In case we trace inside arch_spin_lock() or after (NMI),
178 * we will cause circular lock, so we also need to increase
179 * the percpu trace_active here.
180 */
181 cpu = smp_processor_id();
182 per_cpu(trace_active, cpu)++;
183
174 arch_spin_lock(&max_stack_lock); 184 arch_spin_lock(&max_stack_lock);
175 *ptr = val; 185 *ptr = val;
176 arch_spin_unlock(&max_stack_lock); 186 arch_spin_unlock(&max_stack_lock);
187
188 per_cpu(trace_active, cpu)--;
177 local_irq_restore(flags); 189 local_irq_restore(flags);
178 190
179 return count; 191 return count;
@@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
206 218
207static void *t_start(struct seq_file *m, loff_t *pos) 219static void *t_start(struct seq_file *m, loff_t *pos)
208{ 220{
221 int cpu;
222
209 local_irq_disable(); 223 local_irq_disable();
224
225 cpu = smp_processor_id();
226 per_cpu(trace_active, cpu)++;
227
210 arch_spin_lock(&max_stack_lock); 228 arch_spin_lock(&max_stack_lock);
211 229
212 if (*pos == 0) 230 if (*pos == 0)
@@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
217 235
218static void t_stop(struct seq_file *m, void *p) 236static void t_stop(struct seq_file *m, void *p)
219{ 237{
238 int cpu;
239
220 arch_spin_unlock(&max_stack_lock); 240 arch_spin_unlock(&max_stack_lock);
241
242 cpu = smp_processor_id();
243 per_cpu(trace_active, cpu)--;
244
221 local_irq_enable(); 245 local_irq_enable();
222} 246}
223 247
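The stack tracer hunk raises the per-CPU trace_active counter before taking max_stack_lock so that a trace hit from inside the locked region (including NMI context) bails out instead of spinning on the lock it already holds. The deliberately simplified, single-threaded sketch below shows only the recursion-guard idea; the lock, the per-CPU storage, and NMI context are all elided, and it is not part of the patch.

#include <stdio.h>

static int trace_active;	/* stands in for per_cpu(trace_active, cpu) */

/* the tracer path: refuses to run while the guard is raised */
static void stack_trace_check(void)
{
	if (trace_active)
		return;

	trace_active++;
	printf("sampling stack depth\n");
	trace_active--;
}

/* the sysfs write path: raises the guard around its critical section */
static void write_max_size(long val)
{
	trace_active++;
	stack_trace_check();	/* in the real code this hit would deadlock */
	printf("max size set to %ld\n", val);
	trace_active--;
}

int main(void)
{
	stack_trace_check();	/* runs normally */
	write_max_size(64);	/* nested trace hit is ignored */
	return 0;
}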
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index a4bb239eb987..96cffb269e73 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -10,6 +10,7 @@
10 10
11 11
12#include <linux/list.h> 12#include <linux/list.h>
13#include <linux/slab.h>
13#include <linux/rbtree.h> 14#include <linux/rbtree.h>
14#include <linux/debugfs.h> 15#include <linux/debugfs.h>
15#include "trace_stat.h" 16#include "trace_stat.h"
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 4e332b9e449c..4d6d711717f2 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,6 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/slab.h>
3#include <linux/kernel.h> 4#include <linux/kernel.h>
4#include <linux/ftrace.h> 5#include <linux/ftrace.h>
5#include <linux/perf_event.h> 6#include <linux/perf_event.h>
@@ -143,70 +144,65 @@ extern char *__bad_type_size(void);
143 #type, #name, offsetof(typeof(trace), name), \ 144 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type) 145 sizeof(trace.name), is_signed_type(type)
145 146
146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 147static
148int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
147{ 149{
148 int i; 150 int i;
149 int ret; 151 int pos = 0;
150 struct syscall_metadata *entry = call->data;
151 struct syscall_trace_enter trace;
152 int offset = offsetof(struct syscall_trace_enter, args);
153 152
154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 153 /* When len=0, we just calculate the needed length */
155 "\tsigned:%u;\n", 154#define LEN_OR_ZERO (len ? len - pos : 0)
156 SYSCALL_FIELD(int, nr));
157 if (!ret)
158 return 0;
159 155
156 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
160 for (i = 0; i < entry->nb_args; i++) { 157 for (i = 0; i < entry->nb_args; i++) {
161 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], 158 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
162 entry->args[i]); 159 entry->args[i], sizeof(unsigned long),
163 if (!ret) 160 i == entry->nb_args - 1 ? "" : ", ");
164 return 0;
165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
169 if (!ret)
170 return 0;
171 offset += sizeof(unsigned long);
172 } 161 }
162 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
173 163
174 trace_seq_puts(s, "\nprint fmt: \"");
175 for (i = 0; i < entry->nb_args; i++) { 164 for (i = 0; i < entry->nb_args; i++) {
176 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], 165 pos += snprintf(buf + pos, LEN_OR_ZERO,
177 sizeof(unsigned long), 166 ", ((unsigned long)(REC->%s))", entry->args[i]);
178 i == entry->nb_args - 1 ? "" : ", ");
179 if (!ret)
180 return 0;
181 } 167 }
182 trace_seq_putc(s, '"');
183 168
184 for (i = 0; i < entry->nb_args; i++) { 169#undef LEN_OR_ZERO
185 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
186 entry->args[i]);
187 if (!ret)
188 return 0;
189 }
190 170
191 return trace_seq_putc(s, '\n'); 171 /* return the length of print_fmt */
172 return pos;
192} 173}
193 174
194int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) 175static int set_syscall_print_fmt(struct ftrace_event_call *call)
195{ 176{
196 int ret; 177 char *print_fmt;
197 struct syscall_trace_exit trace; 178 int len;
179 struct syscall_metadata *entry = call->data;
198 180
199 ret = trace_seq_printf(s, 181 if (entry->enter_event != call) {
200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 182 call->print_fmt = "\"0x%lx\", REC->ret";
201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
204 SYSCALL_FIELD(int, nr),
205 SYSCALL_FIELD(long, ret));
206 if (!ret)
207 return 0; 183 return 0;
184 }
208 185
209 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); 186 /* First: called with 0 length to calculate the needed length */
187 len = __set_enter_print_fmt(entry, NULL, 0);
188
189 print_fmt = kmalloc(len + 1, GFP_KERNEL);
190 if (!print_fmt)
191 return -ENOMEM;
192
193 /* Second: actually write the @print_fmt */
194 __set_enter_print_fmt(entry, print_fmt, len + 1);
195 call->print_fmt = print_fmt;
196
197 return 0;
198}
199
200static void free_syscall_print_fmt(struct ftrace_event_call *call)
201{
202 struct syscall_metadata *entry = call->data;
203
204 if (entry->enter_event == call)
205 kfree(call->print_fmt);
210} 206}
211 207
212int syscall_enter_define_fields(struct ftrace_event_call *call) 208int syscall_enter_define_fields(struct ftrace_event_call *call)
@@ -386,12 +382,22 @@ int init_syscall_trace(struct ftrace_event_call *call)
386{ 382{
387 int id; 383 int id;
388 384
389 id = register_ftrace_event(call->event); 385 if (set_syscall_print_fmt(call) < 0)
390 if (!id) 386 return -ENOMEM;
391 return -ENODEV; 387
392 call->id = id; 388 id = trace_event_raw_init(call);
393 INIT_LIST_HEAD(&call->fields); 389
394 return 0; 390 if (id < 0) {
391 free_syscall_print_fmt(call);
392 return id;
393 }
394
395 return id;
396}
397
398unsigned long __init arch_syscall_addr(int nr)
399{
400 return (unsigned long)sys_call_table[nr];
395} 401}
396 402
397int __init init_ftrace_syscalls(void) 403int __init init_ftrace_syscalls(void)
@@ -423,12 +429,12 @@ core_initcall(init_ftrace_syscalls);
423 429
424#ifdef CONFIG_PERF_EVENTS 430#ifdef CONFIG_PERF_EVENTS
425 431
426static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); 432static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
427static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); 433static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
428static int sys_prof_refcount_enter; 434static int sys_perf_refcount_enter;
429static int sys_prof_refcount_exit; 435static int sys_perf_refcount_exit;
430 436
431static void prof_syscall_enter(struct pt_regs *regs, long id) 437static void perf_syscall_enter(struct pt_regs *regs, long id)
432{ 438{
433 struct syscall_metadata *sys_data; 439 struct syscall_metadata *sys_data;
434 struct syscall_trace_enter *rec; 440 struct syscall_trace_enter *rec;
@@ -438,7 +444,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
438 int size; 444 int size;
439 445
440 syscall_nr = syscall_get_nr(current, regs); 446 syscall_nr = syscall_get_nr(current, regs);
441 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 447 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
442 return; 448 return;
443 449
444 sys_data = syscall_nr_to_meta(syscall_nr); 450 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -450,11 +456,11 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
450 size = ALIGN(size + sizeof(u32), sizeof(u64)); 456 size = ALIGN(size + sizeof(u32), sizeof(u64));
451 size -= sizeof(u32); 457 size -= sizeof(u32);
452 458
453 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 459 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
454 "profile buffer not large enough")) 460 "perf buffer not large enough"))
455 return; 461 return;
456 462
457 rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size, 463 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
458 sys_data->enter_event->id, &rctx, &flags); 464 sys_data->enter_event->id, &rctx, &flags);
459 if (!rec) 465 if (!rec)
460 return; 466 return;
@@ -462,10 +468,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
462 rec->nr = syscall_nr; 468 rec->nr = syscall_nr;
463 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 469 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
464 (unsigned long *)&rec->args); 470 (unsigned long *)&rec->args);
465 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); 471 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
466} 472}
467 473
468int prof_sysenter_enable(struct ftrace_event_call *call) 474int perf_sysenter_enable(struct ftrace_event_call *call)
469{ 475{
470 int ret = 0; 476 int ret = 0;
471 int num; 477 int num;
@@ -473,34 +479,34 @@ int prof_sysenter_enable(struct ftrace_event_call *call)
473 num = ((struct syscall_metadata *)call->data)->syscall_nr; 479 num = ((struct syscall_metadata *)call->data)->syscall_nr;
474 480
475 mutex_lock(&syscall_trace_lock); 481 mutex_lock(&syscall_trace_lock);
476 if (!sys_prof_refcount_enter) 482 if (!sys_perf_refcount_enter)
477 ret = register_trace_sys_enter(prof_syscall_enter); 483 ret = register_trace_sys_enter(perf_syscall_enter);
478 if (ret) { 484 if (ret) {
479 pr_info("event trace: Could not activate" 485 pr_info("event trace: Could not activate"
480 "syscall entry trace point"); 486 "syscall entry trace point");
481 } else { 487 } else {
482 set_bit(num, enabled_prof_enter_syscalls); 488 set_bit(num, enabled_perf_enter_syscalls);
483 sys_prof_refcount_enter++; 489 sys_perf_refcount_enter++;
484 } 490 }
485 mutex_unlock(&syscall_trace_lock); 491 mutex_unlock(&syscall_trace_lock);
486 return ret; 492 return ret;
487} 493}
488 494
489void prof_sysenter_disable(struct ftrace_event_call *call) 495void perf_sysenter_disable(struct ftrace_event_call *call)
490{ 496{
491 int num; 497 int num;
492 498
493 num = ((struct syscall_metadata *)call->data)->syscall_nr; 499 num = ((struct syscall_metadata *)call->data)->syscall_nr;
494 500
495 mutex_lock(&syscall_trace_lock); 501 mutex_lock(&syscall_trace_lock);
496 sys_prof_refcount_enter--; 502 sys_perf_refcount_enter--;
497 clear_bit(num, enabled_prof_enter_syscalls); 503 clear_bit(num, enabled_perf_enter_syscalls);
498 if (!sys_prof_refcount_enter) 504 if (!sys_perf_refcount_enter)
499 unregister_trace_sys_enter(prof_syscall_enter); 505 unregister_trace_sys_enter(perf_syscall_enter);
500 mutex_unlock(&syscall_trace_lock); 506 mutex_unlock(&syscall_trace_lock);
501} 507}
502 508
503static void prof_syscall_exit(struct pt_regs *regs, long ret) 509static void perf_syscall_exit(struct pt_regs *regs, long ret)
504{ 510{
505 struct syscall_metadata *sys_data; 511 struct syscall_metadata *sys_data;
506 struct syscall_trace_exit *rec; 512 struct syscall_trace_exit *rec;
@@ -510,7 +516,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
510 int size; 516 int size;
511 517
512 syscall_nr = syscall_get_nr(current, regs); 518 syscall_nr = syscall_get_nr(current, regs);
513 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 519 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
514 return; 520 return;
515 521
516 sys_data = syscall_nr_to_meta(syscall_nr); 522 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -525,11 +531,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
525 * Impossible, but be paranoid with the future 531 * Impossible, but be paranoid with the future
526 * How to put this check outside runtime? 532 * How to put this check outside runtime?
527 */ 533 */
528 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 534 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
529 "exit event has grown above profile buffer size")) 535 "exit event has grown above perf buffer size"))
530 return; 536 return;
531 537
532 rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size, 538 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
533 sys_data->exit_event->id, &rctx, &flags); 539 sys_data->exit_event->id, &rctx, &flags);
534 if (!rec) 540 if (!rec)
535 return; 541 return;
@@ -537,10 +543,10 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
537 rec->nr = syscall_nr; 543 rec->nr = syscall_nr;
538 rec->ret = syscall_get_return_value(current, regs); 544 rec->ret = syscall_get_return_value(current, regs);
539 545
540 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); 546 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
541} 547}
542 548
543int prof_sysexit_enable(struct ftrace_event_call *call) 549int perf_sysexit_enable(struct ftrace_event_call *call)
544{ 550{
545 int ret = 0; 551 int ret = 0;
546 int num; 552 int num;
@@ -548,30 +554,30 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
548 num = ((struct syscall_metadata *)call->data)->syscall_nr; 554 num = ((struct syscall_metadata *)call->data)->syscall_nr;
549 555
550 mutex_lock(&syscall_trace_lock); 556 mutex_lock(&syscall_trace_lock);
551 if (!sys_prof_refcount_exit) 557 if (!sys_perf_refcount_exit)
552 ret = register_trace_sys_exit(prof_syscall_exit); 558 ret = register_trace_sys_exit(perf_syscall_exit);
553 if (ret) { 559 if (ret) {
554 pr_info("event trace: Could not activate" 560 pr_info("event trace: Could not activate"
555 "syscall entry trace point"); 561 "syscall exit trace point");
556 } else { 562 } else {
557 set_bit(num, enabled_prof_exit_syscalls); 563 set_bit(num, enabled_perf_exit_syscalls);
558 sys_prof_refcount_exit++; 564 sys_perf_refcount_exit++;
559 } 565 }
560 mutex_unlock(&syscall_trace_lock); 566 mutex_unlock(&syscall_trace_lock);
561 return ret; 567 return ret;
562} 568}
563 569
564void prof_sysexit_disable(struct ftrace_event_call *call) 570void perf_sysexit_disable(struct ftrace_event_call *call)
565{ 571{
566 int num; 572 int num;
567 573
568 num = ((struct syscall_metadata *)call->data)->syscall_nr; 574 num = ((struct syscall_metadata *)call->data)->syscall_nr;
569 575
570 mutex_lock(&syscall_trace_lock); 576 mutex_lock(&syscall_trace_lock);
571 sys_prof_refcount_exit--; 577 sys_perf_refcount_exit--;
572 clear_bit(num, enabled_prof_exit_syscalls); 578 clear_bit(num, enabled_perf_exit_syscalls);
573 if (!sys_prof_refcount_exit) 579 if (!sys_perf_refcount_exit)
574 unregister_trace_sys_exit(prof_syscall_exit); 580 unregister_trace_sys_exit(perf_syscall_exit);
575 mutex_unlock(&syscall_trace_lock); 581 mutex_unlock(&syscall_trace_lock);
576} 582}
577 583
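The syscall perf hooks keep two pieces of state: a global refcount that decides when the shared sys_enter/sys_exit tracepoint is registered or unregistered, and a per-syscall bitmap that decides which syscalls a registered hook actually records. The simplified userspace sketch below mirrors that split with plain arrays and no locking; it is illustrative only, not part of the patch.

#include <stdio.h>

#define NR_SYSCALLS 64

static unsigned char enabled_perf_enter[NR_SYSCALLS];
static int sys_perf_refcount_enter;
static int tracepoint_registered;

static void perf_sysenter_enable_sketch(int nr)
{
	if (!sys_perf_refcount_enter)
		tracepoint_registered = 1;	/* register_trace_sys_enter() */
	enabled_perf_enter[nr] = 1;
	sys_perf_refcount_enter++;
}

static void perf_sysenter_disable_sketch(int nr)
{
	sys_perf_refcount_enter--;
	enabled_perf_enter[nr] = 0;
	if (!sys_perf_refcount_enter)
		tracepoint_registered = 0;	/* unregister_trace_sys_enter() */
}

/* the hook itself: only record syscalls whose bit is set */
static void perf_syscall_enter_sketch(int nr)
{
	if (!tracepoint_registered || !enabled_perf_enter[nr])
		return;
	printf("recording entry of syscall %d\n", nr);
}

int main(void)
{
	perf_sysenter_enable_sketch(3);
	perf_syscall_enter_sketch(3);	/* recorded */
	perf_syscall_enter_sketch(4);	/* filtered by the per-syscall bit */
	perf_sysenter_disable_sketch(3);
	perf_syscall_enter_sketch(3);	/* tracepoint gone, nothing recorded */
	return 0;
}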
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 40cafb07dffd..cc2d2faa7d9e 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
9#include <trace/events/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/slab.h>
12#include <linux/kref.h> 13#include <linux/kref.h>
13#include "trace_stat.h" 14#include "trace_stat.h"
14#include "trace.h" 15#include "trace.h"
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 00d59d048edf..0a67e041edf8 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -21,6 +21,7 @@
21#include <linux/tsacct_kern.h> 21#include <linux/tsacct_kern.h>
22#include <linux/acct.h> 22#include <linux/acct.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/mm.h>
24 25
25/* 26/*
26 * fill in basic accounting fields 27 * fill in basic accounting fields
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..766467b3bcb7 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -56,9 +56,6 @@ struct user_struct root_user = {
56 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
57 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns, 58 .user_ns = &init_user_ns,
59#ifdef CONFIG_USER_SCHED
60 .tg = &init_task_group,
61#endif
62}; 59};
63 60
64/* 61/*
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 72 put_user_ns(up->user_ns);
76} 73}
77 74
78#ifdef CONFIG_USER_SCHED
79
80static void sched_destroy_user(struct user_struct *up)
81{
82 sched_destroy_group(up->tg);
83}
84
85static int sched_create_user(struct user_struct *up)
86{
87 int rc = 0;
88
89 up->tg = sched_create_group(&root_task_group);
90 if (IS_ERR(up->tg))
91 rc = -ENOMEM;
92
93 set_tg_uid(up);
94
95 return rc;
96}
97
98#else /* CONFIG_USER_SCHED */
99
100static void sched_destroy_user(struct user_struct *up) { }
101static int sched_create_user(struct user_struct *up) { return 0; }
102
103#endif /* CONFIG_USER_SCHED */
104
105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
125static DEFINE_MUTEX(uids_mutex);
126
127static inline void uids_mutex_lock(void)
128{
129 mutex_lock(&uids_mutex);
130}
131
132static inline void uids_mutex_unlock(void)
133{
134 mutex_unlock(&uids_mutex);
135}
136
137/* uid directory attributes */
138#ifdef CONFIG_FAIR_GROUP_SCHED
139static ssize_t cpu_shares_show(struct kobject *kobj,
140 struct kobj_attribute *attr,
141 char *buf)
142{
143 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144
145 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
146}
147
148static ssize_t cpu_shares_store(struct kobject *kobj,
149 struct kobj_attribute *attr,
150 const char *buf, size_t size)
151{
152 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
153 unsigned long shares;
154 int rc;
155
156 sscanf(buf, "%lu", &shares);
157
158 rc = sched_group_set_shares(up->tg, shares);
159
160 return (rc ? rc : size);
161}
162
163static struct kobj_attribute cpu_share_attr =
164 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
165#endif
166
167#ifdef CONFIG_RT_GROUP_SCHED
168static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 char *buf)
171{
172 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
173
174 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
175}
176
177static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
178 struct kobj_attribute *attr,
179 const char *buf, size_t size)
180{
181 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
182 unsigned long rt_runtime;
183 int rc;
184
185 sscanf(buf, "%ld", &rt_runtime);
186
187 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
188
189 return (rc ? rc : size);
190}
191
192static struct kobj_attribute cpu_rt_runtime_attr =
193 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
194
195static ssize_t cpu_rt_period_show(struct kobject *kobj,
196 struct kobj_attribute *attr,
197 char *buf)
198{
199 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
200
201 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
202}
203
204static ssize_t cpu_rt_period_store(struct kobject *kobj,
205 struct kobj_attribute *attr,
206 const char *buf, size_t size)
207{
208 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
209 unsigned long rt_period;
210 int rc;
211
212 sscanf(buf, "%lu", &rt_period);
213
214 rc = sched_group_set_rt_period(up->tg, rt_period);
215
216 return (rc ? rc : size);
217}
218
219static struct kobj_attribute cpu_rt_period_attr =
220 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
221#endif
222
223/* default attributes per uid directory */
224static struct attribute *uids_attributes[] = {
225#ifdef CONFIG_FAIR_GROUP_SCHED
226 &cpu_share_attr.attr,
227#endif
228#ifdef CONFIG_RT_GROUP_SCHED
229 &cpu_rt_runtime_attr.attr,
230 &cpu_rt_period_attr.attr,
231#endif
232 NULL
233};
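
With these default attributes attached to the uids_ktype defined just below, every registered uid kobject appears as /sys/kernel/uids/<uid>/ containing cpu_share (plus cpu_rt_runtime and cpu_rt_period when RT group scheduling is enabled). A minimal userspace sketch that reads and then updates cpu_share for uid 0, assuming a kernel built with CONFIG_USER_SCHED/CONFIG_FAIR_GROUP_SCHED and root privileges (the value 2048 is only an example, twice the default group share of 1024):

/* Sketch: read and update /sys/kernel/uids/0/cpu_share from userspace. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/uids/0/cpu_share";
	char buf[64];
	ssize_t n;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("current cpu_share: %s", buf);
	}
	close(fd);

	/* Double the default group share of 1024 for uid 0's task group. */
	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "2048\n", 5) != 5)
		perror("write");
	close(fd);
	return 0;
}
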
234
235/* the lifetime of user_struct is not managed by the core (now) */
236static void uids_release(struct kobject *kobj)
237{
238 return;
239}
240
241static struct kobj_type uids_ktype = {
242 .sysfs_ops = &kobj_sysfs_ops,
243 .default_attrs = uids_attributes,
244 .release = uids_release,
245};
246
247/*
248 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
249 * We do not create this file for users in a user namespace (until
250 * sysfs tagging is implemented).
251 *
252 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
253 */
254static int uids_user_create(struct user_struct *up)
255{
256 struct kobject *kobj = &up->kobj;
257 int error;
258
259 memset(kobj, 0, sizeof(struct kobject));
260 if (up->user_ns != &init_user_ns)
261 return 0;
262 kobj->kset = uids_kset;
263 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
264 if (error) {
265 kobject_put(kobj);
266 goto done;
267 }
268
269 kobject_uevent(kobj, KOBJ_ADD);
270done:
271 return error;
272}
273
274/* create these entries in sysfs:
275 * "/sys/kernel/uids" directory
276 * "/sys/kernel/uids/0" directory (for root user)
277 * "/sys/kernel/uids/0/cpu_share" file (for root user)
278 */
279int __init uids_sysfs_init(void)
280{
281 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
282 if (!uids_kset)
283 return -ENOMEM;
284
285 return uids_user_create(&root_user);
286}
287
288/* delayed work function to remove sysfs directory for a user and free up
289 * corresponding structures.
290 */
291static void cleanup_user_struct(struct work_struct *w)
292{
293 struct user_struct *up = container_of(w, struct user_struct, work.work);
294 unsigned long flags;
295 int remove_user = 0;
296
297 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
298 * atomic.
299 */
300 uids_mutex_lock();
301
302 spin_lock_irqsave(&uidhash_lock, flags);
303 if (atomic_read(&up->__count) == 0) {
304 uid_hash_remove(up);
305 remove_user = 1;
306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
308
309 if (!remove_user)
310 goto done;
311
312 if (up->user_ns == &init_user_ns) {
313 kobject_uevent(&up->kobj, KOBJ_REMOVE);
314 kobject_del(&up->kobj);
315 kobject_put(&up->kobj);
316 }
317
318 sched_destroy_user(up);
319 key_put(up->uid_keyring);
320 key_put(up->session_keyring);
321 kmem_cache_free(uid_cachep, up);
322
323done:
324 uids_mutex_unlock();
325}
326
327/* IRQs are disabled and uidhash_lock is held upon function entry.
328 * IRQ state (as stored in flags) is restored and uidhash_lock released
329 * upon function exit.
330 */
331static void free_user(struct user_struct *up, unsigned long flags)
332{
333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336}
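
Taken together, free_user(), cleanup_user_struct() and the resurrection check in uid_hash_find() form a deferred-free protocol: the final put schedules delayed cleanup instead of freeing immediately, and the cleanup re-validates the refcount under uidhash_lock so that a concurrent lookup can safely bring the object back. A compressed userspace model of that protocol, assuming a pthread mutex in place of uidhash_lock and a sleeping thread in place of the workqueue (all names illustrative, not kernel API):

/* Sketch: deferred free with resurrection, modelled in userspace. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct obj {
	atomic_int count;
};

static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *the_obj;		/* stands in for the uid hash slot */

/* "Delayed work": remove and free only if nobody resurrected the object. */
static void *cleanup_worker(void *arg)
{
	struct obj *o = arg;
	int remove = 0;

	usleep(100 * 1000);		/* the "delayed" part */

	pthread_mutex_lock(&hash_lock);
	if (atomic_load(&o->count) == 0) {
		the_obj = NULL;		/* plays the role of uid_hash_remove() */
		remove = 1;
	}
	pthread_mutex_unlock(&hash_lock);

	if (remove)
		free(o);
	return NULL;
}

/* Last put: schedule deferred cleanup instead of freeing right away. */
static void obj_put(struct obj *o)
{
	pthread_t t;

	if (atomic_fetch_sub(&o->count, 1) != 1)
		return;			/* not the last reference */
	pthread_create(&t, NULL, cleanup_worker, o);
	pthread_detach(t);
}

/* Lookup: take a reference, resurrecting an "almost deleted" object. */
static struct obj *obj_find(void)
{
	struct obj *o;

	pthread_mutex_lock(&hash_lock);
	o = the_obj;
	if (o)
		atomic_fetch_add(&o->count, 1);
	pthread_mutex_unlock(&hash_lock);
	return o;
}

int main(void)
{
	the_obj = calloc(1, sizeof(*the_obj));
	if (!the_obj)
		return 1;
	atomic_store(&the_obj->count, 1);

	obj_put(the_obj);		/* count drops to 0, cleanup queued */
	printf("resurrected: %s\n", obj_find() ? "yes" : "no");

	sleep(1);			/* let the worker run and bail out */
	return 0;
}

The worker re-checks the count under the same lock that guards the lookup, which is the property the kernel code relies on: even if the cancel in the lookup path races with an already-running work item, cleanup_user_struct() finds a nonzero count and leaves the object alone.
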
337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
339
 static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
 {
 	struct user_struct *user;
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
 	return NULL;
 }
 
-int uids_sysfs_init(void) { return 0; }
-static inline int uids_user_create(struct user_struct *up) { return 0; }
-static inline void uids_mutex_lock(void) { }
-static inline void uids_mutex_unlock(void) { }
-
 /* IRQs are disabled and uidhash_lock is held upon function entry.
  * IRQ state (as stored in flags) is restored and uidhash_lock released
  * upon function exit.
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
 {
 	uid_hash_remove(up);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
-	sched_destroy_user(up);
 	key_put(up->uid_keyring);
 	key_put(up->session_keyring);
 	kmem_cache_free(uid_cachep, up);
 }
 
-#endif
-
-#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
-/*
- * We need to check if a setuid can take place. This function should be called
- * before successfully completing the setuid.
- */
-int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
-{
-
-	return sched_rt_can_attach(up->tg, tsk);
-
-}
-#else
-int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
-{
-	return 1;
-}
-#endif
-
 /*
  * Locate the user_struct for the passed UID. If found, take a ref on it. The
  * caller must undo that ref with free_uid().
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 	/* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
 	 * atomic.
 	 */
-	uids_mutex_lock();
-
 	spin_lock_irq(&uidhash_lock);
 	up = uid_hash_find(uid, hashent);
 	spin_unlock_irq(&uidhash_lock);
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 	new->uid = uid;
 	atomic_set(&new->__count, 1);
 
-	if (sched_create_user(new) < 0)
-		goto out_free_user;
-
 	new->user_ns = get_user_ns(ns);
 
-	if (uids_user_create(new))
-		goto out_destoy_sched;
-
 	/*
 	 * Before adding this, check whether we raced
 	 * on adding the same user already..
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 		spin_unlock_irq(&uidhash_lock);
 	}
 
-	uids_mutex_unlock();
-
 	return up;
 
-out_destoy_sched:
-	sched_destroy_user(new);
 	put_user_ns(new->user_ns);
-out_free_user:
 	kmem_cache_free(uid_cachep, new);
 out_unlock:
-	uids_mutex_unlock();
 	return NULL;
 }
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dee48658805c..5bfb213984b2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -774,7 +774,7 @@ void flush_delayed_work(struct delayed_work *dwork)
 {
 	if (del_timer_sync(&dwork->timer)) {
 		struct cpu_workqueue_struct *cwq;
-		cwq = wq_per_cpu(keventd_wq, get_cpu());
+		cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu());
 		__queue_work(cwq, &dwork->work);
 		put_cpu();
 	}
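
The kernel/workqueue.c hunk changes flush_delayed_work() so that, when it cancels the pending timer, it requeues the work on the workqueue the delayed work was actually queued on (looked up via get_wq_data()) instead of unconditionally using the global keventd_wq; with the old code, work queued on a private workqueue would have been run on keventd_wq instead. The difference only matters for callers with their own workqueue, roughly as in this sketch of a hypothetical module (boilerplate only, not code from this tree):

/* Sketch: a hypothetical user of its own workqueue, showing why
 * flush_delayed_work() must target dwork's queue rather than keventd_wq.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;
static struct delayed_work my_dwork;

static void my_work_fn(struct work_struct *work)
{
	pr_info("my_work_fn ran\n");
}

static int __init my_init(void)
{
	my_wq = create_singlethread_workqueue("my_wq");
	if (!my_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&my_dwork, my_work_fn);
	queue_delayed_work(my_wq, &my_dwork, msecs_to_jiffies(500));

	/*
	 * With the old code this pushed my_dwork onto keventd_wq;
	 * with the fix it runs on my_wq, where it was queued.
	 */
	flush_delayed_work(&my_dwork);
	return 0;
}

static void __exit my_exit(void)
{
	cancel_delayed_work_sync(&my_dwork);
	destroy_workqueue(my_wq);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
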