path: root/kernel
author    Michal Marek <mmarek@suse.cz>    2010-08-04 07:59:13 -0400
committer Michal Marek <mmarek@suse.cz>    2010-08-04 07:59:13 -0400
commit    772320e84588dcbe1600ffb83e5f328f2209ac2a (patch)
tree      a7de21b79340aeaa17c58126f6b801b82c77b53a /kernel
parent    1ce53adf13a54375d2a5c7cdbe341b2558389615 (diff)
parent    9fe6206f400646a2322096b56c59891d530e8d51 (diff)
Merge commit 'v2.6.35' into kbuild/kbuild
Conflicts: arch/powerpc/Makefile
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 11
-rw-r--r--  kernel/acct.c | 47
-rw-r--r--  kernel/async.c | 1
-rw-r--r--  kernel/audit.c | 3
-rw-r--r--  kernel/audit_tree.c | 101
-rw-r--r--  kernel/audit_watch.c | 1
-rw-r--r--  kernel/auditfilter.c | 1
-rw-r--r--  kernel/auditsc.c | 10
-rw-r--r--  kernel/capability.c | 5
-rw-r--r--  kernel/cgroup.c | 775
-rw-r--r--  kernel/cgroup_freezer.c | 36
-rw-r--r--  kernel/compat.c | 26
-rw-r--r--  kernel/cpu.c | 172
-rw-r--r--  kernel/cpuset.c | 251
-rw-r--r--  kernel/cred-internals.h | 21
-rw-r--r--  kernel/cred.c | 89
-rw-r--r--  kernel/debug/Makefile | 6
-rw-r--r--  kernel/debug/debug_core.c | 983
-rw-r--r--  kernel/debug/debug_core.h | 81
-rw-r--r--  kernel/debug/gdbstub.c | 1014
-rw-r--r--  kernel/debug/kdb/.gitignore | 1
-rw-r--r--  kernel/debug/kdb/Makefile | 25
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 564
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 210
-rw-r--r--  kernel/debug/kdb/kdb_cmds | 35
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 169
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 826
-rw-r--r--  kernel/debug/kdb/kdb_keyboard.c | 212
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2846
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 300
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 927
-rw-r--r--  kernel/early_res.c | 590
-rw-r--r--  kernel/elfcore.c | 28
-rw-r--r--  kernel/exec_domain.c | 18
-rw-r--r--  kernel/exit.c | 63
-rw-r--r--  kernel/fork.c | 138
-rw-r--r--  kernel/futex.c | 47
-rw-r--r--  kernel/futex_compat.c | 6
-rw-r--r--  kernel/groups.c | 6
-rw-r--r--  kernel/hrtimer.c | 69
-rw-r--r--  kernel/hw_breakpoint.c | 259
-rw-r--r--  kernel/irq/chip.c | 89
-rw-r--r--  kernel/irq/devres.c | 4
-rw-r--r--  kernel/irq/handle.c | 61
-rw-r--r--  kernel/irq/internals.h | 6
-rw-r--r--  kernel/irq/manage.c | 104
-rw-r--r--  kernel/irq/numa_migrate.c | 5
-rw-r--r--  kernel/irq/proc.c | 61
-rw-r--r--  kernel/kallsyms.c | 22
-rw-r--r--  kernel/kexec.c | 13
-rw-r--r--  kernel/kfifo.c | 6
-rw-r--r--  kernel/kgdb.c | 1760
-rw-r--r--  kernel/kmod.c | 193
-rw-r--r--  kernel/kprobes.c | 804
-rw-r--r--  kernel/ksysfs.c | 13
-rw-r--r--  kernel/kthread.c | 4
-rw-r--r--  kernel/latencytop.c | 1
-rw-r--r--  kernel/lockdep.c | 143
-rw-r--r--  kernel/lockdep_internals.h | 72
-rw-r--r--  kernel/lockdep_proc.c | 58
-rw-r--r--  kernel/module.c | 537
-rw-r--r--  kernel/mutex.c | 7
-rw-r--r--  kernel/notifier.c | 6
-rw-r--r--  kernel/nsproxy.c | 14
-rw-r--r--  kernel/padata.c | 774
-rw-r--r--  kernel/panic.c | 73
-rw-r--r--  kernel/params.c | 12
-rw-r--r--  kernel/perf_event.c | 1788
-rw-r--r--  kernel/pid.c | 13
-rw-r--r--  kernel/pid_namespace.c | 8
-rw-r--r--  kernel/pm_qos_params.c | 218
-rw-r--r--  kernel/posix-cpu-timers.c | 346
-rw-r--r--  kernel/posix-timers.c | 13
-rw-r--r--  kernel/power/Kconfig | 28
-rw-r--r--  kernel/power/Makefile | 5
-rw-r--r--  kernel/power/block_io.c | 103
-rw-r--r--  kernel/power/hibernate.c | 10
-rw-r--r--  kernel/power/main.c | 31
-rw-r--r--  kernel/power/nvs.c (renamed from kernel/power/hibernate_nvs.c) | 25
-rw-r--r--  kernel/power/power.h | 27
-rw-r--r--  kernel/power/process.c | 5
-rw-r--r--  kernel/power/snapshot.c | 150
-rw-r--r--  kernel/power/suspend.c | 10
-rw-r--r--  kernel/power/swap.c | 338
-rw-r--r--  kernel/power/swsusp.c | 58
-rw-r--r--  kernel/power/user.c | 62
-rw-r--r--  kernel/printk.c | 80
-rw-r--r--  kernel/profile.c | 12
-rw-r--r--  kernel/ptrace.c | 126
-rw-r--r--  kernel/range.c | 163
-rw-r--r--  kernel/rcupdate.c | 51
-rw-r--r--  kernel/rcutiny.c | 35
-rw-r--r--  kernel/rcutiny_plugin.h | 39
-rw-r--r--  kernel/rcutorture.c | 106
-rw-r--r--  kernel/rcutree.c | 389
-rw-r--r--  kernel/rcutree.h | 84
-rw-r--r--  kernel/rcutree_plugin.h | 296
-rw-r--r--  kernel/rcutree_trace.c | 18
-rw-r--r--  kernel/relay.c | 22
-rw-r--r--  kernel/res_counter.c | 1
-rw-r--r--  kernel/resource.c | 126
-rw-r--r--  kernel/sched.c | 3093
-rw-r--r--  kernel/sched_clock.c | 1
-rw-r--r--  kernel/sched_cpupri.c | 7
-rw-r--r--  kernel/sched_debug.c | 124
-rw-r--r--  kernel/sched_fair.c | 2027
-rw-r--r--  kernel/sched_features.h | 55
-rw-r--r--  kernel/sched_idletask.c | 31
-rw-r--r--  kernel/sched_rt.c | 77
-rw-r--r--  kernel/signal.c | 108
-rw-r--r--  kernel/slow-work.c | 2
-rw-r--r--  kernel/slow-work.h | 8
-rw-r--r--  kernel/smp.c | 11
-rw-r--r--  kernel/softirq.c | 21
-rw-r--r--  kernel/softlockup.c | 19
-rw-r--r--  kernel/srcu.c | 53
-rw-r--r--  kernel/stop_machine.c | 537
-rw-r--r--  kernel/sys.c | 115
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 662
-rw-r--r--  kernel/sysctl_binary.c | 18
-rw-r--r--  kernel/taskstats.c | 7
-rw-r--r--  kernel/time.c | 12
-rw-r--r--  kernel/time/clocksource.c | 84
-rw-r--r--  kernel/time/ntp.c | 12
-rw-r--r--  kernel/time/tick-oneshot.c | 52
-rw-r--r--  kernel/time/tick-sched.c | 87
-rw-r--r--  kernel/time/timecompare.c | 1
-rw-r--r--  kernel/time/timekeeping.c | 41
-rw-r--r--  kernel/time/timer_list.c | 4
-rw-r--r--  kernel/timer.c | 153
-rw-r--r--  kernel/trace/Kconfig | 26
-rw-r--r--  kernel/trace/Makefile | 5
-rw-r--r--  kernel/trace/blktrace.c | 146
-rw-r--r--  kernel/trace/ftrace.c | 172
-rw-r--r--  kernel/trace/kmemtrace.c | 70
-rw-r--r--  kernel/trace/power-traces.c | 1
-rw-r--r--  kernel/trace/ring_buffer.c | 258
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 6
-rw-r--r--  kernel/trace/trace.c | 408
-rw-r--r--  kernel/trace/trace.h | 67
-rw-r--r--  kernel/trace/trace_branch.c | 27
-rw-r--r--  kernel/trace/trace_clock.c | 5
-rw-r--r--  kernel/trace/trace_entries.h | 12
-rw-r--r--  kernel/trace/trace_event_perf.c | 195
-rw-r--r--  kernel/trace/trace_event_profile.c | 122
-rw-r--r--  kernel/trace/trace_events.c | 219
-rw-r--r--  kernel/trace/trace_events_filter.c | 35
-rw-r--r--  kernel/trace/trace_export.c | 103
-rw-r--r--  kernel/trace/trace_functions_graph.c | 284
-rw-r--r--  kernel/trace/trace_hw_branches.c | 312
-rw-r--r--  kernel/trace/trace_irqsoff.c | 271
-rw-r--r--  kernel/trace/trace_kprobe.c | 947
-rw-r--r--  kernel/trace/trace_ksym.c | 27
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 1
-rw-r--r--  kernel/trace/trace_output.c | 155
-rw-r--r--  kernel/trace/trace_output.h | 2
-rw-r--r--  kernel/trace/trace_sched_switch.c | 21
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 29
-rw-r--r--  kernel/trace/trace_selftest.c | 65
-rw-r--r--  kernel/trace/trace_stack.c | 24
-rw-r--r--  kernel/trace/trace_stat.c | 1
-rw-r--r--  kernel/trace/trace_syscalls.c | 374
-rw-r--r--  kernel/trace/trace_workqueue.c | 27
-rw-r--r--  kernel/tracepoint.c | 91
-rw-r--r--  kernel/tsacct.c | 1
-rw-r--r--  kernel/user.c | 316
-rw-r--r--  kernel/user_namespace.c | 4
-rw-r--r--  kernel/workqueue.c | 47
169 files changed, 22169 insertions, 10731 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 864ff75d65f2..057472fbc272 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o 13 async.o range.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
14obj-y += groups.o 15obj-y += groups.o
15 16
16ifdef CONFIG_FUNCTION_TRACER 17ifdef CONFIG_FUNCTION_TRACER
@@ -67,14 +68,14 @@ obj-$(CONFIG_USER_NS) += user_namespace.o
67obj-$(CONFIG_PID_NS) += pid_namespace.o 68obj-$(CONFIG_PID_NS) += pid_namespace.o
68obj-$(CONFIG_IKCONFIG) += configs.o 69obj-$(CONFIG_IKCONFIG) += configs.o
69obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
70obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 71obj-$(CONFIG_SMP) += stop_machine.o
71obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
72obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o 73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
73obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
74obj-$(CONFIG_GCOV_KERNEL) += gcov/ 75obj-$(CONFIG_GCOV_KERNEL) += gcov/
75obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 76obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
76obj-$(CONFIG_KPROBES) += kprobes.o 77obj-$(CONFIG_KPROBES) += kprobes.o
77obj-$(CONFIG_KGDB) += kgdb.o 78obj-$(CONFIG_KGDB) += debug/
78obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o 79obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
79obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
80obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 81obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
@@ -90,6 +91,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
90obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 91obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
91obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 92obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
92obj-$(CONFIG_LATENCYTOP) += latencytop.o 93obj-$(CONFIG_LATENCYTOP) += latencytop.o
94obj-$(CONFIG_BINFMT_ELF) += elfcore.o
95obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
96obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
93obj-$(CONFIG_FUNCTION_TRACER) += trace/ 97obj-$(CONFIG_FUNCTION_TRACER) += trace/
94obj-$(CONFIG_TRACING) += trace/ 98obj-$(CONFIG_TRACING) += trace/
95obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
@@ -100,6 +104,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
100obj-$(CONFIG_PERF_EVENTS) += perf_event.o 104obj-$(CONFIG_PERF_EVENTS) += perf_event.o
101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 105obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
102obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 106obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
107obj-$(CONFIG_PADATA) += padata.o
103 108
104ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 109ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
105# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 110# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index a6605ca921b6..385b88461c29 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -216,7 +216,6 @@ static int acct_on(char *name)
216{ 216{
217 struct file *file; 217 struct file *file;
218 struct vfsmount *mnt; 218 struct vfsmount *mnt;
219 int error;
220 struct pid_namespace *ns; 219 struct pid_namespace *ns;
221 struct bsd_acct_struct *acct = NULL; 220 struct bsd_acct_struct *acct = NULL;
222 221
@@ -244,13 +243,6 @@ static int acct_on(char *name)
244 } 243 }
245 } 244 }
246 245
247 error = security_acct(file);
248 if (error) {
249 kfree(acct);
250 filp_close(file, NULL);
251 return error;
252 }
253
254 spin_lock(&acct_lock); 246 spin_lock(&acct_lock);
255 if (ns->bacct == NULL) { 247 if (ns->bacct == NULL) {
256 ns->bacct = acct; 248 ns->bacct = acct;
@@ -281,7 +273,7 @@ static int acct_on(char *name)
281 */ 273 */
282SYSCALL_DEFINE1(acct, const char __user *, name) 274SYSCALL_DEFINE1(acct, const char __user *, name)
283{ 275{
284 int error; 276 int error = 0;
285 277
286 if (!capable(CAP_SYS_PACCT)) 278 if (!capable(CAP_SYS_PACCT))
287 return -EPERM; 279 return -EPERM;
@@ -299,13 +291,11 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
299 if (acct == NULL) 291 if (acct == NULL)
300 return 0; 292 return 0;
301 293
302 error = security_acct(NULL); 294 spin_lock(&acct_lock);
303 if (!error) { 295 acct_file_reopen(acct, NULL, NULL);
304 spin_lock(&acct_lock); 296 spin_unlock(&acct_lock);
305 acct_file_reopen(acct, NULL, NULL);
306 spin_unlock(&acct_lock);
307 }
308 } 297 }
298
309 return error; 299 return error;
310} 300}
311 301
@@ -353,17 +343,18 @@ restart:
353 343
354void acct_exit_ns(struct pid_namespace *ns) 344void acct_exit_ns(struct pid_namespace *ns)
355{ 345{
356 struct bsd_acct_struct *acct; 346 struct bsd_acct_struct *acct = ns->bacct;
357 347
358 spin_lock(&acct_lock); 348 if (acct == NULL)
359 acct = ns->bacct; 349 return;
360 if (acct != NULL) {
361 if (acct->file != NULL)
362 acct_file_reopen(acct, NULL, NULL);
363 350
364 kfree(acct); 351 del_timer_sync(&acct->timer);
365 } 352 spin_lock(&acct_lock);
353 if (acct->file != NULL)
354 acct_file_reopen(acct, NULL, NULL);
366 spin_unlock(&acct_lock); 355 spin_unlock(&acct_lock);
356
357 kfree(acct);
367} 358}
368 359
369/* 360/*
@@ -588,16 +579,6 @@ out:
588} 579}
589 580
590/** 581/**
591 * acct_init_pacct - initialize a new pacct_struct
592 * @pacct: per-process accounting info struct to initialize
593 */
594void acct_init_pacct(struct pacct_struct *pacct)
595{
596 memset(pacct, 0, sizeof(struct pacct_struct));
597 pacct->ac_utime = pacct->ac_stime = cputime_zero;
598}
599
600/**
601 * acct_collect - collect accounting information into pacct_struct 582 * acct_collect - collect accounting information into pacct_struct
602 * @exitcode: task exit code 583 * @exitcode: task exit code
603 * @group_dead: not 0, if this thread is the last one in the process. 584 * @group_dead: not 0, if this thread is the last one in the process.
diff --git a/kernel/async.c b/kernel/async.c
index 27235f5de198..15319d6c18fe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -56,6 +56,7 @@ asynchronous and synchronous parts of the kernel.
56#include <linux/init.h> 56#include <linux/init.h>
57#include <linux/kthread.h> 57#include <linux/kthread.h>
58#include <linux/delay.h> 58#include <linux/delay.h>
59#include <linux/slab.h>
59#include <asm/atomic.h> 60#include <asm/atomic.h>
60 61
61static async_cookie_t next_cookie = 1; 62static async_cookie_t next_cookie = 1;
diff --git a/kernel/audit.c b/kernel/audit.c
index 5feed232be9d..c71bd26631a2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -46,6 +46,7 @@
46#include <asm/atomic.h> 46#include <asm/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/slab.h>
49#include <linux/err.h> 50#include <linux/err.h>
50#include <linux/kthread.h> 51#include <linux/kthread.h>
51 52
@@ -398,7 +399,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
398 skb_get(skb); 399 skb_get(skb);
399 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); 400 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
400 if (err < 0) { 401 if (err < 0) {
401 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ 402 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 403 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_log_lost("auditd dissapeared\n"); 404 audit_log_lost("auditd dissapeared\n");
404 audit_pid = 0; 405 audit_pid = 0;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 4b05bd9479db..46a57b57a335 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -3,6 +3,7 @@
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h> 5#include <linux/kthread.h>
6#include <linux/slab.h>
6 7
7struct audit_tree; 8struct audit_tree;
8struct audit_chunk; 9struct audit_chunk;
@@ -548,6 +549,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
548 return 0; 549 return 0;
549} 550}
550 551
552static int compare_root(struct vfsmount *mnt, void *arg)
553{
554 return mnt->mnt_root->d_inode == arg;
555}
556
551void audit_trim_trees(void) 557void audit_trim_trees(void)
552{ 558{
553 struct list_head cursor; 559 struct list_head cursor;
@@ -559,7 +565,6 @@ void audit_trim_trees(void)
559 struct path path; 565 struct path path;
560 struct vfsmount *root_mnt; 566 struct vfsmount *root_mnt;
561 struct node *node; 567 struct node *node;
562 struct list_head list;
563 int err; 568 int err;
564 569
565 tree = container_of(cursor.next, struct audit_tree, list); 570 tree = container_of(cursor.next, struct audit_tree, list);
@@ -577,24 +582,16 @@ void audit_trim_trees(void)
577 if (!root_mnt) 582 if (!root_mnt)
578 goto skip_it; 583 goto skip_it;
579 584
580 list_add_tail(&list, &root_mnt->mnt_list);
581 spin_lock(&hash_lock); 585 spin_lock(&hash_lock);
582 list_for_each_entry(node, &tree->chunks, list) { 586 list_for_each_entry(node, &tree->chunks, list) {
583 struct audit_chunk *chunk = find_chunk(node); 587 struct inode *inode = find_chunk(node)->watch.inode;
584 struct inode *inode = chunk->watch.inode;
585 struct vfsmount *mnt;
586 node->index |= 1U<<31; 588 node->index |= 1U<<31;
587 list_for_each_entry(mnt, &list, mnt_list) { 589 if (iterate_mounts(compare_root, inode, root_mnt))
588 if (mnt->mnt_root->d_inode == inode) { 590 node->index &= ~(1U<<31);
589 node->index &= ~(1U<<31);
590 break;
591 }
592 }
593 } 591 }
594 spin_unlock(&hash_lock); 592 spin_unlock(&hash_lock);
595 trim_marked(tree); 593 trim_marked(tree);
596 put_tree(tree); 594 put_tree(tree);
597 list_del_init(&list);
598 drop_collected_mounts(root_mnt); 595 drop_collected_mounts(root_mnt);
599skip_it: 596skip_it:
600 mutex_lock(&audit_filter_mutex); 597 mutex_lock(&audit_filter_mutex);
@@ -603,22 +600,6 @@ skip_it:
603 mutex_unlock(&audit_filter_mutex); 600 mutex_unlock(&audit_filter_mutex);
604} 601}
605 602
606static int is_under(struct vfsmount *mnt, struct dentry *dentry,
607 struct path *path)
608{
609 if (mnt != path->mnt) {
610 for (;;) {
611 if (mnt->mnt_parent == mnt)
612 return 0;
613 if (mnt->mnt_parent == path->mnt)
614 break;
615 mnt = mnt->mnt_parent;
616 }
617 dentry = mnt->mnt_mountpoint;
618 }
619 return is_subdir(dentry, path->dentry);
620}
621
622int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) 603int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
623{ 604{
624 605
@@ -638,13 +619,17 @@ void audit_put_tree(struct audit_tree *tree)
638 put_tree(tree); 619 put_tree(tree);
639} 620}
640 621
622static int tag_mount(struct vfsmount *mnt, void *arg)
623{
624 return tag_chunk(mnt->mnt_root->d_inode, arg);
625}
626
641/* called with audit_filter_mutex */ 627/* called with audit_filter_mutex */
642int audit_add_tree_rule(struct audit_krule *rule) 628int audit_add_tree_rule(struct audit_krule *rule)
643{ 629{
644 struct audit_tree *seed = rule->tree, *tree; 630 struct audit_tree *seed = rule->tree, *tree;
645 struct path path; 631 struct path path;
646 struct vfsmount *mnt, *p; 632 struct vfsmount *mnt;
647 struct list_head list;
648 int err; 633 int err;
649 634
650 list_for_each_entry(tree, &tree_list, list) { 635 list_for_each_entry(tree, &tree_list, list) {
@@ -670,16 +655,9 @@ int audit_add_tree_rule(struct audit_krule *rule)
670 err = -ENOMEM; 655 err = -ENOMEM;
671 goto Err; 656 goto Err;
672 } 657 }
673 list_add_tail(&list, &mnt->mnt_list);
674 658
675 get_tree(tree); 659 get_tree(tree);
676 list_for_each_entry(p, &list, mnt_list) { 660 err = iterate_mounts(tag_mount, tree, mnt);
677 err = tag_chunk(p->mnt_root->d_inode, tree);
678 if (err)
679 break;
680 }
681
682 list_del(&list);
683 drop_collected_mounts(mnt); 661 drop_collected_mounts(mnt);
684 662
685 if (!err) { 663 if (!err) {
@@ -714,31 +692,23 @@ int audit_tag_tree(char *old, char *new)
714{ 692{
715 struct list_head cursor, barrier; 693 struct list_head cursor, barrier;
716 int failed = 0; 694 int failed = 0;
717 struct path path; 695 struct path path1, path2;
718 struct vfsmount *tagged; 696 struct vfsmount *tagged;
719 struct list_head list;
720 struct vfsmount *mnt;
721 struct dentry *dentry;
722 int err; 697 int err;
723 698
724 err = kern_path(new, 0, &path); 699 err = kern_path(new, 0, &path2);
725 if (err) 700 if (err)
726 return err; 701 return err;
727 tagged = collect_mounts(&path); 702 tagged = collect_mounts(&path2);
728 path_put(&path); 703 path_put(&path2);
729 if (!tagged) 704 if (!tagged)
730 return -ENOMEM; 705 return -ENOMEM;
731 706
732 err = kern_path(old, 0, &path); 707 err = kern_path(old, 0, &path1);
733 if (err) { 708 if (err) {
734 drop_collected_mounts(tagged); 709 drop_collected_mounts(tagged);
735 return err; 710 return err;
736 } 711 }
737 mnt = mntget(path.mnt);
738 dentry = dget(path.dentry);
739 path_put(&path);
740
741 list_add_tail(&list, &tagged->mnt_list);
742 712
743 mutex_lock(&audit_filter_mutex); 713 mutex_lock(&audit_filter_mutex);
744 list_add(&barrier, &tree_list); 714 list_add(&barrier, &tree_list);
@@ -746,7 +716,7 @@ int audit_tag_tree(char *old, char *new)
746 716
747 while (cursor.next != &tree_list) { 717 while (cursor.next != &tree_list) {
748 struct audit_tree *tree; 718 struct audit_tree *tree;
749 struct vfsmount *p; 719 int good_one = 0;
750 720
751 tree = container_of(cursor.next, struct audit_tree, list); 721 tree = container_of(cursor.next, struct audit_tree, list);
752 get_tree(tree); 722 get_tree(tree);
@@ -754,30 +724,19 @@ int audit_tag_tree(char *old, char *new)
754 list_add(&cursor, &tree->list); 724 list_add(&cursor, &tree->list);
755 mutex_unlock(&audit_filter_mutex); 725 mutex_unlock(&audit_filter_mutex);
756 726
757 err = kern_path(tree->pathname, 0, &path); 727 err = kern_path(tree->pathname, 0, &path2);
758 if (err) { 728 if (!err) {
759 put_tree(tree); 729 good_one = path_is_under(&path1, &path2);
760 mutex_lock(&audit_filter_mutex); 730 path_put(&path2);
761 continue;
762 } 731 }
763 732
764 spin_lock(&vfsmount_lock); 733 if (!good_one) {
765 if (!is_under(mnt, dentry, &path)) {
766 spin_unlock(&vfsmount_lock);
767 path_put(&path);
768 put_tree(tree); 734 put_tree(tree);
769 mutex_lock(&audit_filter_mutex); 735 mutex_lock(&audit_filter_mutex);
770 continue; 736 continue;
771 } 737 }
772 spin_unlock(&vfsmount_lock);
773 path_put(&path);
774
775 list_for_each_entry(p, &list, mnt_list) {
776 failed = tag_chunk(p->mnt_root->d_inode, tree);
777 if (failed)
778 break;
779 }
780 738
739 failed = iterate_mounts(tag_mount, tree, tagged);
781 if (failed) { 740 if (failed) {
782 put_tree(tree); 741 put_tree(tree);
783 mutex_lock(&audit_filter_mutex); 742 mutex_lock(&audit_filter_mutex);
@@ -818,10 +777,8 @@ int audit_tag_tree(char *old, char *new)
818 } 777 }
819 list_del(&barrier); 778 list_del(&barrier);
820 list_del(&cursor); 779 list_del(&cursor);
821 list_del(&list);
822 mutex_unlock(&audit_filter_mutex); 780 mutex_unlock(&audit_filter_mutex);
823 dput(dentry); 781 path_put(&path1);
824 mntput(mnt);
825 drop_collected_mounts(tagged); 782 drop_collected_mounts(tagged);
826 return failed; 783 return failed;
827} 784}
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index cc7e87936cbc..8df43696f4ba 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -27,6 +27,7 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/slab.h>
30#include <linux/inotify.h> 31#include <linux/inotify.h>
31#include <linux/security.h> 32#include <linux/security.h>
32#include "audit.h" 33#include "audit.h"
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a70604047f3c..ce08041f578d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,6 +27,7 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/slab.h>
30#include <linux/security.h> 31#include <linux/security.h>
31#include "audit.h" 32#include "audit.h"
32 33
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fc0f928167e7..3828ad5fb8f1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -49,6 +49,7 @@
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
51#include <linux/module.h> 51#include <linux/module.h>
52#include <linux/slab.h>
52#include <linux/mount.h> 53#include <linux/mount.h>
53#include <linux/socket.h> 54#include <linux/socket.h>
54#include <linux/mqueue.h> 55#include <linux/mqueue.h>
@@ -1893,7 +1894,7 @@ static int audit_inc_name_count(struct audit_context *context,
1893{ 1894{
1894 if (context->name_count >= AUDIT_NAMES) { 1895 if (context->name_count >= AUDIT_NAMES) {
1895 if (inode) 1896 if (inode)
1896 printk(KERN_DEBUG "name_count maxed, losing inode data: " 1897 printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
1897 "dev=%02x:%02x, inode=%lu\n", 1898 "dev=%02x:%02x, inode=%lu\n",
1898 MAJOR(inode->i_sb->s_dev), 1899 MAJOR(inode->i_sb->s_dev),
1899 MINOR(inode->i_sb->s_dev), 1900 MINOR(inode->i_sb->s_dev),
@@ -1988,7 +1989,6 @@ void __audit_inode(const char *name, const struct dentry *dentry)
1988 1989
1989/** 1990/**
1990 * audit_inode_child - collect inode info for created/removed objects 1991 * audit_inode_child - collect inode info for created/removed objects
1991 * @dname: inode's dentry name
1992 * @dentry: dentry being audited 1992 * @dentry: dentry being audited
1993 * @parent: inode of dentry parent 1993 * @parent: inode of dentry parent
1994 * 1994 *
@@ -2000,13 +2000,14 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2000 * must be hooked prior, in order to capture the target inode during 2000 * must be hooked prior, in order to capture the target inode during
2001 * unsuccessful attempts. 2001 * unsuccessful attempts.
2002 */ 2002 */
2003void __audit_inode_child(const char *dname, const struct dentry *dentry, 2003void __audit_inode_child(const struct dentry *dentry,
2004 const struct inode *parent) 2004 const struct inode *parent)
2005{ 2005{
2006 int idx; 2006 int idx;
2007 struct audit_context *context = current->audit_context; 2007 struct audit_context *context = current->audit_context;
2008 const char *found_parent = NULL, *found_child = NULL; 2008 const char *found_parent = NULL, *found_child = NULL;
2009 const struct inode *inode = dentry->d_inode; 2009 const struct inode *inode = dentry->d_inode;
2010 const char *dname = dentry->d_name.name;
2010 int dirlen = 0; 2011 int dirlen = 0;
2011 2012
2012 if (!context->in_syscall) 2013 if (!context->in_syscall)
@@ -2014,9 +2015,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
2014 2015
2015 if (inode) 2016 if (inode)
2016 handle_one(inode); 2017 handle_one(inode);
2017 /* determine matching parent */
2018 if (!dname)
2019 goto add_names;
2020 2018
2021 /* parent is more likely, look for it first */ 2019 /* parent is more likely, look for it first */
2022 for (idx = 0; idx < context->name_count; idx++) { 2020 for (idx = 0; idx < context->name_count; idx++) {
diff --git a/kernel/capability.c b/kernel/capability.c
index 7f876e60521f..2f05303715a5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -15,7 +15,6 @@
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include "cred-internals.h"
19 18
20/* 19/*
21 * Leveraged for setting/resetting capabilities 20 * Leveraged for setting/resetting capabilities
@@ -135,7 +134,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
135 if (pid && (pid != task_pid_vnr(current))) { 134 if (pid && (pid != task_pid_vnr(current))) {
136 struct task_struct *target; 135 struct task_struct *target;
137 136
138 read_lock(&tasklist_lock); 137 rcu_read_lock();
139 138
140 target = find_task_by_vpid(pid); 139 target = find_task_by_vpid(pid);
141 if (!target) 140 if (!target)
@@ -143,7 +142,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
143 else 142 else
144 ret = security_capget(target, pEp, pIp, pPp); 143 ret = security_capget(target, pEp, pIp, pPp);
145 144
146 read_unlock(&tasklist_lock); 145 rcu_read_unlock();
147 } else 146 } else
148 ret = security_capget(current, pEp, pIp, pPp); 147 ret = security_capget(current, pEp, pIp, pPp);
149 148
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1fbcc748044a..3ac6f5b0a64b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Notifications support
8 * Copyright (C) 2009 Nokia Corporation
9 * Author: Kirill A. Shutemov
10 *
7 * Copyright notices from the original cpuset code: 11 * Copyright notices from the original cpuset code:
8 * -------------------------------------------------- 12 * --------------------------------------------------
9 * Copyright (C) 2003 BULL SA. 13 * Copyright (C) 2003 BULL SA.
@@ -43,6 +47,7 @@
43#include <linux/string.h> 47#include <linux/string.h>
44#include <linux/sort.h> 48#include <linux/sort.h>
45#include <linux/kmod.h> 49#include <linux/kmod.h>
50#include <linux/module.h>
46#include <linux/delayacct.h> 51#include <linux/delayacct.h>
47#include <linux/cgroupstats.h> 52#include <linux/cgroupstats.h>
48#include <linux/hash.h> 53#include <linux/hash.h>
@@ -51,15 +56,21 @@
51#include <linux/pid_namespace.h> 56#include <linux/pid_namespace.h>
52#include <linux/idr.h> 57#include <linux/idr.h>
53#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h>
60#include <linux/poll.h>
54 61
55#include <asm/atomic.h> 62#include <asm/atomic.h>
56 63
57static DEFINE_MUTEX(cgroup_mutex); 64static DEFINE_MUTEX(cgroup_mutex);
58 65
59/* Generate an array of cgroup subsystem pointers */ 66/*
67 * Generate an array of cgroup subsystem pointers. At boot time, this is
68 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
69 * registered after that. The mutable section of this array is protected by
70 * cgroup_mutex.
71 */
60#define SUBSYS(_x) &_x ## _subsys, 72#define SUBSYS(_x) &_x ## _subsys,
61 73static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
62static struct cgroup_subsys *subsys[] = {
63#include <linux/cgroup_subsys.h> 74#include <linux/cgroup_subsys.h>
64}; 75};
65 76
@@ -146,6 +157,35 @@ struct css_id {
146 unsigned short stack[0]; /* Array of Length (depth+1) */ 157 unsigned short stack[0]; /* Array of Length (depth+1) */
147}; 158};
148 159
160/*
161 * cgroup_event represents events which userspace want to recieve.
162 */
163struct cgroup_event {
164 /*
165 * Cgroup which the event belongs to.
166 */
167 struct cgroup *cgrp;
168 /*
169 * Control file which the event associated.
170 */
171 struct cftype *cft;
172 /*
173 * eventfd to signal userspace about the event.
174 */
175 struct eventfd_ctx *eventfd;
176 /*
177 * Each of these stored in a list by the cgroup.
178 */
179 struct list_head list;
180 /*
181 * All fields below needed to unregister event when
182 * userspace closes eventfd.
183 */
184 poll_table pt;
185 wait_queue_head_t *wqh;
186 wait_queue_t wait;
187 struct work_struct remove;
188};
149 189
150/* The list of hierarchy roots */ 190/* The list of hierarchy roots */
151 191
@@ -166,6 +206,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
166 */ 206 */
167static int need_forkexit_callback __read_mostly; 207static int need_forkexit_callback __read_mostly;
168 208
209#ifdef CONFIG_PROVE_LOCKING
210int cgroup_lock_is_held(void)
211{
212 return lockdep_is_held(&cgroup_mutex);
213}
214#else /* #ifdef CONFIG_PROVE_LOCKING */
215int cgroup_lock_is_held(void)
216{
217 return mutex_is_locked(&cgroup_mutex);
218}
219#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
220
221EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
222
169/* convenient tests for these bits */ 223/* convenient tests for these bits */
170inline int cgroup_is_removed(const struct cgroup *cgrp) 224inline int cgroup_is_removed(const struct cgroup *cgrp)
171{ 225{
@@ -235,7 +289,8 @@ struct cg_cgroup_link {
235static struct css_set init_css_set; 289static struct css_set init_css_set;
236static struct cg_cgroup_link init_css_set_link; 290static struct cg_cgroup_link init_css_set_link;
237 291
238static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); 292static int cgroup_init_idr(struct cgroup_subsys *ss,
293 struct cgroup_subsys_state *css);
239 294
240/* css_set_lock protects the list of css_set objects, and the 295/* css_set_lock protects the list of css_set objects, and the
241 * chain of tasks off each css_set. Nests outside task->alloc_lock 296 * chain of tasks off each css_set. Nests outside task->alloc_lock
@@ -433,8 +488,11 @@ static struct css_set *find_existing_css_set(
433 struct hlist_node *node; 488 struct hlist_node *node;
434 struct css_set *cg; 489 struct css_set *cg;
435 490
436 /* Built the set of subsystem state objects that we want to 491 /*
437 * see in the new css_set */ 492 * Build the set of subsystem state objects that we want to see in the
493 * new css_set. while subsystems can change globally, the entries here
494 * won't change, so no need for locking.
495 */
438 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 496 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
439 if (root->subsys_bits & (1UL << i)) { 497 if (root->subsys_bits & (1UL << i)) {
440 /* Subsystem is in this hierarchy. So we want 498 /* Subsystem is in this hierarchy. So we want
@@ -681,6 +739,7 @@ void cgroup_lock(void)
681{ 739{
682 mutex_lock(&cgroup_mutex); 740 mutex_lock(&cgroup_mutex);
683} 741}
742EXPORT_SYMBOL_GPL(cgroup_lock);
684 743
685/** 744/**
686 * cgroup_unlock - release lock on cgroup changes 745 * cgroup_unlock - release lock on cgroup changes
@@ -691,6 +750,7 @@ void cgroup_unlock(void)
691{ 750{
692 mutex_unlock(&cgroup_mutex); 751 mutex_unlock(&cgroup_mutex);
693} 752}
753EXPORT_SYMBOL_GPL(cgroup_unlock);
694 754
695/* 755/*
696 * A couple of forward declarations required, due to cyclic reference loop: 756 * A couple of forward declarations required, due to cyclic reference loop:
@@ -742,6 +802,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
742 if (ret) 802 if (ret)
743 break; 803 break;
744 } 804 }
805
745 return ret; 806 return ret;
746} 807}
747 808
@@ -869,7 +930,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
869 css_put(css); 930 css_put(css);
870} 931}
871 932
872 933/*
934 * Call with cgroup_mutex held. Drops reference counts on modules, including
935 * any duplicate ones that parse_cgroupfs_options took. If this function
936 * returns an error, no reference counts are touched.
937 */
873static int rebind_subsystems(struct cgroupfs_root *root, 938static int rebind_subsystems(struct cgroupfs_root *root,
874 unsigned long final_bits) 939 unsigned long final_bits)
875{ 940{
@@ -877,6 +942,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
877 struct cgroup *cgrp = &root->top_cgroup; 942 struct cgroup *cgrp = &root->top_cgroup;
878 int i; 943 int i;
879 944
945 BUG_ON(!mutex_is_locked(&cgroup_mutex));
946
880 removed_bits = root->actual_subsys_bits & ~final_bits; 947 removed_bits = root->actual_subsys_bits & ~final_bits;
881 added_bits = final_bits & ~root->actual_subsys_bits; 948 added_bits = final_bits & ~root->actual_subsys_bits;
882 /* Check that any added subsystems are currently free */ 949 /* Check that any added subsystems are currently free */
@@ -885,6 +952,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
885 struct cgroup_subsys *ss = subsys[i]; 952 struct cgroup_subsys *ss = subsys[i];
886 if (!(bit & added_bits)) 953 if (!(bit & added_bits))
887 continue; 954 continue;
955 /*
956 * Nobody should tell us to do a subsys that doesn't exist:
957 * parse_cgroupfs_options should catch that case and refcounts
958 * ensure that subsystems won't disappear once selected.
959 */
960 BUG_ON(ss == NULL);
888 if (ss->root != &rootnode) { 961 if (ss->root != &rootnode) {
889 /* Subsystem isn't free */ 962 /* Subsystem isn't free */
890 return -EBUSY; 963 return -EBUSY;
@@ -904,6 +977,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
904 unsigned long bit = 1UL << i; 977 unsigned long bit = 1UL << i;
905 if (bit & added_bits) { 978 if (bit & added_bits) {
906 /* We're binding this subsystem to this hierarchy */ 979 /* We're binding this subsystem to this hierarchy */
980 BUG_ON(ss == NULL);
907 BUG_ON(cgrp->subsys[i]); 981 BUG_ON(cgrp->subsys[i]);
908 BUG_ON(!dummytop->subsys[i]); 982 BUG_ON(!dummytop->subsys[i]);
909 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 983 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -915,8 +989,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
915 if (ss->bind) 989 if (ss->bind)
916 ss->bind(ss, cgrp); 990 ss->bind(ss, cgrp);
917 mutex_unlock(&ss->hierarchy_mutex); 991 mutex_unlock(&ss->hierarchy_mutex);
992 /* refcount was already taken, and we're keeping it */
918 } else if (bit & removed_bits) { 993 } else if (bit & removed_bits) {
919 /* We're removing this subsystem */ 994 /* We're removing this subsystem */
995 BUG_ON(ss == NULL);
920 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 996 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
921 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 997 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
922 mutex_lock(&ss->hierarchy_mutex); 998 mutex_lock(&ss->hierarchy_mutex);
@@ -927,9 +1003,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
927 subsys[i]->root = &rootnode; 1003 subsys[i]->root = &rootnode;
928 list_move(&ss->sibling, &rootnode.subsys_list); 1004 list_move(&ss->sibling, &rootnode.subsys_list);
929 mutex_unlock(&ss->hierarchy_mutex); 1005 mutex_unlock(&ss->hierarchy_mutex);
1006 /* subsystem is now free - drop reference on module */
1007 module_put(ss->module);
930 } else if (bit & final_bits) { 1008 } else if (bit & final_bits) {
931 /* Subsystem state should already exist */ 1009 /* Subsystem state should already exist */
1010 BUG_ON(ss == NULL);
932 BUG_ON(!cgrp->subsys[i]); 1011 BUG_ON(!cgrp->subsys[i]);
1012 /*
1013 * a refcount was taken, but we already had one, so
1014 * drop the extra reference.
1015 */
1016 module_put(ss->module);
1017#ifdef CONFIG_MODULE_UNLOAD
1018 BUG_ON(ss->module && !module_refcount(ss->module));
1019#endif
933 } else { 1020 } else {
934 /* Subsystem state shouldn't exist */ 1021 /* Subsystem state shouldn't exist */
935 BUG_ON(cgrp->subsys[i]); 1022 BUG_ON(cgrp->subsys[i]);
@@ -971,13 +1058,20 @@ struct cgroup_sb_opts {
971 1058
972}; 1059};
973 1060
974/* Convert a hierarchy specifier into a bitmask of subsystems and 1061/*
975 * flags. */ 1062 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
976static int parse_cgroupfs_options(char *data, 1063 * with cgroup_mutex held to protect the subsys[] array. This function takes
977 struct cgroup_sb_opts *opts) 1064 * refcounts on subsystems to be used, unless it returns error, in which case
1065 * no refcounts are taken.
1066 */
1067static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
978{ 1068{
979 char *token, *o = data ?: "all"; 1069 char *token, *o = data ?: "all";
980 unsigned long mask = (unsigned long)-1; 1070 unsigned long mask = (unsigned long)-1;
1071 int i;
1072 bool module_pin_failed = false;
1073
1074 BUG_ON(!mutex_is_locked(&cgroup_mutex));
981 1075
982#ifdef CONFIG_CPUSETS 1076#ifdef CONFIG_CPUSETS
983 mask = ~(1UL << cpuset_subsys_id); 1077 mask = ~(1UL << cpuset_subsys_id);
@@ -990,10 +1084,11 @@ static int parse_cgroupfs_options(char *data,
990 return -EINVAL; 1084 return -EINVAL;
991 if (!strcmp(token, "all")) { 1085 if (!strcmp(token, "all")) {
992 /* Add all non-disabled subsystems */ 1086 /* Add all non-disabled subsystems */
993 int i;
994 opts->subsys_bits = 0; 1087 opts->subsys_bits = 0;
995 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1088 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
996 struct cgroup_subsys *ss = subsys[i]; 1089 struct cgroup_subsys *ss = subsys[i];
1090 if (ss == NULL)
1091 continue;
997 if (!ss->disabled) 1092 if (!ss->disabled)
998 opts->subsys_bits |= 1ul << i; 1093 opts->subsys_bits |= 1ul << i;
999 } 1094 }
@@ -1011,7 +1106,6 @@ static int parse_cgroupfs_options(char *data,
1011 if (!opts->release_agent) 1106 if (!opts->release_agent)
1012 return -ENOMEM; 1107 return -ENOMEM;
1013 } else if (!strncmp(token, "name=", 5)) { 1108 } else if (!strncmp(token, "name=", 5)) {
1014 int i;
1015 const char *name = token + 5; 1109 const char *name = token + 5;
1016 /* Can't specify an empty name */ 1110 /* Can't specify an empty name */
1017 if (!strlen(name)) 1111 if (!strlen(name))
@@ -1035,9 +1129,10 @@ static int parse_cgroupfs_options(char *data,
1035 return -ENOMEM; 1129 return -ENOMEM;
1036 } else { 1130 } else {
1037 struct cgroup_subsys *ss; 1131 struct cgroup_subsys *ss;
1038 int i;
1039 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1132 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1040 ss = subsys[i]; 1133 ss = subsys[i];
1134 if (ss == NULL)
1135 continue;
1041 if (!strcmp(token, ss->name)) { 1136 if (!strcmp(token, ss->name)) {
1042 if (!ss->disabled) 1137 if (!ss->disabled)
1043 set_bit(i, &opts->subsys_bits); 1138 set_bit(i, &opts->subsys_bits);
@@ -1072,9 +1167,54 @@ static int parse_cgroupfs_options(char *data,
1072 if (!opts->subsys_bits && !opts->name) 1167 if (!opts->subsys_bits && !opts->name)
1073 return -EINVAL; 1168 return -EINVAL;
1074 1169
1170 /*
1171 * Grab references on all the modules we'll need, so the subsystems
1172 * don't dance around before rebind_subsystems attaches them. This may
1173 * take duplicate reference counts on a subsystem that's already used,
1174 * but rebind_subsystems handles this case.
1175 */
1176 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1177 unsigned long bit = 1UL << i;
1178
1179 if (!(bit & opts->subsys_bits))
1180 continue;
1181 if (!try_module_get(subsys[i]->module)) {
1182 module_pin_failed = true;
1183 break;
1184 }
1185 }
1186 if (module_pin_failed) {
1187 /*
1188 * oops, one of the modules was going away. this means that we
1189 * raced with a module_delete call, and to the user this is
1190 * essentially a "subsystem doesn't exist" case.
1191 */
1192 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
1193 /* drop refcounts only on the ones we took */
1194 unsigned long bit = 1UL << i;
1195
1196 if (!(bit & opts->subsys_bits))
1197 continue;
1198 module_put(subsys[i]->module);
1199 }
1200 return -ENOENT;
1201 }
1202
1075 return 0; 1203 return 0;
1076} 1204}
1077 1205
1206static void drop_parsed_module_refcounts(unsigned long subsys_bits)
1207{
1208 int i;
1209 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1210 unsigned long bit = 1UL << i;
1211
1212 if (!(bit & subsys_bits))
1213 continue;
1214 module_put(subsys[i]->module);
1215 }
1216}
1217
1078static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1218static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1079{ 1219{
1080 int ret = 0; 1220 int ret = 0;
@@ -1091,21 +1231,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1091 if (ret) 1231 if (ret)
1092 goto out_unlock; 1232 goto out_unlock;
1093 1233
1094 /* Don't allow flags to change at remount */ 1234 /* Don't allow flags or name to change at remount */
1095 if (opts.flags != root->flags) { 1235 if (opts.flags != root->flags ||
1096 ret = -EINVAL; 1236 (opts.name && strcmp(opts.name, root->name))) {
1097 goto out_unlock;
1098 }
1099
1100 /* Don't allow name to change at remount */
1101 if (opts.name && strcmp(opts.name, root->name)) {
1102 ret = -EINVAL; 1237 ret = -EINVAL;
1238 drop_parsed_module_refcounts(opts.subsys_bits);
1103 goto out_unlock; 1239 goto out_unlock;
1104 } 1240 }
1105 1241
1106 ret = rebind_subsystems(root, opts.subsys_bits); 1242 ret = rebind_subsystems(root, opts.subsys_bits);
1107 if (ret) 1243 if (ret) {
1244 drop_parsed_module_refcounts(opts.subsys_bits);
1108 goto out_unlock; 1245 goto out_unlock;
1246 }
1109 1247
1110 /* (re)populate subsystem files */ 1248 /* (re)populate subsystem files */
1111 cgroup_populate_dir(cgrp); 1249 cgroup_populate_dir(cgrp);
@@ -1136,6 +1274,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1136 INIT_LIST_HEAD(&cgrp->release_list); 1274 INIT_LIST_HEAD(&cgrp->release_list);
1137 INIT_LIST_HEAD(&cgrp->pidlists); 1275 INIT_LIST_HEAD(&cgrp->pidlists);
1138 mutex_init(&cgrp->pidlist_mutex); 1276 mutex_init(&cgrp->pidlist_mutex);
1277 INIT_LIST_HEAD(&cgrp->event_list);
1278 spin_lock_init(&cgrp->event_list_lock);
1139} 1279}
1140 1280
1141static void init_cgroup_root(struct cgroupfs_root *root) 1281static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1291,7 +1431,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1291 struct cgroupfs_root *new_root; 1431 struct cgroupfs_root *new_root;
1292 1432
1293 /* First find the desired set of subsystems */ 1433 /* First find the desired set of subsystems */
1434 mutex_lock(&cgroup_mutex);
1294 ret = parse_cgroupfs_options(data, &opts); 1435 ret = parse_cgroupfs_options(data, &opts);
1436 mutex_unlock(&cgroup_mutex);
1295 if (ret) 1437 if (ret)
1296 goto out_err; 1438 goto out_err;
1297 1439
@@ -1302,7 +1444,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1302 new_root = cgroup_root_from_opts(&opts); 1444 new_root = cgroup_root_from_opts(&opts);
1303 if (IS_ERR(new_root)) { 1445 if (IS_ERR(new_root)) {
1304 ret = PTR_ERR(new_root); 1446 ret = PTR_ERR(new_root);
1305 goto out_err; 1447 goto drop_modules;
1306 } 1448 }
1307 opts.new_root = new_root; 1449 opts.new_root = new_root;
1308 1450
@@ -1311,7 +1453,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1311 if (IS_ERR(sb)) { 1453 if (IS_ERR(sb)) {
1312 ret = PTR_ERR(sb); 1454 ret = PTR_ERR(sb);
1313 cgroup_drop_root(opts.new_root); 1455 cgroup_drop_root(opts.new_root);
1314 goto out_err; 1456 goto drop_modules;
1315 } 1457 }
1316 1458
1317 root = sb->s_fs_info; 1459 root = sb->s_fs_info;
@@ -1367,6 +1509,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1367 free_cg_links(&tmp_cg_links); 1509 free_cg_links(&tmp_cg_links);
1368 goto drop_new_super; 1510 goto drop_new_super;
1369 } 1511 }
1512 /*
1513 * There must be no failure case after here, since rebinding
1514 * takes care of subsystems' refcounts, which are explicitly
1515 * dropped in the failure exit path.
1516 */
1370 1517
1371 /* EBUSY should be the only error here */ 1518 /* EBUSY should be the only error here */
1372 BUG_ON(ret); 1519 BUG_ON(ret);
@@ -1405,6 +1552,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1405 * any) is not needed 1552 * any) is not needed
1406 */ 1553 */
1407 cgroup_drop_root(opts.new_root); 1554 cgroup_drop_root(opts.new_root);
1555 /* no subsys rebinding, so refcounts don't change */
1556 drop_parsed_module_refcounts(opts.subsys_bits);
1408 } 1557 }
1409 1558
1410 simple_set_mnt(mnt, sb); 1559 simple_set_mnt(mnt, sb);
@@ -1414,6 +1563,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1414 1563
1415 drop_new_super: 1564 drop_new_super:
1416 deactivate_locked_super(sb); 1565 deactivate_locked_super(sb);
1566 drop_modules:
1567 drop_parsed_module_refcounts(opts.subsys_bits);
1417 out_err: 1568 out_err:
1418 kfree(opts.release_agent); 1569 kfree(opts.release_agent);
1419 kfree(opts.name); 1570 kfree(opts.name);
@@ -1495,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
1495int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1646int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1496{ 1647{
1497 char *start; 1648 char *start;
1498 struct dentry *dentry = rcu_dereference(cgrp->dentry); 1649 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1650 rcu_read_lock_held() ||
1651 cgroup_lock_is_held());
1499 1652
1500 if (!dentry || cgrp == dummytop) { 1653 if (!dentry || cgrp == dummytop) {
1501 /* 1654 /*
@@ -1511,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1511 *--start = '\0'; 1664 *--start = '\0';
1512 for (;;) { 1665 for (;;) {
1513 int len = dentry->d_name.len; 1666 int len = dentry->d_name.len;
1667
1514 if ((start -= len) < buf) 1668 if ((start -= len) < buf)
1515 return -ENAMETOOLONG; 1669 return -ENAMETOOLONG;
1516 memcpy(start, cgrp->dentry->d_name.name, len); 1670 memcpy(start, dentry->d_name.name, len);
1517 cgrp = cgrp->parent; 1671 cgrp = cgrp->parent;
1518 if (!cgrp) 1672 if (!cgrp)
1519 break; 1673 break;
1520 dentry = rcu_dereference(cgrp->dentry); 1674
1675 dentry = rcu_dereference_check(cgrp->dentry,
1676 rcu_read_lock_held() ||
1677 cgroup_lock_is_held());
1521 if (!cgrp->parent) 1678 if (!cgrp->parent)
1522 continue; 1679 continue;
1523 if (--start < buf) 1680 if (--start < buf)
@@ -1527,6 +1684,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1527 memmove(buf, start, buf + buflen - start); 1684 memmove(buf, start, buf + buflen - start);
1528 return 0; 1685 return 0;
1529} 1686}
1687EXPORT_SYMBOL_GPL(cgroup_path);
1530 1688
1531/** 1689/**
1532 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1690 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1539,7 +1697,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1539int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1697int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1540{ 1698{
1541 int retval = 0; 1699 int retval = 0;
1542 struct cgroup_subsys *ss; 1700 struct cgroup_subsys *ss, *failed_ss = NULL;
1543 struct cgroup *oldcgrp; 1701 struct cgroup *oldcgrp;
1544 struct css_set *cg; 1702 struct css_set *cg;
1545 struct css_set *newcg; 1703 struct css_set *newcg;
@@ -1553,8 +1711,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1553 for_each_subsys(root, ss) { 1711 for_each_subsys(root, ss) {
1554 if (ss->can_attach) { 1712 if (ss->can_attach) {
1555 retval = ss->can_attach(ss, cgrp, tsk, false); 1713 retval = ss->can_attach(ss, cgrp, tsk, false);
1556 if (retval) 1714 if (retval) {
1557 return retval; 1715 /*
1716 * Remember on which subsystem the can_attach()
1717 * failed, so that we only call cancel_attach()
1718 * against the subsystems whose can_attach()
1719 * succeeded. (See below)
1720 */
1721 failed_ss = ss;
1722 goto out;
1723 }
1558 } 1724 }
1559 } 1725 }
1560 1726
@@ -1568,14 +1734,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1568 */ 1734 */
1569 newcg = find_css_set(cg, cgrp); 1735 newcg = find_css_set(cg, cgrp);
1570 put_css_set(cg); 1736 put_css_set(cg);
1571 if (!newcg) 1737 if (!newcg) {
1572 return -ENOMEM; 1738 retval = -ENOMEM;
1739 goto out;
1740 }
1573 1741
1574 task_lock(tsk); 1742 task_lock(tsk);
1575 if (tsk->flags & PF_EXITING) { 1743 if (tsk->flags & PF_EXITING) {
1576 task_unlock(tsk); 1744 task_unlock(tsk);
1577 put_css_set(newcg); 1745 put_css_set(newcg);
1578 return -ESRCH; 1746 retval = -ESRCH;
1747 goto out;
1579 } 1748 }
1580 rcu_assign_pointer(tsk->cgroups, newcg); 1749 rcu_assign_pointer(tsk->cgroups, newcg);
1581 task_unlock(tsk); 1750 task_unlock(tsk);
@@ -1601,7 +1770,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1601 * is no longer empty. 1770 * is no longer empty.
1602 */ 1771 */
1603 cgroup_wakeup_rmdir_waiter(cgrp); 1772 cgroup_wakeup_rmdir_waiter(cgrp);
1604 return 0; 1773out:
1774 if (retval) {
1775 for_each_subsys(root, ss) {
1776 if (ss == failed_ss)
1777 /*
1778 * This subsystem was the one that failed the
1779 * can_attach() check earlier, so we don't need
1780 * to call cancel_attach() against it or any
1781 * remaining subsystems.
1782 */
1783 break;
1784 if (ss->cancel_attach)
1785 ss->cancel_attach(ss, cgrp, tsk, false);
1786 }
1787 }
1788 return retval;
1605} 1789}
1606 1790
1607/* 1791/*
@@ -1667,6 +1851,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
1667 } 1851 }
1668 return true; 1852 return true;
1669} 1853}
1854EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
1670 1855
1671static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 1856static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1672 const char *buffer) 1857 const char *buffer)
@@ -1935,6 +2120,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
1935 .rename = cgroup_rename, 2120 .rename = cgroup_rename,
1936}; 2121};
1937 2122
2123/*
2124 * Check if a file is a control file
2125 */
2126static inline struct cftype *__file_cft(struct file *file)
2127{
2128 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2129 return ERR_PTR(-EINVAL);
2130 return __d_cft(file->f_dentry);
2131}
2132
1938static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2133static int cgroup_create_file(struct dentry *dentry, mode_t mode,
1939 struct super_block *sb) 2134 struct super_block *sb)
1940{ 2135{
@@ -2054,6 +2249,7 @@ int cgroup_add_file(struct cgroup *cgrp,
2054 error = PTR_ERR(dentry); 2249 error = PTR_ERR(dentry);
2055 return error; 2250 return error;
2056} 2251}
2252EXPORT_SYMBOL_GPL(cgroup_add_file);
2057 2253
2058int cgroup_add_files(struct cgroup *cgrp, 2254int cgroup_add_files(struct cgroup *cgrp,
2059 struct cgroup_subsys *subsys, 2255 struct cgroup_subsys *subsys,
@@ -2068,6 +2264,7 @@ int cgroup_add_files(struct cgroup *cgrp,
2068 } 2264 }
2069 return 0; 2265 return 0;
2070} 2266}
2267EXPORT_SYMBOL_GPL(cgroup_add_files);
2071 2268
2072/** 2269/**
2073 * cgroup_task_count - count the number of tasks in a cgroup. 2270 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2453,7 +2650,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2453{ 2650{
2454 struct cgroup_pidlist *l; 2651 struct cgroup_pidlist *l;
2455 /* don't need task_nsproxy() if we're looking at ourself */ 2652 /* don't need task_nsproxy() if we're looking at ourself */
2456 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); 2653 struct pid_namespace *ns = current->nsproxy->pid_ns;
2654
2457 /* 2655 /*
2458 * We can't drop the pidlist_mutex before taking the l->mutex in case 2656 * We can't drop the pidlist_mutex before taking the l->mutex in case
2459 * the last ref-holder is trying to remove l from the list at the same 2657 * the last ref-holder is trying to remove l from the list at the same
@@ -2463,8 +2661,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2463 mutex_lock(&cgrp->pidlist_mutex); 2661 mutex_lock(&cgrp->pidlist_mutex);
2464 list_for_each_entry(l, &cgrp->pidlists, links) { 2662 list_for_each_entry(l, &cgrp->pidlists, links) {
2465 if (l->key.type == type && l->key.ns == ns) { 2663 if (l->key.type == type && l->key.ns == ns) {
2466 /* found a matching list - drop the extra refcount */
2467 put_pid_ns(ns);
2468 /* make sure l doesn't vanish out from under us */ 2664 /* make sure l doesn't vanish out from under us */
2469 down_write(&l->mutex); 2665 down_write(&l->mutex);
2470 mutex_unlock(&cgrp->pidlist_mutex); 2666 mutex_unlock(&cgrp->pidlist_mutex);
@@ -2475,13 +2671,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2475 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 2671 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2476 if (!l) { 2672 if (!l) {
2477 mutex_unlock(&cgrp->pidlist_mutex); 2673 mutex_unlock(&cgrp->pidlist_mutex);
2478 put_pid_ns(ns);
2479 return l; 2674 return l;
2480 } 2675 }
2481 init_rwsem(&l->mutex); 2676 init_rwsem(&l->mutex);
2482 down_write(&l->mutex); 2677 down_write(&l->mutex);
2483 l->key.type = type; 2678 l->key.type = type;
2484 l->key.ns = ns; 2679 l->key.ns = get_pid_ns(ns);
2485 l->use_count = 0; /* don't increment here */ 2680 l->use_count = 0; /* don't increment here */
2486 l->list = NULL; 2681 l->list = NULL;
2487 l->owner = cgrp; 2682 l->owner = cgrp;
@@ -2789,6 +2984,173 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2789} 2984}
2790 2985
2791/* 2986/*
2987 * Unregister event and free resources.
2988 *
2989 * Gets called from workqueue.
2990 */
2991static void cgroup_event_remove(struct work_struct *work)
2992{
2993 struct cgroup_event *event = container_of(work, struct cgroup_event,
2994 remove);
2995 struct cgroup *cgrp = event->cgrp;
2996
2997 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
2998
2999 eventfd_ctx_put(event->eventfd);
3000 kfree(event);
3001 dput(cgrp->dentry);
3002}
3003
3004/*
3005 * Gets called on POLLHUP on eventfd when user closes it.
3006 *
3007 * Called with wqh->lock held and interrupts disabled.
3008 */
3009static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3010 int sync, void *key)
3011{
3012 struct cgroup_event *event = container_of(wait,
3013 struct cgroup_event, wait);
3014 struct cgroup *cgrp = event->cgrp;
3015 unsigned long flags = (unsigned long)key;
3016
3017 if (flags & POLLHUP) {
3018 __remove_wait_queue(event->wqh, &event->wait);
3019 spin_lock(&cgrp->event_list_lock);
3020 list_del(&event->list);
3021 spin_unlock(&cgrp->event_list_lock);
3022 /*
3023 * We are in atomic context, but cgroup_event_remove() may
3024 * sleep, so we have to call it in workqueue.
3025 */
3026 schedule_work(&event->remove);
3027 }
3028
3029 return 0;
3030}
3031
3032static void cgroup_event_ptable_queue_proc(struct file *file,
3033 wait_queue_head_t *wqh, poll_table *pt)
3034{
3035 struct cgroup_event *event = container_of(pt,
3036 struct cgroup_event, pt);
3037
3038 event->wqh = wqh;
3039 add_wait_queue(wqh, &event->wait);
3040}
3041
3042/*
3043 * Parse the input and register a new cgroup event handler.
3044 *
3045 * Input must be in the format '<event_fd> <control_fd> <args>'.
3046 * Interpretation of args is left to the control file implementation.
3047 */
3048static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3049 const char *buffer)
3050{
3051 struct cgroup_event *event = NULL;
3052 unsigned int efd, cfd;
3053 struct file *efile = NULL;
3054 struct file *cfile = NULL;
3055 char *endp;
3056 int ret;
3057
3058 efd = simple_strtoul(buffer, &endp, 10);
3059 if (*endp != ' ')
3060 return -EINVAL;
3061 buffer = endp + 1;
3062
3063 cfd = simple_strtoul(buffer, &endp, 10);
3064 if ((*endp != ' ') && (*endp != '\0'))
3065 return -EINVAL;
3066 buffer = endp + 1;
3067
3068 event = kzalloc(sizeof(*event), GFP_KERNEL);
3069 if (!event)
3070 return -ENOMEM;
3071 event->cgrp = cgrp;
3072 INIT_LIST_HEAD(&event->list);
3073 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3074 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3075 INIT_WORK(&event->remove, cgroup_event_remove);
3076
3077 efile = eventfd_fget(efd);
3078 if (IS_ERR(efile)) {
3079 ret = PTR_ERR(efile);
3080 goto fail;
3081 }
3082
3083 event->eventfd = eventfd_ctx_fileget(efile);
3084 if (IS_ERR(event->eventfd)) {
3085 ret = PTR_ERR(event->eventfd);
3086 goto fail;
3087 }
3088
3089 cfile = fget(cfd);
3090 if (!cfile) {
3091 ret = -EBADF;
3092 goto fail;
3093 }
3094
3095	/* the process needs read permission on the control file */
3096 ret = file_permission(cfile, MAY_READ);
3097 if (ret < 0)
3098 goto fail;
3099
3100 event->cft = __file_cft(cfile);
3101 if (IS_ERR(event->cft)) {
3102 ret = PTR_ERR(event->cft);
3103 goto fail;
3104 }
3105
3106 if (!event->cft->register_event || !event->cft->unregister_event) {
3107 ret = -EINVAL;
3108 goto fail;
3109 }
3110
3111 ret = event->cft->register_event(cgrp, event->cft,
3112 event->eventfd, buffer);
3113 if (ret)
3114 goto fail;
3115
3116 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3117 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3118 ret = 0;
3119 goto fail;
3120 }
3121
3122 /*
3123 * Events should be removed after rmdir of cgroup directory, but before
3124 * destroying subsystem state objects. Let's take reference to cgroup
3125 * directory dentry to do that.
3126 */
3127 dget(cgrp->dentry);
3128
3129 spin_lock(&cgrp->event_list_lock);
3130 list_add(&event->list, &cgrp->event_list);
3131 spin_unlock(&cgrp->event_list_lock);
3132
3133 fput(cfile);
3134 fput(efile);
3135
3136 return 0;
3137
3138fail:
3139 if (cfile)
3140 fput(cfile);
3141
3142 if (event && event->eventfd && !IS_ERR(event->eventfd))
3143 eventfd_ctx_put(event->eventfd);
3144
3145 if (!IS_ERR_OR_NULL(efile))
3146 fput(efile);
3147
3148 kfree(event);
3149
3150 return ret;
3151}
3152
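A note on usage: the interface added above is driven entirely from userspace by writing "<event_fd> <control_fd> <args>" to cgroup.event_control and then blocking on the eventfd. Below is a minimal userspace sketch; it is not part of this patch, and the mount point, the control file (a memcg usage threshold) and the threshold value are illustrative assumptions.

/* userspace sketch: register an eventfd notification on a control file
 * via cgroup.event_control, then wait for it to fire. rmdir of the cgroup
 * also signals the eventfd, as handled in cgroup_rmdir() further down. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/eventfd.h>

int main(void)
{
	int efd = eventfd(0, 0);
	int cfd = open("/cgroups/memory/mygroup/memory.usage_in_bytes", O_RDONLY);
	int ecfd = open("/cgroups/memory/mygroup/cgroup.event_control", O_WRONLY);
	char buf[64];
	uint64_t count;

	if (efd < 0 || cfd < 0 || ecfd < 0)
		return 1;

	/* "<event_fd> <control_fd> <args>"; args here is a byte threshold */
	snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd,
		 (unsigned long long)(64 << 20));
	if (write(ecfd, buf, strlen(buf)) < 0)
		return 1;

	/* blocks until the control file's handler signals the eventfd */
	if (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("event fired, count=%llu\n", (unsigned long long)count);
	return 0;
}
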
3153/*
2792 * for the common functions, 'private' gives the type of file 3154 * for the common functions, 'private' gives the type of file
2793 */ 3155 */
2794/* for hysterical raisins, we can't put this on the older files */ 3156/* for hysterical raisins, we can't put this on the older files */
@@ -2813,6 +3175,11 @@ static struct cftype files[] = {
2813 .read_u64 = cgroup_read_notify_on_release, 3175 .read_u64 = cgroup_read_notify_on_release,
2814 .write_u64 = cgroup_write_notify_on_release, 3176 .write_u64 = cgroup_write_notify_on_release,
2815 }, 3177 },
3178 {
3179 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3180 .write_string = cgroup_write_event_control,
3181 .mode = S_IWUGO,
3182 },
2816}; 3183};
2817 3184
2818static struct cftype cft_release_agent = { 3185static struct cftype cft_release_agent = {
@@ -2877,8 +3244,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2877 /* We need to take each hierarchy_mutex in a consistent order */ 3244 /* We need to take each hierarchy_mutex in a consistent order */
2878 int i; 3245 int i;
2879 3246
3247 /*
3248 * No worry about a race with rebind_subsystems that might mess up the
3249 * locking order, since both parties are under cgroup_mutex.
3250 */
2880 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3251 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2881 struct cgroup_subsys *ss = subsys[i]; 3252 struct cgroup_subsys *ss = subsys[i];
3253 if (ss == NULL)
3254 continue;
2882 if (ss->root == root) 3255 if (ss->root == root)
2883 mutex_lock(&ss->hierarchy_mutex); 3256 mutex_lock(&ss->hierarchy_mutex);
2884 } 3257 }
@@ -2890,6 +3263,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2890 3263
2891 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3264 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2892 struct cgroup_subsys *ss = subsys[i]; 3265 struct cgroup_subsys *ss = subsys[i];
3266 if (ss == NULL)
3267 continue;
2893 if (ss->root == root) 3268 if (ss->root == root)
2894 mutex_unlock(&ss->hierarchy_mutex); 3269 mutex_unlock(&ss->hierarchy_mutex);
2895 } 3270 }
@@ -2936,14 +3311,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2936 3311
2937 for_each_subsys(root, ss) { 3312 for_each_subsys(root, ss) {
2938 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3313 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3314
2939 if (IS_ERR(css)) { 3315 if (IS_ERR(css)) {
2940 err = PTR_ERR(css); 3316 err = PTR_ERR(css);
2941 goto err_destroy; 3317 goto err_destroy;
2942 } 3318 }
2943 init_cgroup_css(css, ss, cgrp); 3319 init_cgroup_css(css, ss, cgrp);
2944 if (ss->use_id) 3320 if (ss->use_id) {
2945 if (alloc_css_id(ss, parent, cgrp)) 3321 err = alloc_css_id(ss, parent, cgrp);
3322 if (err)
2946 goto err_destroy; 3323 goto err_destroy;
3324 }
2947 /* At error, ->destroy() callback has to free assigned ID. */ 3325 /* At error, ->destroy() callback has to free assigned ID. */
2948 } 3326 }
2949 3327
@@ -3010,11 +3388,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3010 * synchronization other than RCU, and the subsystem linked 3388 * synchronization other than RCU, and the subsystem linked
3011 * list isn't RCU-safe */ 3389 * list isn't RCU-safe */
3012 int i; 3390 int i;
3391 /*
3392 * We won't need to lock the subsys array, because the subsystems
3393 * we're concerned about aren't going anywhere since our cgroup root
3394 * has a reference on them.
3395 */
3013 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3396 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3014 struct cgroup_subsys *ss = subsys[i]; 3397 struct cgroup_subsys *ss = subsys[i];
3015 struct cgroup_subsys_state *css; 3398 struct cgroup_subsys_state *css;
3016 /* Skip subsystems not in this hierarchy */ 3399 /* Skip subsystems not present or not in this hierarchy */
3017 if (ss->root != cgrp->root) 3400 if (ss == NULL || ss->root != cgrp->root)
3018 continue; 3401 continue;
3019 css = cgrp->subsys[ss->subsys_id]; 3402 css = cgrp->subsys[ss->subsys_id];
3020 /* When called from check_for_release() it's possible 3403 /* When called from check_for_release() it's possible
@@ -3088,6 +3471,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3088 struct dentry *d; 3471 struct dentry *d;
3089 struct cgroup *parent; 3472 struct cgroup *parent;
3090 DEFINE_WAIT(wait); 3473 DEFINE_WAIT(wait);
3474 struct cgroup_event *event, *tmp;
3091 int ret; 3475 int ret;
3092 3476
3093 /* the vfs holds both inode->i_mutex already */ 3477 /* the vfs holds both inode->i_mutex already */
@@ -3171,6 +3555,20 @@ again:
3171 set_bit(CGRP_RELEASABLE, &parent->flags); 3555 set_bit(CGRP_RELEASABLE, &parent->flags);
3172 check_for_release(parent); 3556 check_for_release(parent);
3173 3557
3558 /*
3559 * Unregister events and notify userspace.
3560	 * Notify userspace about cgroup removal only after rmdir of the cgroup
3561	 * directory, to avoid a race between userspace and kernelspace.
3562 */
3563 spin_lock(&cgrp->event_list_lock);
3564 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
3565 list_del(&event->list);
3566 remove_wait_queue(event->wqh, &event->wait);
3567 eventfd_signal(event->eventfd, 1);
3568 schedule_work(&event->remove);
3569 }
3570 spin_unlock(&cgrp->event_list_lock);
3571
3174 mutex_unlock(&cgroup_mutex); 3572 mutex_unlock(&cgroup_mutex);
3175 return 0; 3573 return 0;
3176} 3574}
@@ -3205,9 +3603,198 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
3205 mutex_init(&ss->hierarchy_mutex); 3603 mutex_init(&ss->hierarchy_mutex);
3206 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); 3604 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3207 ss->active = 1; 3605 ss->active = 1;
3606
3607 /* this function shouldn't be used with modular subsystems, since they
3608 * need to register a subsys_id, among other things */
3609 BUG_ON(ss->module);
3208} 3610}
3209 3611
3210/** 3612/**
3613 * cgroup_load_subsys: load and register a modular subsystem at runtime
3614 * @ss: the subsystem to load
3615 *
3616 * This function should be called in a modular subsystem's initcall. If the
3617 * subsystem is built as a module, it will be assigned a new subsys_id and set
3618 * up for use. If the subsystem is built-in anyway, work is delegated to the
3619 * simpler cgroup_init_subsys.
3620 */
3621int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
3622{
3623 int i;
3624 struct cgroup_subsys_state *css;
3625
3626 /* check name and function validity */
3627 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
3628 ss->create == NULL || ss->destroy == NULL)
3629 return -EINVAL;
3630
3631 /*
3632 * we don't support callbacks in modular subsystems. this check is
3633 * before the ss->module check for consistency; a subsystem that could
3634 * be a module should still have no callbacks even if the user isn't
3635 * compiling it as one.
3636 */
3637 if (ss->fork || ss->exit)
3638 return -EINVAL;
3639
3640 /*
3641 * an optionally modular subsystem is built-in: we want to do nothing,
3642 * since cgroup_init_subsys will have already taken care of it.
3643 */
3644 if (ss->module == NULL) {
3645 /* a few sanity checks */
3646 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
3647 BUG_ON(subsys[ss->subsys_id] != ss);
3648 return 0;
3649 }
3650
3651 /*
3652 * need to register a subsys id before anything else - for example,
3653 * init_cgroup_css needs it.
3654 */
3655 mutex_lock(&cgroup_mutex);
3656 /* find the first empty slot in the array */
3657 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
3658 if (subsys[i] == NULL)
3659 break;
3660 }
3661 if (i == CGROUP_SUBSYS_COUNT) {
3662 /* maximum number of subsystems already registered! */
3663 mutex_unlock(&cgroup_mutex);
3664 return -EBUSY;
3665 }
3666 /* assign ourselves the subsys_id */
3667 ss->subsys_id = i;
3668 subsys[i] = ss;
3669
3670 /*
3671 * no ss->create seems to need anything important in the ss struct, so
3672 * this can happen first (i.e. before the rootnode attachment).
3673 */
3674 css = ss->create(ss, dummytop);
3675 if (IS_ERR(css)) {
3676 /* failure case - need to deassign the subsys[] slot. */
3677 subsys[i] = NULL;
3678 mutex_unlock(&cgroup_mutex);
3679 return PTR_ERR(css);
3680 }
3681
3682 list_add(&ss->sibling, &rootnode.subsys_list);
3683 ss->root = &rootnode;
3684
3685 /* our new subsystem will be attached to the dummy hierarchy. */
3686 init_cgroup_css(css, ss, dummytop);
3687 /* init_idr must be after init_cgroup_css because it sets css->id. */
3688 if (ss->use_id) {
3689 int ret = cgroup_init_idr(ss, css);
3690 if (ret) {
3691 dummytop->subsys[ss->subsys_id] = NULL;
3692 ss->destroy(ss, dummytop);
3693 subsys[i] = NULL;
3694 mutex_unlock(&cgroup_mutex);
3695 return ret;
3696 }
3697 }
3698
3699 /*
3700 * Now we need to entangle the css into the existing css_sets. unlike
3701 * in cgroup_init_subsys, there are now multiple css_sets, so each one
3702 * will need a new pointer to it; done by iterating the css_set_table.
3703 * furthermore, modifying the existing css_sets will corrupt the hash
3704 * table state, so each changed css_set will need its hash recomputed.
3705 * this is all done under the css_set_lock.
3706 */
3707 write_lock(&css_set_lock);
3708 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
3709 struct css_set *cg;
3710 struct hlist_node *node, *tmp;
3711 struct hlist_head *bucket = &css_set_table[i], *new_bucket;
3712
3713 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
3714 /* skip entries that we already rehashed */
3715 if (cg->subsys[ss->subsys_id])
3716 continue;
3717 /* remove existing entry */
3718 hlist_del(&cg->hlist);
3719 /* set new value */
3720 cg->subsys[ss->subsys_id] = css;
3721 /* recompute hash and restore entry */
3722 new_bucket = css_set_hash(cg->subsys);
3723 hlist_add_head(&cg->hlist, new_bucket);
3724 }
3725 }
3726 write_unlock(&css_set_lock);
3727
3728 mutex_init(&ss->hierarchy_mutex);
3729 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3730 ss->active = 1;
3731
3732 /* success! */
3733 mutex_unlock(&cgroup_mutex);
3734 return 0;
3735}
3736EXPORT_SYMBOL_GPL(cgroup_load_subsys);
3737
3738/**
3739 * cgroup_unload_subsys: unload a modular subsystem
3740 * @ss: the subsystem to unload
3741 *
3742 * This function should be called in a modular subsystem's exitcall. When this
3743 * function is invoked, the refcount on the subsystem's module will be 0, so
3744 * the subsystem will not be attached to any hierarchy.
3745 */
3746void cgroup_unload_subsys(struct cgroup_subsys *ss)
3747{
3748 struct cg_cgroup_link *link;
3749 struct hlist_head *hhead;
3750
3751 BUG_ON(ss->module == NULL);
3752
3753 /*
3754 * we shouldn't be called if the subsystem is in use, and the use of
3755 * try_module_get in parse_cgroupfs_options should ensure that it
3756 * doesn't start being used while we're killing it off.
3757 */
3758 BUG_ON(ss->root != &rootnode);
3759
3760 mutex_lock(&cgroup_mutex);
3761 /* deassign the subsys_id */
3762 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
3763 subsys[ss->subsys_id] = NULL;
3764
3765 /* remove subsystem from rootnode's list of subsystems */
3766 list_del(&ss->sibling);
3767
3768 /*
3769 * disentangle the css from all css_sets attached to the dummytop. as
3770 * in loading, we need to pay our respects to the hashtable gods.
3771 */
3772 write_lock(&css_set_lock);
3773 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
3774 struct css_set *cg = link->cg;
3775
3776 hlist_del(&cg->hlist);
3777 BUG_ON(!cg->subsys[ss->subsys_id]);
3778 cg->subsys[ss->subsys_id] = NULL;
3779 hhead = css_set_hash(cg->subsys);
3780 hlist_add_head(&cg->hlist, hhead);
3781 }
3782 write_unlock(&css_set_lock);
3783
3784 /*
3785 * remove subsystem's css from the dummytop and free it - need to free
3786 * before marking as null because ss->destroy needs the cgrp->subsys
3787 * pointer to find their state. note that this also takes care of
3788 * freeing the css_id.
3789 */
3790 ss->destroy(ss, dummytop);
3791 dummytop->subsys[ss->subsys_id] = NULL;
3792
3793 mutex_unlock(&cgroup_mutex);
3794}
3795EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
3796
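For orientation, a modular controller is expected to pair these two entry points in its module init/exit path. The sketch below is not taken from this patch; the controller name and callbacks are hypothetical, and it only exercises the fields that cgroup_load_subsys() actually checks (name, create, destroy, module; fork/exit callbacks are rejected for modules).

/* sketch of a minimal modular subsystem; all names are hypothetical */
#include <linux/module.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/err.h>

static struct cgroup_subsys_state *foo_create(struct cgroup_subsys *ss,
					      struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

	return css ? css : ERR_PTR(-ENOMEM);
}

static void foo_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	kfree(cgrp->subsys[ss->subsys_id]);
}

struct cgroup_subsys foo_subsys = {
	.name		= "foo",
	.create		= foo_create,
	.destroy	= foo_destroy,
	.module		= THIS_MODULE,	/* subsys_id is assigned at load time */
};

static int __init foo_module_init(void)
{
	return cgroup_load_subsys(&foo_subsys);
}
module_init(foo_module_init);

static void __exit foo_module_exit(void)
{
	cgroup_unload_subsys(&foo_subsys);
}
module_exit(foo_module_exit);

MODULE_LICENSE("GPL");
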
3797/**
3211 * cgroup_init_early - cgroup initialization at system boot 3798 * cgroup_init_early - cgroup initialization at system boot
3212 * 3799 *
3213 * Initialize cgroups at system boot, and initialize any 3800 * Initialize cgroups at system boot, and initialize any
@@ -3235,7 +3822,8 @@ int __init cgroup_init_early(void)
3235 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 3822 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
3236 INIT_HLIST_HEAD(&css_set_table[i]); 3823 INIT_HLIST_HEAD(&css_set_table[i]);
3237 3824
3238 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3825 /* at bootup time, we don't worry about modular subsystems */
3826 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3239 struct cgroup_subsys *ss = subsys[i]; 3827 struct cgroup_subsys *ss = subsys[i];
3240 3828
3241 BUG_ON(!ss->name); 3829 BUG_ON(!ss->name);
@@ -3270,12 +3858,13 @@ int __init cgroup_init(void)
3270 if (err) 3858 if (err)
3271 return err; 3859 return err;
3272 3860
3273 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3861 /* at bootup time, we don't worry about modular subsystems */
3862 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3274 struct cgroup_subsys *ss = subsys[i]; 3863 struct cgroup_subsys *ss = subsys[i];
3275 if (!ss->early_init) 3864 if (!ss->early_init)
3276 cgroup_init_subsys(ss); 3865 cgroup_init_subsys(ss);
3277 if (ss->use_id) 3866 if (ss->use_id)
3278 cgroup_subsys_init_idr(ss); 3867 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
3279 } 3868 }
3280 3869
3281 /* Add init_css_set to the hash table */ 3870 /* Add init_css_set to the hash table */
@@ -3379,9 +3968,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3379 int i; 3968 int i;
3380 3969
3381 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 3970 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
3971 /*
3972 * ideally we don't want subsystems moving around while we do this.
3973 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
3974 * subsys/hierarchy state.
3975 */
3382 mutex_lock(&cgroup_mutex); 3976 mutex_lock(&cgroup_mutex);
3383 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3977 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3384 struct cgroup_subsys *ss = subsys[i]; 3978 struct cgroup_subsys *ss = subsys[i];
3979 if (ss == NULL)
3980 continue;
3385 seq_printf(m, "%s\t%d\t%d\t%d\n", 3981 seq_printf(m, "%s\t%d\t%d\t%d\n",
3386 ss->name, ss->root->hierarchy_id, 3982 ss->name, ss->root->hierarchy_id,
3387 ss->root->number_of_cgroups, !ss->disabled); 3983 ss->root->number_of_cgroups, !ss->disabled);
@@ -3439,7 +4035,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
3439{ 4035{
3440 if (need_forkexit_callback) { 4036 if (need_forkexit_callback) {
3441 int i; 4037 int i;
3442 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4038 /*
4039 * forkexit callbacks are only supported for builtin
4040 * subsystems, and the builtin section of the subsys array is
4041 * immutable, so we don't need to lock the subsys array here.
4042 */
4043 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3443 struct cgroup_subsys *ss = subsys[i]; 4044 struct cgroup_subsys *ss = subsys[i];
3444 if (ss->fork) 4045 if (ss->fork)
3445 ss->fork(ss, child); 4046 ss->fork(ss, child);
@@ -3508,7 +4109,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
3508 struct css_set *cg; 4109 struct css_set *cg;
3509 4110
3510 if (run_callbacks && need_forkexit_callback) { 4111 if (run_callbacks && need_forkexit_callback) {
3511 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4112 /*
4113 * modular subsystems can't use callbacks, so no need to lock
4114 * the subsys array
4115 */
4116 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3512 struct cgroup_subsys *ss = subsys[i]; 4117 struct cgroup_subsys *ss = subsys[i];
3513 if (ss->exit) 4118 if (ss->exit)
3514 ss->exit(ss, tsk); 4119 ss->exit(ss, tsk);
@@ -3702,12 +4307,13 @@ static void check_for_release(struct cgroup *cgrp)
3702 } 4307 }
3703} 4308}
3704 4309
3705void __css_put(struct cgroup_subsys_state *css) 4310/* Caller must verify that the css is not for root cgroup */
4311void __css_put(struct cgroup_subsys_state *css, int count)
3706{ 4312{
3707 struct cgroup *cgrp = css->cgroup; 4313 struct cgroup *cgrp = css->cgroup;
3708 int val; 4314 int val;
3709 rcu_read_lock(); 4315 rcu_read_lock();
3710 val = atomic_dec_return(&css->refcnt); 4316 val = atomic_sub_return(count, &css->refcnt);
3711 if (val == 1) { 4317 if (val == 1) {
3712 if (notify_on_release(cgrp)) { 4318 if (notify_on_release(cgrp)) {
3713 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4319 set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3718,6 +4324,7 @@ void __css_put(struct cgroup_subsys_state *css)
3718 rcu_read_unlock(); 4324 rcu_read_unlock();
3719 WARN_ON_ONCE(val < 1); 4325 WARN_ON_ONCE(val < 1);
3720} 4326}
4327EXPORT_SYMBOL_GPL(__css_put);
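
The count parameter lets batched users drop several references at once; single-reference callers are expected to go through an inline wrapper in the cgroup header (not shown in this diff) that also enforces the "not the root cgroup" rule noted above. Roughly, and only as a sketch with an assumed name:

/* sketch of the caller-side pairing; the real helper lives in the cgroup
 * header. The root cgroup's css is never put, per the comment above. */
static inline void css_put_one(struct cgroup_subsys_state *css)
{
	if (css->cgroup->parent)	/* skip the root cgroup's css */
		__css_put(css, 1);
}
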
3721 4328
3722/* 4329/*
3723 * Notify userspace when a cgroup is released, by running the 4330 * Notify userspace when a cgroup is released, by running the
@@ -3799,8 +4406,11 @@ static int __init cgroup_disable(char *str)
3799 while ((token = strsep(&str, ",")) != NULL) { 4406 while ((token = strsep(&str, ",")) != NULL) {
3800 if (!*token) 4407 if (!*token)
3801 continue; 4408 continue;
3802 4409 /*
3803 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4410 * cgroup_disable, being at boot time, can't know about module
4411 * subsystems, so we don't worry about them.
4412 */
4413 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3804 struct cgroup_subsys *ss = subsys[i]; 4414 struct cgroup_subsys *ss = subsys[i];
3805 4415
3806 if (!strcmp(token, ss->name)) { 4416 if (!strcmp(token, ss->name)) {
@@ -3824,31 +4434,65 @@ __setup("cgroup_disable=", cgroup_disable);
3824 */ 4434 */
3825unsigned short css_id(struct cgroup_subsys_state *css) 4435unsigned short css_id(struct cgroup_subsys_state *css)
3826{ 4436{
3827 struct css_id *cssid = rcu_dereference(css->id); 4437 struct css_id *cssid;
4438
4439 /*
4440	 * This css_id() can return the correct value when someone holds a refcnt
4441	 * on this css, or under rcu_read_lock(). Once css->id is allocated,
4442 * it's unchanged until freed.
4443 */
4444 cssid = rcu_dereference_check(css->id,
4445 rcu_read_lock_held() || atomic_read(&css->refcnt));
3828 4446
3829 if (cssid) 4447 if (cssid)
3830 return cssid->id; 4448 return cssid->id;
3831 return 0; 4449 return 0;
3832} 4450}
4451EXPORT_SYMBOL_GPL(css_id);
3833 4452
3834unsigned short css_depth(struct cgroup_subsys_state *css) 4453unsigned short css_depth(struct cgroup_subsys_state *css)
3835{ 4454{
3836 struct css_id *cssid = rcu_dereference(css->id); 4455 struct css_id *cssid;
4456
4457 cssid = rcu_dereference_check(css->id,
4458 rcu_read_lock_held() || atomic_read(&css->refcnt));
3837 4459
3838 if (cssid) 4460 if (cssid)
3839 return cssid->depth; 4461 return cssid->depth;
3840 return 0; 4462 return 0;
3841} 4463}
4464EXPORT_SYMBOL_GPL(css_depth);
4465
4466/**
4467 * css_is_ancestor - test "root" css is an ancestor of "child"
4468 * @child: the css to be tested.
4469 * @root: the css supposed to be an ancestor of the child.
4470 *
4471 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
4472 * this function reads css->id, it uses rcu_dereference() and rcu_read_lock().
4473 * But, considering usual usage, the csses should be valid objects after the test.
4474 * Assuming that the caller will do some action on the child if this returns
4475 * true, the caller must hold a reference count on "child".
4476 * If "child" is a valid object and this returns true, "root" is valid, too.
4477 */
3842 4478
3843bool css_is_ancestor(struct cgroup_subsys_state *child, 4479bool css_is_ancestor(struct cgroup_subsys_state *child,
3844 const struct cgroup_subsys_state *root) 4480 const struct cgroup_subsys_state *root)
3845{ 4481{
3846 struct css_id *child_id = rcu_dereference(child->id); 4482 struct css_id *child_id;
3847 struct css_id *root_id = rcu_dereference(root->id); 4483 struct css_id *root_id;
4484 bool ret = true;
3848 4485
3849 if (!child_id || !root_id || (child_id->depth < root_id->depth)) 4486 rcu_read_lock();
3850 return false; 4487 child_id = rcu_dereference(child->id);
3851 return child_id->stack[root_id->depth] == root_id->id; 4488 root_id = rcu_dereference(root->id);
4489 if (!child_id
4490 || !root_id
4491 || (child_id->depth < root_id->depth)
4492 || (child_id->stack[root_id->depth] != root_id->id))
4493 ret = false;
4494 rcu_read_unlock();
4495 return ret;
3852} 4496}
3853 4497
3854static void __free_css_id_cb(struct rcu_head *head) 4498static void __free_css_id_cb(struct rcu_head *head)
@@ -3875,6 +4519,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
3875 spin_unlock(&ss->id_lock); 4519 spin_unlock(&ss->id_lock);
3876 call_rcu(&id->rcu_head, __free_css_id_cb); 4520 call_rcu(&id->rcu_head, __free_css_id_cb);
3877} 4521}
4522EXPORT_SYMBOL_GPL(free_css_id);
3878 4523
3879/* 4524/*
3880 * This is called by init or create(). Then, calls to this function are 4525 * This is called by init or create(). Then, calls to this function are
@@ -3924,15 +4569,14 @@ err_out:
3924 4569
3925} 4570}
3926 4571
3927static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) 4572static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4573 struct cgroup_subsys_state *rootcss)
3928{ 4574{
3929 struct css_id *newid; 4575 struct css_id *newid;
3930 struct cgroup_subsys_state *rootcss;
3931 4576
3932 spin_lock_init(&ss->id_lock); 4577 spin_lock_init(&ss->id_lock);
3933 idr_init(&ss->idr); 4578 idr_init(&ss->idr);
3934 4579
3935 rootcss = init_css_set.subsys[ss->subsys_id];
3936 newid = get_new_cssid(ss, 0); 4580 newid = get_new_cssid(ss, 0);
3937 if (IS_ERR(newid)) 4581 if (IS_ERR(newid))
3938 return PTR_ERR(newid); 4582 return PTR_ERR(newid);
@@ -3948,13 +4592,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
3948{ 4592{
3949 int subsys_id, i, depth = 0; 4593 int subsys_id, i, depth = 0;
3950 struct cgroup_subsys_state *parent_css, *child_css; 4594 struct cgroup_subsys_state *parent_css, *child_css;
3951 struct css_id *child_id, *parent_id = NULL; 4595 struct css_id *child_id, *parent_id;
3952 4596
3953 subsys_id = ss->subsys_id; 4597 subsys_id = ss->subsys_id;
3954 parent_css = parent->subsys[subsys_id]; 4598 parent_css = parent->subsys[subsys_id];
3955 child_css = child->subsys[subsys_id]; 4599 child_css = child->subsys[subsys_id];
3956 depth = css_depth(parent_css) + 1;
3957 parent_id = parent_css->id; 4600 parent_id = parent_css->id;
4601 depth = parent_id->depth + 1;
3958 4602
3959 child_id = get_new_cssid(ss, depth); 4603 child_id = get_new_cssid(ss, depth);
3960 if (IS_ERR(child_id)) 4604 if (IS_ERR(child_id))
@@ -3992,6 +4636,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
3992 4636
3993 return rcu_dereference(cssid->css); 4637 return rcu_dereference(cssid->css);
3994} 4638}
4639EXPORT_SYMBOL_GPL(css_lookup);
3995 4640
3996/** 4641/**
3997 * css_get_next - lookup next cgroup under specified hierarchy. 4642 * css_get_next - lookup next cgroup under specified hierarchy.
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 59e9ef6aab40..ce71ed53e88f 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -15,6 +15,7 @@
15 */ 15 */
16 16
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/slab.h>
18#include <linux/cgroup.h> 19#include <linux/cgroup.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
20#include <linux/uaccess.h> 21#include <linux/uaccess.h>
@@ -47,17 +48,20 @@ static inline struct freezer *task_freezer(struct task_struct *task)
47 struct freezer, css); 48 struct freezer, css);
48} 49}
49 50
50int cgroup_frozen(struct task_struct *task) 51int cgroup_freezing_or_frozen(struct task_struct *task)
51{ 52{
52 struct freezer *freezer; 53 struct freezer *freezer;
53 enum freezer_state state; 54 enum freezer_state state;
54 55
55 task_lock(task); 56 task_lock(task);
56 freezer = task_freezer(task); 57 freezer = task_freezer(task);
57 state = freezer->state; 58 if (!freezer->css.cgroup->parent)
59 state = CGROUP_THAWED; /* root cgroup can't be frozen */
60 else
61 state = freezer->state;
58 task_unlock(task); 62 task_unlock(task);
59 63
60 return state == CGROUP_FROZEN; 64 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
61} 65}
62 66
63/* 67/*
@@ -85,10 +89,10 @@ struct cgroup_subsys freezer_subsys;
85 89
86/* Locks taken and their ordering 90/* Locks taken and their ordering
87 * ------------------------------ 91 * ------------------------------
88 * css_set_lock
89 * cgroup_mutex (AKA cgroup_lock) 92 * cgroup_mutex (AKA cgroup_lock)
90 * task->alloc_lock (AKA task_lock)
91 * freezer->lock 93 * freezer->lock
94 * css_set_lock
95 * task->alloc_lock (AKA task_lock)
92 * task->sighand->siglock 96 * task->sighand->siglock
93 * 97 *
94 * cgroup code forces css_set_lock to be taken before task->alloc_lock 98 * cgroup code forces css_set_lock to be taken before task->alloc_lock
@@ -96,33 +100,38 @@ struct cgroup_subsys freezer_subsys;
96 * freezer_create(), freezer_destroy(): 100 * freezer_create(), freezer_destroy():
97 * cgroup_mutex [ by cgroup core ] 101 * cgroup_mutex [ by cgroup core ]
98 * 102 *
99 * can_attach(): 103 * freezer_can_attach():
100 * cgroup_mutex 104 * cgroup_mutex (held by caller of can_attach)
101 * 105 *
102 * cgroup_frozen(): 106 * cgroup_freezing_or_frozen():
103 * task->alloc_lock (to get task's cgroup) 107 * task->alloc_lock (to get task's cgroup)
104 * 108 *
105 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): 109 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
106 * task->alloc_lock (to get task's cgroup)
107 * freezer->lock 110 * freezer->lock
108 * sighand->siglock (if the cgroup is freezing) 111 * sighand->siglock (if the cgroup is freezing)
109 * 112 *
110 * freezer_read(): 113 * freezer_read():
111 * cgroup_mutex 114 * cgroup_mutex
112 * freezer->lock 115 * freezer->lock
116 * write_lock css_set_lock (cgroup iterator start)
117 * task->alloc_lock
113 * read_lock css_set_lock (cgroup iterator start) 118 * read_lock css_set_lock (cgroup iterator start)
114 * 119 *
115 * freezer_write() (freeze): 120 * freezer_write() (freeze):
116 * cgroup_mutex 121 * cgroup_mutex
117 * freezer->lock 122 * freezer->lock
123 * write_lock css_set_lock (cgroup iterator start)
124 * task->alloc_lock
118 * read_lock css_set_lock (cgroup iterator start) 125 * read_lock css_set_lock (cgroup iterator start)
119 * sighand->siglock 126 * sighand->siglock (fake signal delivery inside freeze_task())
120 * 127 *
121 * freezer_write() (unfreeze): 128 * freezer_write() (unfreeze):
122 * cgroup_mutex 129 * cgroup_mutex
123 * freezer->lock 130 * freezer->lock
131 * write_lock css_set_lock (cgroup iterator start)
132 * task->alloc_lock
124 * read_lock css_set_lock (cgroup iterator start) 133 * read_lock css_set_lock (cgroup iterator start)
125 * task->alloc_lock (to prevent races with freeze_task()) 134 * task->alloc_lock (inside thaw_process(), prevents race with refrigerator())
126 * sighand->siglock 135 * sighand->siglock
127 */ 136 */
128static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 137static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
@@ -201,9 +210,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
201 * No lock is needed, since the task isn't on tasklist yet, 210 * No lock is needed, since the task isn't on tasklist yet,
202 * so it can't be moved to another cgroup, which means the 211 * so it can't be moved to another cgroup, which means the
203 * freezer won't be removed and will be valid during this 212 * freezer won't be removed and will be valid during this
204 * function call. 213 * function call. Nevertheless, apply RCU read-side critical
214 * section to suppress RCU lockdep false positives.
205 */ 215 */
216 rcu_read_lock();
206 freezer = task_freezer(task); 217 freezer = task_freezer(task);
218 rcu_read_unlock();
207 219
208 /* 220 /*
209 * The root cgroup is non-freezable, so we can skip the 221 * The root cgroup is non-freezable, so we can skip the
diff --git a/kernel/compat.c b/kernel/compat.c
index f6c204f07ea6..5adab05a3172 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -25,6 +25,7 @@
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/times.h> 26#include <linux/times.h>
27#include <linux/ptrace.h> 27#include <linux/ptrace.h>
28#include <linux/gfp.h>
28 29
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30 31
@@ -494,29 +495,26 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
494{ 495{
495 int ret; 496 int ret;
496 cpumask_var_t mask; 497 cpumask_var_t mask;
497 unsigned long *k;
498 unsigned int min_length = cpumask_size();
499
500 if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
501 min_length = sizeof(compat_ulong_t);
502 498
503 if (len < min_length) 499 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
500 return -EINVAL;
501 if (len & (sizeof(compat_ulong_t)-1))
504 return -EINVAL; 502 return -EINVAL;
505 503
506 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 504 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
507 return -ENOMEM; 505 return -ENOMEM;
508 506
509 ret = sched_getaffinity(pid, mask); 507 ret = sched_getaffinity(pid, mask);
510 if (ret < 0) 508 if (ret == 0) {
511 goto out; 509 size_t retlen = min_t(size_t, len, cpumask_size());
512 510
513 k = cpumask_bits(mask); 511 if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8))
514 ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); 512 ret = -EFAULT;
515 if (ret == 0) 513 else
516 ret = min_length; 514 ret = retlen;
517 515 }
518out:
519 free_cpumask_var(mask); 516 free_cpumask_var(mask);
517
520 return ret; 518 return ret;
521} 519}
522 520
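With this change the compat path behaves like the native syscall: the user buffer must cover nr_cpu_ids bits, its length must be a multiple of sizeof(compat_ulong_t), and on success the number of bytes copied is returned (glibc hides that behind a 0/-1 return). A userspace sketch, not part of this patch, using the standard glibc CPU_ALLOC helpers to size the mask from the configured CPU count:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long ncpus = sysconf(_SC_NPROCESSORS_CONF);
	cpu_set_t *set = CPU_ALLOC(ncpus);
	size_t size = CPU_ALLOC_SIZE(ncpus);	/* large enough, word-aligned */

	if (set && sched_getaffinity(0, size, set) == 0)
		printf("runnable on %d of %ld cpus\n",
		       CPU_COUNT_S(size, set), ncpus);
	CPU_FREE(set);
	return 0;
}
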
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1c8ddd6ee940..97d1b426a4ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -14,18 +14,35 @@
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/gfp.h>
17 18
18#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
19/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 20/* Serializes the updates to cpu_online_mask, cpu_present_mask */
20static DEFINE_MUTEX(cpu_add_remove_lock); 21static DEFINE_MUTEX(cpu_add_remove_lock);
21 22
22static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); 23/*
24 * The following two API's must be used when attempting
25 * to serialize the updates to cpu_online_mask, cpu_present_mask.
26 */
27void cpu_maps_update_begin(void)
28{
29 mutex_lock(&cpu_add_remove_lock);
30}
31
32void cpu_maps_update_done(void)
33{
34 mutex_unlock(&cpu_add_remove_lock);
35}
36
37static RAW_NOTIFIER_HEAD(cpu_chain);
23 38
24/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. 39/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
25 * Should always be manipulated under cpu_add_remove_lock 40 * Should always be manipulated under cpu_add_remove_lock
26 */ 41 */
27static int cpu_hotplug_disabled; 42static int cpu_hotplug_disabled;
28 43
44#ifdef CONFIG_HOTPLUG_CPU
45
29static struct { 46static struct {
30 struct task_struct *active_writer; 47 struct task_struct *active_writer;
31 struct mutex lock; /* Synchronizes accesses to refcount, */ 48 struct mutex lock; /* Synchronizes accesses to refcount, */
@@ -40,8 +57,6 @@ static struct {
40 .refcount = 0, 57 .refcount = 0,
41}; 58};
42 59
43#ifdef CONFIG_HOTPLUG_CPU
44
45void get_online_cpus(void) 60void get_online_cpus(void)
46{ 61{
47 might_sleep(); 62 might_sleep();
@@ -66,22 +81,6 @@ void put_online_cpus(void)
66} 81}
67EXPORT_SYMBOL_GPL(put_online_cpus); 82EXPORT_SYMBOL_GPL(put_online_cpus);
68 83
69#endif /* CONFIG_HOTPLUG_CPU */
70
71/*
72 * The following two API's must be used when attempting
73 * to serialize the updates to cpu_online_mask, cpu_present_mask.
74 */
75void cpu_maps_update_begin(void)
76{
77 mutex_lock(&cpu_add_remove_lock);
78}
79
80void cpu_maps_update_done(void)
81{
82 mutex_unlock(&cpu_add_remove_lock);
83}
84
85/* 84/*
86 * This ensures that the hotplug operation can begin only when the 85 * This ensures that the hotplug operation can begin only when the
87 * refcount goes to zero. 86 * refcount goes to zero.
@@ -123,6 +122,12 @@ static void cpu_hotplug_done(void)
123 cpu_hotplug.active_writer = NULL; 122 cpu_hotplug.active_writer = NULL;
124 mutex_unlock(&cpu_hotplug.lock); 123 mutex_unlock(&cpu_hotplug.lock);
125} 124}
125
126#else /* #if CONFIG_HOTPLUG_CPU */
127static void cpu_hotplug_begin(void) {}
128static void cpu_hotplug_done(void) {}
129#endif /* #else #if CONFIG_HOTPLUG_CPU */
130
126/* Need to know about CPUs going up/down? */ 131/* Need to know about CPUs going up/down? */
127int __ref register_cpu_notifier(struct notifier_block *nb) 132int __ref register_cpu_notifier(struct notifier_block *nb)
128{ 133{
@@ -133,8 +138,29 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
133 return ret; 138 return ret;
134} 139}
135 140
141static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
142 int *nr_calls)
143{
144 int ret;
145
146 ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
147 nr_calls);
148
149 return notifier_to_errno(ret);
150}
151
152static int cpu_notify(unsigned long val, void *v)
153{
154 return __cpu_notify(val, v, -1, NULL);
155}
156
136#ifdef CONFIG_HOTPLUG_CPU 157#ifdef CONFIG_HOTPLUG_CPU
137 158
159static void cpu_notify_nofail(unsigned long val, void *v)
160{
161 BUG_ON(cpu_notify(val, v));
162}
163
138EXPORT_SYMBOL(register_cpu_notifier); 164EXPORT_SYMBOL(register_cpu_notifier);
139 165
140void __ref unregister_cpu_notifier(struct notifier_block *nb) 166void __ref unregister_cpu_notifier(struct notifier_block *nb)
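
cpu_notify(), __cpu_notify() and cpu_notify_nofail() are file-local wrappers around the notifier chain; external code still registers callbacks with register_cpu_notifier(). As a reminder of what such a callback looks like (a generic sketch, not code from this patch; the names are hypothetical):

/* sketch of a typical CPU hotplug notifier registered via
 * register_cpu_notifier(); callback and variable names are made up */
#include <linux/cpu.h>
#include <linux/notifier.h>

static int __cpuinit my_cpu_callback(struct notifier_block *nb,
				     unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		/* allocate per-cpu state for @cpu; may fail */
		break;
	case CPU_ONLINE:
		/* cpu is running; start using it */
		break;
	case CPU_DOWN_PREPARE:
		/* cpu is about to go away; may veto by returning NOTIFY_BAD */
		break;
	case CPU_DEAD:
		/* cpu is gone; free per-cpu state */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_cpu_nb __cpuinitdata = {
	.notifier_call = my_cpu_callback,
};

/* somewhere in init code: register_cpu_notifier(&my_cpu_nb); */
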
@@ -151,18 +177,19 @@ static inline void check_for_tasks(int cpu)
151 177
152 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
153 for_each_process(p) { 179 for_each_process(p) {
154 if (task_cpu(p) == cpu && 180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
155 (!cputime_eq(p->utime, cputime_zero) || 181 (!cputime_eq(p->utime, cputime_zero) ||
156 !cputime_eq(p->stime, cputime_zero))) 182 !cputime_eq(p->stime, cputime_zero)))
157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 183 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
158 (state = %ld, flags = %x) \n", 184 "(state = %ld, flags = %x)\n",
159 p->comm, task_pid_nr(p), cpu, 185 p->comm, task_pid_nr(p), cpu,
160 p->state, p->flags); 186 p->state, p->flags);
161 } 187 }
162 write_unlock_irq(&tasklist_lock); 188 write_unlock_irq(&tasklist_lock);
163} 189}
164 190
165struct take_cpu_down_param { 191struct take_cpu_down_param {
192 struct task_struct *caller;
166 unsigned long mod; 193 unsigned long mod;
167 void *hcpu; 194 void *hcpu;
168}; 195};
@@ -171,6 +198,7 @@ struct take_cpu_down_param {
171static int __ref take_cpu_down(void *_param) 198static int __ref take_cpu_down(void *_param)
172{ 199{
173 struct take_cpu_down_param *param = _param; 200 struct take_cpu_down_param *param = _param;
201 unsigned int cpu = (unsigned long)param->hcpu;
174 int err; 202 int err;
175 203
176 /* Ensure this CPU doesn't handle any more interrupts. */ 204 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -178,9 +206,10 @@ static int __ref take_cpu_down(void *_param)
178 if (err < 0) 206 if (err < 0)
179 return err; 207 return err;
180 208
181 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, 209 cpu_notify(CPU_DYING | param->mod, param->hcpu);
182 param->hcpu);
183 210
211 if (task_cpu(param->caller) == cpu)
212 move_task_off_dead_cpu(cpu, param->caller);
184 /* Force idle task to run as soon as we yield: it should 213 /* Force idle task to run as soon as we yield: it should
185 immediately notice cpu is offline and die quickly. */ 214 immediately notice cpu is offline and die quickly. */
186 sched_idle_next(); 215 sched_idle_next();
@@ -191,10 +220,10 @@ static int __ref take_cpu_down(void *_param)
191static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 220static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
192{ 221{
193 int err, nr_calls = 0; 222 int err, nr_calls = 0;
194 cpumask_var_t old_allowed;
195 void *hcpu = (void *)(long)cpu; 223 void *hcpu = (void *)(long)cpu;
196 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 224 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
197 struct take_cpu_down_param tcd_param = { 225 struct take_cpu_down_param tcd_param = {
226 .caller = current,
198 .mod = mod, 227 .mod = mod,
199 .hcpu = hcpu, 228 .hcpu = hcpu,
200 }; 229 };
@@ -205,38 +234,26 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
205 if (!cpu_online(cpu)) 234 if (!cpu_online(cpu))
206 return -EINVAL; 235 return -EINVAL;
207 236
208 if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
209 return -ENOMEM;
210
211 cpu_hotplug_begin(); 237 cpu_hotplug_begin();
212 set_cpu_active(cpu, false); 238 set_cpu_active(cpu, false);
213 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 239 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
214 hcpu, -1, &nr_calls); 240 if (err) {
215 if (err == NOTIFY_BAD) {
216 set_cpu_active(cpu, true); 241 set_cpu_active(cpu, true);
217 242
218 nr_calls--; 243 nr_calls--;
219 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 244 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
220 hcpu, nr_calls, NULL);
221 printk("%s: attempt to take down CPU %u failed\n", 245 printk("%s: attempt to take down CPU %u failed\n",
222 __func__, cpu); 246 __func__, cpu);
223 err = -EINVAL;
224 goto out_release; 247 goto out_release;
225 } 248 }
226 249
227 /* Ensure that we are not runnable on dying cpu */
228 cpumask_copy(old_allowed, &current->cpus_allowed);
229 set_cpus_allowed_ptr(current, cpu_active_mask);
230
231 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 250 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
232 if (err) { 251 if (err) {
233 set_cpu_active(cpu, true); 252 set_cpu_active(cpu, true);
234 /* CPU didn't die: tell everyone. Can't complain. */ 253 /* CPU didn't die: tell everyone. Can't complain. */
235 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 254 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
236 hcpu) == NOTIFY_BAD)
237 BUG();
238 255
239 goto out_allowed; 256 goto out_release;
240 } 257 }
241 BUG_ON(cpu_online(cpu)); 258 BUG_ON(cpu_online(cpu));
242 259
@@ -248,22 +265,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
248 __cpu_die(cpu); 265 __cpu_die(cpu);
249 266
250 /* CPU is completely dead: tell everyone. Too late to complain. */ 267 /* CPU is completely dead: tell everyone. Too late to complain. */
251 if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod, 268 cpu_notify_nofail(CPU_DEAD | mod, hcpu);
252 hcpu) == NOTIFY_BAD)
253 BUG();
254 269
255 check_for_tasks(cpu); 270 check_for_tasks(cpu);
256 271
257out_allowed:
258 set_cpus_allowed_ptr(current, old_allowed);
259out_release: 272out_release:
260 cpu_hotplug_done(); 273 cpu_hotplug_done();
261 if (!err) { 274 if (!err)
262 if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod, 275 cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
263 hcpu) == NOTIFY_BAD)
264 BUG();
265 }
266 free_cpumask_var(old_allowed);
267 return err; 276 return err;
268} 277}
269 278
@@ -271,9 +280,6 @@ int __ref cpu_down(unsigned int cpu)
271{ 280{
272 int err; 281 int err;
273 282
274 err = stop_machine_create();
275 if (err)
276 return err;
277 cpu_maps_update_begin(); 283 cpu_maps_update_begin();
278 284
279 if (cpu_hotplug_disabled) { 285 if (cpu_hotplug_disabled) {
@@ -285,7 +291,6 @@ int __ref cpu_down(unsigned int cpu)
285 291
286out: 292out:
287 cpu_maps_update_done(); 293 cpu_maps_update_done();
288 stop_machine_destroy();
289 return err; 294 return err;
290} 295}
291EXPORT_SYMBOL(cpu_down); 296EXPORT_SYMBOL(cpu_down);
@@ -302,13 +307,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
302 return -EINVAL; 307 return -EINVAL;
303 308
304 cpu_hotplug_begin(); 309 cpu_hotplug_begin();
305 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, 310 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
306 -1, &nr_calls); 311 if (ret) {
307 if (ret == NOTIFY_BAD) {
308 nr_calls--; 312 nr_calls--;
309 printk("%s: attempt to bring up CPU %u failed\n", 313 printk("%s: attempt to bring up CPU %u failed\n",
310 __func__, cpu); 314 __func__, cpu);
311 ret = -EINVAL;
312 goto out_notify; 315 goto out_notify;
313 } 316 }
314 317
@@ -321,12 +324,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
321 set_cpu_active(cpu, true); 324 set_cpu_active(cpu, true);
322 325
323 /* Now call notifier in preparation. */ 326 /* Now call notifier in preparation. */
324 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); 327 cpu_notify(CPU_ONLINE | mod, hcpu);
325 328
326out_notify: 329out_notify:
327 if (ret != 0) 330 if (ret != 0)
328 __raw_notifier_call_chain(&cpu_chain, 331 __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
329 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
330 cpu_hotplug_done(); 332 cpu_hotplug_done();
331 333
332 return ret; 334 return ret;
@@ -335,16 +337,44 @@ out_notify:
335int __cpuinit cpu_up(unsigned int cpu) 337int __cpuinit cpu_up(unsigned int cpu)
336{ 338{
337 int err = 0; 339 int err = 0;
340
341#ifdef CONFIG_MEMORY_HOTPLUG
342 int nid;
343 pg_data_t *pgdat;
344#endif
345
338 if (!cpu_possible(cpu)) { 346 if (!cpu_possible(cpu)) {
339 printk(KERN_ERR "can't online cpu %d because it is not " 347 printk(KERN_ERR "can't online cpu %d because it is not "
340 "configured as may-hotadd at boot time\n", cpu); 348 "configured as may-hotadd at boot time\n", cpu);
341#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 349#if defined(CONFIG_IA64)
342 printk(KERN_ERR "please check additional_cpus= boot " 350 printk(KERN_ERR "please check additional_cpus= boot "
343 "parameter\n"); 351 "parameter\n");
344#endif 352#endif
345 return -EINVAL; 353 return -EINVAL;
346 } 354 }
347 355
356#ifdef CONFIG_MEMORY_HOTPLUG
357 nid = cpu_to_node(cpu);
358 if (!node_online(nid)) {
359 err = mem_online_node(nid);
360 if (err)
361 return err;
362 }
363
364 pgdat = NODE_DATA(nid);
365 if (!pgdat) {
366 printk(KERN_ERR
367 "Can't online cpu %d due to NULL pgdat\n", cpu);
368 return -ENOMEM;
369 }
370
371 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
372 mutex_lock(&zonelists_mutex);
373 build_all_zonelists(NULL);
374 mutex_unlock(&zonelists_mutex);
375 }
376#endif
377
348 cpu_maps_update_begin(); 378 cpu_maps_update_begin();
349 379
350 if (cpu_hotplug_disabled) { 380 if (cpu_hotplug_disabled) {
@@ -364,11 +394,8 @@ static cpumask_var_t frozen_cpus;
364 394
365int disable_nonboot_cpus(void) 395int disable_nonboot_cpus(void)
366{ 396{
367 int cpu, first_cpu, error; 397 int cpu, first_cpu, error = 0;
368 398
369 error = stop_machine_create();
370 if (error)
371 return error;
372 cpu_maps_update_begin(); 399 cpu_maps_update_begin();
373 first_cpu = cpumask_first(cpu_online_mask); 400 first_cpu = cpumask_first(cpu_online_mask);
374 /* 401 /*
@@ -399,7 +426,6 @@ int disable_nonboot_cpus(void)
399 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 426 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
400 } 427 }
401 cpu_maps_update_done(); 428 cpu_maps_update_done();
402 stop_machine_destroy();
403 return error; 429 return error;
404} 430}
405 431
@@ -466,7 +492,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
466 if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) 492 if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
467 val = CPU_STARTING_FROZEN; 493 val = CPU_STARTING_FROZEN;
468#endif /* CONFIG_PM_SLEEP_SMP */ 494#endif /* CONFIG_PM_SLEEP_SMP */
469 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); 495 cpu_notify(val, (void *)(long)cpu);
470} 496}
471 497
472#endif /* CONFIG_SMP */ 498#endif /* CONFIG_SMP */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ba401fab459f..02b9611eadde 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 * call to guarantee_online_mems(), as we know no one is changing 920 * call to guarantee_online_mems(), as we know no one is changing
921 * our task's cpuset. 921 * our task's cpuset.
922 * 922 *
923 * Hold callback_mutex around the two modifications of our tasks
924 * mems_allowed to synchronize with cpuset_mems_allowed().
925 *
926 * While the mm_struct we are migrating is typically from some 923 * While the mm_struct we are migrating is typically from some
927 * other task, the task_struct mems_allowed that we are hacking 924 * other task, the task_struct mems_allowed that we are hacking
928 * is for our current task, which must allocate new pages for that 925 * is for our current task, which must allocate new pages for that
@@ -949,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
949 * In order to avoid seeing no nodes if the old and new nodes are disjoint, 946 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
950 * we structure updates as setting all new allowed nodes, then clearing newly 947 * we structure updates as setting all new allowed nodes, then clearing newly
951 * disallowed ones. 948 * disallowed ones.
952 *
953 * Called with task's alloc_lock held
954 */ 949 */
955static void cpuset_change_task_nodemask(struct task_struct *tsk, 950static void cpuset_change_task_nodemask(struct task_struct *tsk,
956 nodemask_t *newmems) 951 nodemask_t *newmems)
957{ 952{
953repeat:
954 /*
955 * Allow tasks that have access to memory reserves because they have
956 * been OOM killed to get memory anywhere.
957 */
958 if (unlikely(test_thread_flag(TIF_MEMDIE)))
959 return;
960 if (current->flags & PF_EXITING) /* Let dying task have memory */
961 return;
962
963 task_lock(tsk);
958 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 964 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
959 mpol_rebind_task(tsk, &tsk->mems_allowed); 965 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
960 mpol_rebind_task(tsk, newmems); 966
967
968 /*
969	 * Ensure that ->mems_allowed_change_disable is checked only after all the
970	 * new allowed nodes have been set.
971	 *
972	 * The read-side task may then see a nodemask containing both the new and
973	 * the old allowed nodes; if it allocates a page while the cpuset is clearing
974	 * the newly disallowed ones, it can still use the newly allowed bits.
975	 *
976	 * If the new allowed nodes were set only after this check, setting them and
977	 * clearing the newly disallowed ones could happen back to back, and the
978	 * read-side task might find no node to allocate a page from.
979 */
980 smp_mb();
981
982 /*
983	 * Memory allocation is very fast, so we need not sleep while waiting
984	 * for the read side to finish.
985 */
986 while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
987 task_unlock(tsk);
988 if (!task_curr(tsk))
989 yield();
990 goto repeat;
991 }
992
993 /*
994	 * Ensure that ->mems_allowed_change_disable is checked before clearing the
995	 * newly disallowed nodes.
996	 *
997	 * If the newly disallowed bits were cleared before this check, the
998	 * read-side task might find no node to allocate a page from.
999 */
1000 smp_mb();
1001
1002 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
961 tsk->mems_allowed = *newmems; 1003 tsk->mems_allowed = *newmems;
1004 task_unlock(tsk);
962} 1005}
963 1006
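The retry loop and memory barriers above pair with the per-task counter tsk->mems_allowed_change_disable, which page-allocation paths bump while they read mems_allowed. The matching read-side helpers live outside this file and are not shown in this hunk; conceptually they bracket the allocation roughly as below (a sketch only, helper names are assumptions):

/* conceptual read-side sketch (not from this hunk): while the counter is
 * non-zero, cpuset_change_task_nodemask() above retries before clearing the
 * newly disallowed nodes, so the reader always sees at least one valid node */
static inline void mems_allowed_read_begin(struct task_struct *tsk)
{
	tsk->mems_allowed_change_disable++;
	smp_mb();	/* pairs with the first smp_mb() in the writer */
}

static inline void mems_allowed_read_end(struct task_struct *tsk)
{
	smp_mb();	/* pairs with the second smp_mb() in the writer */
	tsk->mems_allowed_change_disable--;
}
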
964/* 1007/*
@@ -973,14 +1016,17 @@ static void cpuset_change_nodemask(struct task_struct *p,
973 struct cpuset *cs; 1016 struct cpuset *cs;
974 int migrate; 1017 int migrate;
975 const nodemask_t *oldmem = scan->data; 1018 const nodemask_t *oldmem = scan->data;
976 nodemask_t newmems; 1019 NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
1020
1021 if (!newmems)
1022 return;
977 1023
978 cs = cgroup_cs(scan->cg); 1024 cs = cgroup_cs(scan->cg);
979 guarantee_online_mems(cs, &newmems); 1025 guarantee_online_mems(cs, newmems);
980 1026
981 task_lock(p); 1027 cpuset_change_task_nodemask(p, newmems);
982 cpuset_change_task_nodemask(p, &newmems); 1028
983 task_unlock(p); 1029 NODEMASK_FREE(newmems);
984 1030
985 mm = get_task_mm(p); 1031 mm = get_task_mm(p);
986 if (!mm) 1032 if (!mm)
@@ -1051,16 +1097,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1051static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 1097static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1052 const char *buf) 1098 const char *buf)
1053{ 1099{
1054 nodemask_t oldmem; 1100 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1055 int retval; 1101 int retval;
1056 struct ptr_heap heap; 1102 struct ptr_heap heap;
1057 1103
1104 if (!oldmem)
1105 return -ENOMEM;
1106
1058 /* 1107 /*
1059 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1108 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
1060 * it's read-only 1109 * it's read-only
1061 */ 1110 */
1062 if (cs == &top_cpuset) 1111 if (cs == &top_cpuset) {
1063 return -EACCES; 1112 retval = -EACCES;
1113 goto done;
1114 }
1064 1115
1065 /* 1116 /*
1066 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 1117 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
@@ -1076,11 +1127,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1076 goto done; 1127 goto done;
1077 1128
1078 if (!nodes_subset(trialcs->mems_allowed, 1129 if (!nodes_subset(trialcs->mems_allowed,
1079 node_states[N_HIGH_MEMORY])) 1130 node_states[N_HIGH_MEMORY])) {
1080 return -EINVAL; 1131 retval = -EINVAL;
1132 goto done;
1133 }
1081 } 1134 }
1082 oldmem = cs->mems_allowed; 1135 *oldmem = cs->mems_allowed;
1083 if (nodes_equal(oldmem, trialcs->mems_allowed)) { 1136 if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
1084 retval = 0; /* Too easy - nothing to do */ 1137 retval = 0; /* Too easy - nothing to do */
1085 goto done; 1138 goto done;
1086 } 1139 }
@@ -1096,10 +1149,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1096 cs->mems_allowed = trialcs->mems_allowed; 1149 cs->mems_allowed = trialcs->mems_allowed;
1097 mutex_unlock(&callback_mutex); 1150 mutex_unlock(&callback_mutex);
1098 1151
1099 update_tasks_nodemask(cs, &oldmem, &heap); 1152 update_tasks_nodemask(cs, oldmem, &heap);
1100 1153
1101 heap_free(&heap); 1154 heap_free(&heap);
1102done: 1155done:
1156 NODEMASK_FREE(oldmem);
1103 return retval; 1157 return retval;
1104} 1158}
1105 1159
@@ -1373,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1373 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1427 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1374 WARN_ON_ONCE(err); 1428 WARN_ON_ONCE(err);
1375 1429
1376 task_lock(tsk);
1377 cpuset_change_task_nodemask(tsk, to); 1430 cpuset_change_task_nodemask(tsk, to);
1378 task_unlock(tsk);
1379 cpuset_update_task_spread_flag(cs, tsk); 1431 cpuset_update_task_spread_flag(cs, tsk);
1380 1432
1381} 1433}
@@ -1384,40 +1436,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1384 struct cgroup *oldcont, struct task_struct *tsk, 1436 struct cgroup *oldcont, struct task_struct *tsk,
1385 bool threadgroup) 1437 bool threadgroup)
1386{ 1438{
1387 nodemask_t from, to;
1388 struct mm_struct *mm; 1439 struct mm_struct *mm;
1389 struct cpuset *cs = cgroup_cs(cont); 1440 struct cpuset *cs = cgroup_cs(cont);
1390 struct cpuset *oldcs = cgroup_cs(oldcont); 1441 struct cpuset *oldcs = cgroup_cs(oldcont);
1442 NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
1443 NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
1444
1445 if (from == NULL || to == NULL)
1446 goto alloc_fail;
1391 1447
1392 if (cs == &top_cpuset) { 1448 if (cs == &top_cpuset) {
1393 cpumask_copy(cpus_attach, cpu_possible_mask); 1449 cpumask_copy(cpus_attach, cpu_possible_mask);
1394 to = node_possible_map;
1395 } else { 1450 } else {
1396 guarantee_online_cpus(cs, cpus_attach); 1451 guarantee_online_cpus(cs, cpus_attach);
1397 guarantee_online_mems(cs, &to);
1398 } 1452 }
1453 guarantee_online_mems(cs, to);
1399 1454
1400 /* do per-task migration stuff possibly for each in the threadgroup */ 1455 /* do per-task migration stuff possibly for each in the threadgroup */
1401 cpuset_attach_task(tsk, &to, cs); 1456 cpuset_attach_task(tsk, to, cs);
1402 if (threadgroup) { 1457 if (threadgroup) {
1403 struct task_struct *c; 1458 struct task_struct *c;
1404 rcu_read_lock(); 1459 rcu_read_lock();
1405 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1460 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1406 cpuset_attach_task(c, &to, cs); 1461 cpuset_attach_task(c, to, cs);
1407 } 1462 }
1408 rcu_read_unlock(); 1463 rcu_read_unlock();
1409 } 1464 }
1410 1465
1411 /* change mm; only needs to be done once even if threadgroup */ 1466 /* change mm; only needs to be done once even if threadgroup */
1412 from = oldcs->mems_allowed; 1467 *from = oldcs->mems_allowed;
1413 to = cs->mems_allowed; 1468 *to = cs->mems_allowed;
1414 mm = get_task_mm(tsk); 1469 mm = get_task_mm(tsk);
1415 if (mm) { 1470 if (mm) {
1416 mpol_rebind_mm(mm, &to); 1471 mpol_rebind_mm(mm, to);
1417 if (is_memory_migrate(cs)) 1472 if (is_memory_migrate(cs))
1418 cpuset_migrate_mm(mm, &from, &to); 1473 cpuset_migrate_mm(mm, from, to);
1419 mmput(mm); 1474 mmput(mm);
1420 } 1475 }
1476
1477alloc_fail:
1478 NODEMASK_FREE(from);
1479 NODEMASK_FREE(to);
1421} 1480}
1422 1481
1423/* The various types of files and directories in a cpuset file system */ 1482/* The various types of files and directories in a cpuset file system */
@@ -1562,13 +1621,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1562 1621
1563static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1622static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1564{ 1623{
1565 nodemask_t mask; 1624 NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
1625 int retval;
1626
1627 if (mask == NULL)
1628 return -ENOMEM;
1566 1629
1567 mutex_lock(&callback_mutex); 1630 mutex_lock(&callback_mutex);
1568 mask = cs->mems_allowed; 1631 *mask = cs->mems_allowed;
1569 mutex_unlock(&callback_mutex); 1632 mutex_unlock(&callback_mutex);
1570 1633
1571 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1634 retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
1635
1636 NODEMASK_FREE(mask);
1637
1638 return retval;
1572} 1639}
1573 1640
1574static ssize_t cpuset_common_file_read(struct cgroup *cont, 1641static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1997,7 +2064,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1997 struct cpuset *cp; /* scans cpusets being updated */ 2064 struct cpuset *cp; /* scans cpusets being updated */
1998 struct cpuset *child; /* scans child cpusets of cp */ 2065 struct cpuset *child; /* scans child cpusets of cp */
1999 struct cgroup *cont; 2066 struct cgroup *cont;
2000 nodemask_t oldmems; 2067 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2068
2069 if (oldmems == NULL)
2070 return;
2001 2071
2002 list_add_tail((struct list_head *)&root->stack_list, &queue); 2072 list_add_tail((struct list_head *)&root->stack_list, &queue);
2003 2073
@@ -2014,7 +2084,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2084 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2015 continue; 2085 continue;
2016 2086
2017 oldmems = cp->mems_allowed; 2087 *oldmems = cp->mems_allowed;
2018 2088
2019 /* Remove offline cpus and mems from this cpuset. */ 2089 /* Remove offline cpus and mems from this cpuset. */
2020 mutex_lock(&callback_mutex); 2090 mutex_lock(&callback_mutex);
@@ -2030,9 +2100,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2030 remove_tasks_in_empty_cpuset(cp); 2100 remove_tasks_in_empty_cpuset(cp);
2031 else { 2101 else {
2032 update_tasks_cpumask(cp, NULL); 2102 update_tasks_cpumask(cp, NULL);
2033 update_tasks_nodemask(cp, &oldmems, NULL); 2103 update_tasks_nodemask(cp, oldmems, NULL);
2034 } 2104 }
2035 } 2105 }
2106 NODEMASK_FREE(oldmems);
2036} 2107}
2037 2108
2038/* 2109/*
@@ -2090,20 +2161,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2090static int cpuset_track_online_nodes(struct notifier_block *self, 2161static int cpuset_track_online_nodes(struct notifier_block *self,
2091 unsigned long action, void *arg) 2162 unsigned long action, void *arg)
2092{ 2163{
2164 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2165
2166 if (oldmems == NULL)
2167 return NOTIFY_DONE;
2168
2093 cgroup_lock(); 2169 cgroup_lock();
2094 switch (action) { 2170 switch (action) {
2095 case MEM_ONLINE: 2171 case MEM_ONLINE:
2096 case MEM_OFFLINE: 2172 *oldmems = top_cpuset.mems_allowed;
2097 mutex_lock(&callback_mutex); 2173 mutex_lock(&callback_mutex);
2098 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2174 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2099 mutex_unlock(&callback_mutex); 2175 mutex_unlock(&callback_mutex);
2100 if (action == MEM_OFFLINE) 2176 update_tasks_nodemask(&top_cpuset, oldmems, NULL);
2101 scan_for_empty_cpusets(&top_cpuset); 2177 break;
2178 case MEM_OFFLINE:
2179 /*
2180 * needn't update top_cpuset.mems_allowed explicitly because
2181 * scan_for_empty_cpusets() will update it.
2182 */
2183 scan_for_empty_cpusets(&top_cpuset);
2102 break; 2184 break;
2103 default: 2185 default:
2104 break; 2186 break;
2105 } 2187 }
2106 cgroup_unlock(); 2188 cgroup_unlock();
2189
2190 NODEMASK_FREE(oldmems);
2107 return NOTIFY_OK; 2191 return NOTIFY_OK;
2108} 2192}
2109#endif 2193#endif
@@ -2140,19 +2224,52 @@ void __init cpuset_init_smp(void)
2140void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2224void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2141{ 2225{
2142 mutex_lock(&callback_mutex); 2226 mutex_lock(&callback_mutex);
2143 cpuset_cpus_allowed_locked(tsk, pmask); 2227 task_lock(tsk);
2228 guarantee_online_cpus(task_cs(tsk), pmask);
2229 task_unlock(tsk);
2144 mutex_unlock(&callback_mutex); 2230 mutex_unlock(&callback_mutex);
2145} 2231}
2146 2232
2147/** 2233int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2148 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
2149 * Must be called with callback_mutex held.
2150 **/
2151void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
2152{ 2234{
2153 task_lock(tsk); 2235 const struct cpuset *cs;
2154 guarantee_online_cpus(task_cs(tsk), pmask); 2236 int cpu;
2155 task_unlock(tsk); 2237
2238 rcu_read_lock();
2239 cs = task_cs(tsk);
2240 if (cs)
2241 cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
2242 rcu_read_unlock();
2243
2244 /*
2245 * We own tsk->cpus_allowed, nobody can change it under us.
2246 *
2247 * But we used cs && cs->cpus_allowed lockless and thus can
2248 * race with cgroup_attach_task() or update_cpumask() and get
2249 * the wrong tsk->cpus_allowed. However, both cases imply the
2250 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
2251 * which takes task_rq_lock().
2252 *
2253 * If we are called after it dropped the lock we must see all
2254 * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
2255 * set any mask even if it is not right from task_cs() pov,
2256 * the pending set_cpus_allowed_ptr() will fix things.
2257 */
2258
2259 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2260 if (cpu >= nr_cpu_ids) {
2261 /*
2262 * Either tsk->cpus_allowed is wrong (see above) or it
2263 * is actually empty. The latter case is only possible
2264 * if we are racing with remove_tasks_in_empty_cpuset().
2265 * Like above we can temporarily set any mask and rely on
2266 * set_cpus_allowed_ptr() as synchronization point.
2267 */
2268 cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
2269 cpu = cpumask_any(cpu_active_mask);
2270 }
2271
2272 return cpu;
2156} 2273}
2157 2274
2158void cpuset_init_current_mems_allowed(void) 2275void cpuset_init_current_mems_allowed(void)
@@ -2341,22 +2458,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2341} 2458}
2342 2459
2343/** 2460/**
2344 * cpuset_lock - lock out any changes to cpuset structures
2345 *
2346 * The out of memory (oom) code needs to mutex_lock cpusets
2347 * from being changed while it scans the tasklist looking for a
2348 * task in an overlapping cpuset. Expose callback_mutex via this
2349 * cpuset_lock() routine, so the oom code can lock it, before
2350 * locking the task list. The tasklist_lock is a spinlock, so
2351 * must be taken inside callback_mutex.
2352 */
2353
2354void cpuset_lock(void)
2355{
2356 mutex_lock(&callback_mutex);
2357}
2358
2359/**
2360 * cpuset_unlock - release lock on cpuset changes 2461 * cpuset_unlock - release lock on cpuset changes
2361 * 2462 *
2362 * Undo the lock taken in a previous cpuset_lock() call. 2463 * Undo the lock taken in a previous cpuset_lock() call.
@@ -2368,7 +2469,8 @@ void cpuset_unlock(void)
2368} 2469}
2369 2470
2370/** 2471/**
2371 * cpuset_mem_spread_node() - On which node to begin search for a page 2472 * cpuset_mem_spread_node() - On which node to begin search for a file page
2473 * cpuset_slab_spread_node() - On which node to begin search for a slab page
2372 * 2474 *
2373 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for 2475 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
2374 * tasks in a cpuset with is_spread_page or is_spread_slab set), 2476 * tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -2393,16 +2495,27 @@ void cpuset_unlock(void)
2393 * See kmem_cache_alloc_node(). 2495 * See kmem_cache_alloc_node().
2394 */ 2496 */
2395 2497
2396int cpuset_mem_spread_node(void) 2498static int cpuset_spread_node(int *rotor)
2397{ 2499{
2398 int node; 2500 int node;
2399 2501
2400 node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed); 2502 node = next_node(*rotor, current->mems_allowed);
2401 if (node == MAX_NUMNODES) 2503 if (node == MAX_NUMNODES)
2402 node = first_node(current->mems_allowed); 2504 node = first_node(current->mems_allowed);
2403 current->cpuset_mem_spread_rotor = node; 2505 *rotor = node;
2404 return node; 2506 return node;
2405} 2507}
2508
2509int cpuset_mem_spread_node(void)
2510{
2511 return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
2512}
2513
2514int cpuset_slab_spread_node(void)
2515{
2516 return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
2517}
2518
2406EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); 2519EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
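A short worked example (assumed node values, not from the patch) of how the shared rotor above cycles through the allowed nodes:

	/*
	 * If current->mems_allowed = { 0, 2 } and the rotor currently holds 0,
	 * next_node() returns 2; on the following call next_node(2, ...) is
	 * MAX_NUMNODES, so the rotor wraps around to first_node() = 0 again.
	 */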
2407 2520
2408/** 2521/**
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
deleted file mode 100644
index 2dc4fc2d0bf1..000000000000
--- a/kernel/cred-internals.h
+++ /dev/null
@@ -1,21 +0,0 @@
1/* Internal credentials stuff
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12/*
13 * user.c
14 */
15static inline void sched_switch_user(struct task_struct *p)
16{
17#ifdef CONFIG_USER_SCHED
18 sched_move_task(p);
19#endif /* CONFIG_USER_SCHED */
20}
21
diff --git a/kernel/cred.c b/kernel/cred.c
index dd76cfe5f5b0..60bc8b1e32e6 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -10,13 +10,13 @@
10 */ 10 */
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/cred.h> 12#include <linux/cred.h>
13#include <linux/slab.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/key.h> 15#include <linux/key.h>
15#include <linux/keyctl.h> 16#include <linux/keyctl.h>
16#include <linux/init_task.h> 17#include <linux/init_task.h>
17#include <linux/security.h> 18#include <linux/security.h>
18#include <linux/cn_proc.h> 19#include <linux/cn_proc.h>
19#include "cred-internals.h"
20 20
21#if 0 21#if 0
22#define kdebug(FMT, ...) \ 22#define kdebug(FMT, ...) \
@@ -209,6 +209,31 @@ void exit_creds(struct task_struct *tsk)
209 } 209 }
210} 210}
211 211
212/**
213 * get_task_cred - Get another task's objective credentials
214 * @task: The task to query
215 *
216 * Get the objective credentials of a task, pinning them so that they can't go
217 * away. Accessing a task's credentials directly is not permitted.
218 *
219 * The caller must also make sure task doesn't get deleted, either by holding a
220 * ref on task or by holding tasklist_lock to prevent it from being unlinked.
221 */
222const struct cred *get_task_cred(struct task_struct *task)
223{
224 const struct cred *cred;
225
226 rcu_read_lock();
227
228 do {
229 cred = __task_cred((task));
230 BUG_ON(!cred);
231 } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
232
233 rcu_read_unlock();
234 return cred;
235}
236
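An illustrative usage sketch (not part of this patch): a caller pins the credentials with get_task_cred() and drops them with put_cred() when done. The helper name and the ->uid access below are only an example.

	/* Hypothetical example, assuming <linux/cred.h> and <linux/sched.h>. */
	static uid_t example_task_uid(struct task_struct *task)
	{
		const struct cred *cred = get_task_cred(task);	/* takes a reference */
		uid_t uid = cred->uid;				/* safe while the ref is held */

		put_cred(cred);					/* release the reference */
		return uid;
	}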
212/* 237/*
213 * Allocate blank credentials, such that the credentials can be filled in at a 238 * Allocate blank credentials, such that the credentials can be filled in at a
214 * later date without risk of ENOMEM. 239 * later date without risk of ENOMEM.
@@ -224,7 +249,7 @@ struct cred *cred_alloc_blank(void)
224#ifdef CONFIG_KEYS 249#ifdef CONFIG_KEYS
225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); 250 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
226 if (!new->tgcred) { 251 if (!new->tgcred) {
227 kfree(new); 252 kmem_cache_free(cred_jar, new);
228 return NULL; 253 return NULL;
229 } 254 }
230 atomic_set(&new->tgcred->usage, 1); 255 atomic_set(&new->tgcred->usage, 1);
@@ -347,60 +372,6 @@ struct cred *prepare_exec_creds(void)
347} 372}
348 373
349/* 374/*
350 * prepare new credentials for the usermode helper dispatcher
351 */
352struct cred *prepare_usermodehelper_creds(void)
353{
354#ifdef CONFIG_KEYS
355 struct thread_group_cred *tgcred = NULL;
356#endif
357 struct cred *new;
358
359#ifdef CONFIG_KEYS
360 tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
361 if (!tgcred)
362 return NULL;
363#endif
364
365 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
366 if (!new)
367 return NULL;
368
369 kdebug("prepare_usermodehelper_creds() alloc %p", new);
370
371 memcpy(new, &init_cred, sizeof(struct cred));
372
373 atomic_set(&new->usage, 1);
374 set_cred_subscribers(new, 0);
375 get_group_info(new->group_info);
376 get_uid(new->user);
377
378#ifdef CONFIG_KEYS
379 new->thread_keyring = NULL;
380 new->request_key_auth = NULL;
381 new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
382
383 atomic_set(&tgcred->usage, 1);
384 spin_lock_init(&tgcred->lock);
385 new->tgcred = tgcred;
386#endif
387
388#ifdef CONFIG_SECURITY
389 new->security = NULL;
390#endif
391 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
392 goto error;
393 validate_creds(new);
394
395 BUG_ON(atomic_read(&new->usage) != 1);
396 return new;
397
398error:
399 put_cred(new);
400 return NULL;
401}
402
403/*
404 * Copy credentials for the new process created by fork() 375 * Copy credentials for the new process created by fork()
405 * 376 *
406 * We share if we can, but under some circumstances we have to generate a new 377 * We share if we can, but under some circumstances we have to generate a new
@@ -516,8 +487,6 @@ int commit_creds(struct cred *new)
516#endif 487#endif
517 BUG_ON(atomic_read(&new->usage) < 1); 488 BUG_ON(atomic_read(&new->usage) < 1);
518 489
519 security_commit_creds(new, old);
520
521 get_cred(new); /* we will require a ref for the subj creds too */ 490 get_cred(new); /* we will require a ref for the subj creds too */
522 491
523 /* dumpability changes */ 492 /* dumpability changes */
@@ -553,8 +522,6 @@ int commit_creds(struct cred *new)
553 atomic_dec(&old->user->processes); 522 atomic_dec(&old->user->processes);
554 alter_cred_subscribers(old, -2); 523 alter_cred_subscribers(old, -2);
555 524
556 sched_switch_user(task);
557
558 /* send notifications */ 525 /* send notifications */
559 if (new->uid != old->uid || 526 if (new->uid != old->uid ||
560 new->euid != old->euid || 527 new->euid != old->euid ||
@@ -786,8 +753,6 @@ bool creds_are_invalid(const struct cred *cred)
786{ 753{
787 if (cred->magic != CRED_MAGIC) 754 if (cred->magic != CRED_MAGIC)
788 return true; 755 return true;
789 if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
790 return true;
791#ifdef CONFIG_SECURITY_SELINUX 756#ifdef CONFIG_SECURITY_SELINUX
792 if (selinux_is_enabled()) { 757 if (selinux_is_enabled()) {
793 if ((unsigned long) cred->security < PAGE_SIZE) 758 if ((unsigned long) cred->security < PAGE_SIZE)
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
new file mode 100644
index 000000000000..a85edc339985
--- /dev/null
+++ b/kernel/debug/Makefile
@@ -0,0 +1,6 @@
1#
2# Makefile for the linux kernel debugger
3#
4
5obj-$(CONFIG_KGDB) += debug_core.o gdbstub.o
6obj-$(CONFIG_KGDB_KDB) += kdb/
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
new file mode 100644
index 000000000000..8bc5eeffec8a
--- /dev/null
+++ b/kernel/debug/debug_core.c
@@ -0,0 +1,983 @@
1/*
2 * Kernel Debug Core
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h>
32#include <linux/interrupt.h>
33#include <linux/spinlock.h>
34#include <linux/console.h>
35#include <linux/threads.h>
36#include <linux/uaccess.h>
37#include <linux/kernel.h>
38#include <linux/module.h>
39#include <linux/ptrace.h>
40#include <linux/string.h>
41#include <linux/delay.h>
42#include <linux/sched.h>
43#include <linux/sysrq.h>
44#include <linux/init.h>
45#include <linux/kgdb.h>
46#include <linux/kdb.h>
47#include <linux/pid.h>
48#include <linux/smp.h>
49#include <linux/mm.h>
50
51#include <asm/cacheflush.h>
52#include <asm/byteorder.h>
53#include <asm/atomic.h>
54#include <asm/system.h>
55
56#include "debug_core.h"
57
58static int kgdb_break_asap;
59
60struct debuggerinfo_struct kgdb_info[NR_CPUS];
61
62/**
63 * kgdb_connected - Is a host GDB connected to us?
64 */
65int kgdb_connected;
66EXPORT_SYMBOL_GPL(kgdb_connected);
67
68/* All the KGDB handlers are installed */
69int kgdb_io_module_registered;
70
71/* Guard for recursive entry */
72static int exception_level;
73
74struct kgdb_io *dbg_io_ops;
75static DEFINE_SPINLOCK(kgdb_registration_lock);
76
77/* kgdb console driver is loaded */
78static int kgdb_con_registered;
79/* determine if kgdb console output should be used */
80static int kgdb_use_con;
81/* Flag for alternate operations for early debugging */
82bool dbg_is_early = true;
83/* Next cpu to become the master debug core */
84int dbg_switch_cpu;
85
86/* Use kdb or gdbserver mode */
87int dbg_kdb_mode = 1;
88
89static int __init opt_kgdb_con(char *str)
90{
91 kgdb_use_con = 1;
92 return 0;
93}
94
95early_param("kgdbcon", opt_kgdb_con);
96
97module_param(kgdb_use_con, int, 0644);
98
99/*
100 * Holds information about breakpoints in a kernel. These breakpoints are
101 * added and removed by gdb.
102 */
103static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
104 [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
105};
106
107/*
108 * The CPU# of the active CPU, or -1 if none:
109 */
110atomic_t kgdb_active = ATOMIC_INIT(-1);
111EXPORT_SYMBOL_GPL(kgdb_active);
112
113/*
114 * We use NR_CPUS not PERCPU, in case kgdb is used to debug early
115 * bootup code (which might not have percpu set up yet):
116 */
117static atomic_t passive_cpu_wait[NR_CPUS];
118static atomic_t cpu_in_kgdb[NR_CPUS];
119static atomic_t kgdb_break_tasklet_var;
120atomic_t kgdb_setting_breakpoint;
121
122struct task_struct *kgdb_usethread;
123struct task_struct *kgdb_contthread;
124
125int kgdb_single_step;
126static pid_t kgdb_sstep_pid;
127
128/* to keep track of the CPU which is doing the single stepping*/
129atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
130
131/*
132 * If you are debugging a problem where roundup (the collection of
133 * all other CPUs) is a problem [this should be extremely rare],
134 * then use the nokgdbroundup option to avoid roundup. In that case
135 * the other CPUs might interfere with your debugging context, so
136 * use this with care:
137 */
138static int kgdb_do_roundup = 1;
139
140static int __init opt_nokgdbroundup(char *str)
141{
142 kgdb_do_roundup = 0;
143
144 return 0;
145}
146
147early_param("nokgdbroundup", opt_nokgdbroundup);
148
149/*
150 * Finally, some KGDB code :-)
151 */
152
153/*
154 * Weak aliases for breakpoint management,
155 * can be overridden by architectures when needed:
156 */
157int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
158{
159 int err;
160
161 err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
162 if (err)
163 return err;
164
165 return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
166 BREAK_INSTR_SIZE);
167}
168
169int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
170{
171 return probe_kernel_write((char *)addr,
172 (char *)bundle, BREAK_INSTR_SIZE);
173}
174
175int __weak kgdb_validate_break_address(unsigned long addr)
176{
177 char tmp_variable[BREAK_INSTR_SIZE];
178 int err;
179	/* Validate setting the breakpoint and then removing it. If the
180	 * remove fails, the kernel needs to emit a bad message because we
181	 * are in deep trouble, not being able to put things back the way we
182	 * found them.
183 */
184 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
185 if (err)
186 return err;
187 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
188 if (err)
189 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
190 "memory destroyed at: %lx", addr);
191 return err;
192}
193
194unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
195{
196 return instruction_pointer(regs);
197}
198
199int __weak kgdb_arch_init(void)
200{
201 return 0;
202}
203
204int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
205{
206 return 0;
207}
208
209/**
210 * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb.
211 * @regs: Current &struct pt_regs.
212 *
213 * This function will be called if the particular architecture must
214 * disable hardware debugging while it is processing gdb packets or
215 * handling exception.
216 */
217void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
218{
219}
220
221/*
222 * Some architectures need cache flushes when we set/clear a
223 * breakpoint:
224 */
225static void kgdb_flush_swbreak_addr(unsigned long addr)
226{
227 if (!CACHE_FLUSH_IS_SAFE)
228 return;
229
230 if (current->mm && current->mm->mmap_cache) {
231 flush_cache_range(current->mm->mmap_cache,
232 addr, addr + BREAK_INSTR_SIZE);
233 }
234 /* Force flush instruction cache if it was outside the mm */
235 flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
236}
237
238/*
239 * SW breakpoint management:
240 */
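The breakpoint slot lifecycle implemented by the helpers below, summarised for reference (derived from the state checks in this file):

/*
 * BP_UNDEFINED --dbg_set_sw_break()--------------> BP_SET
 * BP_SET       --dbg_activate_sw_breakpoints()---> BP_ACTIVE
 * BP_ACTIVE    --dbg_deactivate_sw_breakpoints()-> BP_SET
 * BP_SET       --dbg_remove_sw_break()-----------> BP_REMOVED
 * any state    --dbg_remove_all_break()----------> BP_UNDEFINED
 */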
241int dbg_activate_sw_breakpoints(void)
242{
243 unsigned long addr;
244 int error;
245 int ret = 0;
246 int i;
247
248 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
249 if (kgdb_break[i].state != BP_SET)
250 continue;
251
252 addr = kgdb_break[i].bpt_addr;
253 error = kgdb_arch_set_breakpoint(addr,
254 kgdb_break[i].saved_instr);
255 if (error) {
256 ret = error;
257 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
258 continue;
259 }
260
261 kgdb_flush_swbreak_addr(addr);
262 kgdb_break[i].state = BP_ACTIVE;
263 }
264 return ret;
265}
266
267int dbg_set_sw_break(unsigned long addr)
268{
269 int err = kgdb_validate_break_address(addr);
270 int breakno = -1;
271 int i;
272
273 if (err)
274 return err;
275
276 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
277 if ((kgdb_break[i].state == BP_SET) &&
278 (kgdb_break[i].bpt_addr == addr))
279 return -EEXIST;
280 }
281 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
282 if (kgdb_break[i].state == BP_REMOVED &&
283 kgdb_break[i].bpt_addr == addr) {
284 breakno = i;
285 break;
286 }
287 }
288
289 if (breakno == -1) {
290 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
291 if (kgdb_break[i].state == BP_UNDEFINED) {
292 breakno = i;
293 break;
294 }
295 }
296 }
297
298 if (breakno == -1)
299 return -E2BIG;
300
301 kgdb_break[breakno].state = BP_SET;
302 kgdb_break[breakno].type = BP_BREAKPOINT;
303 kgdb_break[breakno].bpt_addr = addr;
304
305 return 0;
306}
307
308int dbg_deactivate_sw_breakpoints(void)
309{
310 unsigned long addr;
311 int error;
312 int ret = 0;
313 int i;
314
315 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
316 if (kgdb_break[i].state != BP_ACTIVE)
317 continue;
318 addr = kgdb_break[i].bpt_addr;
319 error = kgdb_arch_remove_breakpoint(addr,
320 kgdb_break[i].saved_instr);
321 if (error) {
322 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
323 ret = error;
324 }
325
326 kgdb_flush_swbreak_addr(addr);
327 kgdb_break[i].state = BP_SET;
328 }
329 return ret;
330}
331
332int dbg_remove_sw_break(unsigned long addr)
333{
334 int i;
335
336 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
337 if ((kgdb_break[i].state == BP_SET) &&
338 (kgdb_break[i].bpt_addr == addr)) {
339 kgdb_break[i].state = BP_REMOVED;
340 return 0;
341 }
342 }
343 return -ENOENT;
344}
345
346int kgdb_isremovedbreak(unsigned long addr)
347{
348 int i;
349
350 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
351 if ((kgdb_break[i].state == BP_REMOVED) &&
352 (kgdb_break[i].bpt_addr == addr))
353 return 1;
354 }
355 return 0;
356}
357
358int dbg_remove_all_break(void)
359{
360 unsigned long addr;
361 int error;
362 int i;
363
364 /* Clear memory breakpoints. */
365 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
366 if (kgdb_break[i].state != BP_ACTIVE)
367 goto setundefined;
368 addr = kgdb_break[i].bpt_addr;
369 error = kgdb_arch_remove_breakpoint(addr,
370 kgdb_break[i].saved_instr);
371 if (error)
372 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
373 addr);
374setundefined:
375 kgdb_break[i].state = BP_UNDEFINED;
376 }
377
378 /* Clear hardware breakpoints. */
379 if (arch_kgdb_ops.remove_all_hw_break)
380 arch_kgdb_ops.remove_all_hw_break();
381
382 return 0;
383}
384
385/*
386 * Return true if there is a valid kgdb I/O module. Also if no
387 * debugger is attached, a message can be printed to the console about
388 * waiting for the debugger to attach.
389 *
390 * The print_wait argument should only be true when called from inside
391 * the core kgdb_handle_exception, because it will wait for the
392 * debugger to attach.
393 */
394static int kgdb_io_ready(int print_wait)
395{
396 if (!dbg_io_ops)
397 return 0;
398 if (kgdb_connected)
399 return 1;
400 if (atomic_read(&kgdb_setting_breakpoint))
401 return 1;
402 if (print_wait) {
403#ifdef CONFIG_KGDB_KDB
404 if (!dbg_kdb_mode)
405 printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n");
406#else
407 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
408#endif
409 }
410 return 1;
411}
412
413static int kgdb_reenter_check(struct kgdb_state *ks)
414{
415 unsigned long addr;
416
417 if (atomic_read(&kgdb_active) != raw_smp_processor_id())
418 return 0;
419
420 /* Panic on recursive debugger calls: */
421 exception_level++;
422 addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
423 dbg_deactivate_sw_breakpoints();
424
425 /*
426	 * If the breakpoint was removed OK at the place the exception
427	 * occurred, try to recover and print a warning to the end
428	 * user, because the user planted a breakpoint in a place that
429	 * KGDB needs in order to function.
430 */
431 if (dbg_remove_sw_break(addr) == 0) {
432 exception_level = 0;
433 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
434 dbg_activate_sw_breakpoints();
435 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
436 addr);
437 WARN_ON_ONCE(1);
438
439 return 1;
440 }
441 dbg_remove_all_break();
442 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
443
444 if (exception_level > 1) {
445 dump_stack();
446 panic("Recursive entry to debugger");
447 }
448
449 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
450#ifdef CONFIG_KGDB_KDB
451 /* Allow kdb to debug itself one level */
452 return 0;
453#endif
454 dump_stack();
455 panic("Recursive entry to debugger");
456
457 return 1;
458}
459
460static void dbg_cpu_switch(int cpu, int next_cpu)
461{
462 /* Mark the cpu we are switching away from as a slave when it
463	 * holds the kgdb_active token. This must be done so that all
464	 * the cpus waiting in the debug core loop will not enter
465	 * again as the master. */
466 if (cpu == atomic_read(&kgdb_active)) {
467 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
468 kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
469 }
470 kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
471}
472
473static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
474{
475 unsigned long flags;
476 int sstep_tries = 100;
477 int error;
478 int i, cpu;
479 int trace_on = 0;
480acquirelock:
481 /*
482 * Interrupts will be restored by the 'trap return' code, except when
483 * single stepping.
484 */
485 local_irq_save(flags);
486
487 cpu = ks->cpu;
488 kgdb_info[cpu].debuggerinfo = regs;
489 kgdb_info[cpu].task = current;
490 kgdb_info[cpu].ret_state = 0;
491 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
492 /*
493 * Make sure the above info reaches the primary CPU before
494 * our cpu_in_kgdb[] flag setting does:
495 */
496 atomic_inc(&cpu_in_kgdb[cpu]);
497
498 if (exception_level == 1)
499 goto cpu_master_loop;
500
501 /*
502 * CPU will loop if it is a slave or request to become a kgdb
503 * master cpu and acquire the kgdb_active lock:
504 */
505 while (1) {
506cpu_loop:
507 if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) {
508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
509 goto cpu_master_loop;
510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
511 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
512 break;
513 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
514 if (!atomic_read(&passive_cpu_wait[cpu]))
515 goto return_normal;
516 } else {
517return_normal:
518 /* Return to normal operation by executing any
519 * hw breakpoint fixup.
520 */
521 if (arch_kgdb_ops.correct_hw_break)
522 arch_kgdb_ops.correct_hw_break();
523 if (trace_on)
524 tracing_on();
525 atomic_dec(&cpu_in_kgdb[cpu]);
526 touch_softlockup_watchdog_sync();
527 clocksource_touch_watchdog();
528 local_irq_restore(flags);
529 return 0;
530 }
531 cpu_relax();
532 }
533
534 /*
535 * For single stepping, try to only enter on the processor
536	 * that was single stepping. To guard against a deadlock, the
537 * kernel will only try for the value of sstep_tries before
538 * giving up and continuing on.
539 */
540 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
541 (kgdb_info[cpu].task &&
542 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
543 atomic_set(&kgdb_active, -1);
544 touch_softlockup_watchdog_sync();
545 clocksource_touch_watchdog();
546 local_irq_restore(flags);
547
548 goto acquirelock;
549 }
550
551 if (!kgdb_io_ready(1)) {
552 kgdb_info[cpu].ret_state = 1;
553 goto kgdb_restore; /* No I/O connection, resume the system */
554 }
555
556 /*
557 * Don't enter if we have hit a removed breakpoint.
558 */
559 if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
560 goto kgdb_restore;
561
562 /* Call the I/O driver's pre_exception routine */
563 if (dbg_io_ops->pre_exception)
564 dbg_io_ops->pre_exception();
565
566 kgdb_disable_hw_debug(ks->linux_regs);
567
568 /*
569 * Get the passive CPU lock which will hold all the non-primary
570	 * CPUs in a spin state while the debugger is active
571 */
572 if (!kgdb_single_step) {
573 for (i = 0; i < NR_CPUS; i++)
574 atomic_inc(&passive_cpu_wait[i]);
575 }
576
577#ifdef CONFIG_SMP
578 /* Signal the other CPUs to enter kgdb_wait() */
579 if ((!kgdb_single_step) && kgdb_do_roundup)
580 kgdb_roundup_cpus(flags);
581#endif
582
583 /*
584 * Wait for the other CPUs to be notified and be waiting for us:
585 */
586 for_each_online_cpu(i) {
587 while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i]))
588 cpu_relax();
589 }
590
591 /*
592 * At this point the primary processor is completely
593 * in the debugger and all secondary CPUs are quiescent
594 */
595 dbg_deactivate_sw_breakpoints();
596 kgdb_single_step = 0;
597 kgdb_contthread = current;
598 exception_level = 0;
599 trace_on = tracing_is_on();
600 if (trace_on)
601 tracing_off();
602
603 while (1) {
604cpu_master_loop:
605 if (dbg_kdb_mode) {
606 kgdb_connected = 1;
607 error = kdb_stub(ks);
608 kgdb_connected = 0;
609 } else {
610 error = gdb_serial_stub(ks);
611 }
612
613 if (error == DBG_PASS_EVENT) {
614 dbg_kdb_mode = !dbg_kdb_mode;
615 } else if (error == DBG_SWITCH_CPU_EVENT) {
616 dbg_cpu_switch(cpu, dbg_switch_cpu);
617 goto cpu_loop;
618 } else {
619 kgdb_info[cpu].ret_state = error;
620 break;
621 }
622 }
623
624 /* Call the I/O driver's post_exception routine */
625 if (dbg_io_ops->post_exception)
626 dbg_io_ops->post_exception();
627
628 atomic_dec(&cpu_in_kgdb[ks->cpu]);
629
630 if (!kgdb_single_step) {
631 for (i = NR_CPUS-1; i >= 0; i--)
632 atomic_dec(&passive_cpu_wait[i]);
633 /*
634 * Wait till all the CPUs have quit from the debugger,
635 * but allow a CPU that hit an exception and is
636 * waiting to become the master to remain in the debug
637 * core.
638 */
639 for_each_online_cpu(i) {
640 while (kgdb_do_roundup &&
641 atomic_read(&cpu_in_kgdb[i]) &&
642 !(kgdb_info[i].exception_state &
643 DCPU_WANT_MASTER))
644 cpu_relax();
645 }
646 }
647
648kgdb_restore:
649 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
650 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
651 if (kgdb_info[sstep_cpu].task)
652 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
653 else
654 kgdb_sstep_pid = 0;
655 }
656 if (trace_on)
657 tracing_on();
658 /* Free kgdb_active */
659 atomic_set(&kgdb_active, -1);
660 touch_softlockup_watchdog_sync();
661 clocksource_touch_watchdog();
662 local_irq_restore(flags);
663
664 return kgdb_info[cpu].ret_state;
665}
666
667/*
668 * kgdb_handle_exception() - main entry point from a kernel exception
669 *
670 * Locking hierarchy:
671 * interface locks, if any (begin_session)
672 * kgdb lock (kgdb_active)
673 */
674int
675kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
676{
677 struct kgdb_state kgdb_var;
678 struct kgdb_state *ks = &kgdb_var;
679 int ret;
680
681 ks->cpu = raw_smp_processor_id();
682 ks->ex_vector = evector;
683 ks->signo = signo;
684 ks->err_code = ecode;
685 ks->kgdb_usethreadid = 0;
686 ks->linux_regs = regs;
687
688 if (kgdb_reenter_check(ks))
689 return 0; /* Ouch, double exception ! */
690 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
691 ret = kgdb_cpu_enter(ks, regs);
692 kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER |
693 DCPU_IS_SLAVE);
694 return ret;
695}
696
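For illustration only, an architecture's trap handler hands control to the entry point above roughly as follows; the function name, vector number and error code are hypothetical:

	/* Hypothetical arch glue -- not part of this patch. */
	static void example_arch_breakpoint_trap(struct pt_regs *regs)
	{
		/* vector 3, SIGTRAP and error code 0 are illustrative values */
		if (kgdb_handle_exception(3, SIGTRAP, 0, regs))
			printk(KERN_ERR "example: debug core did not take the trap\n");
	}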
697int kgdb_nmicallback(int cpu, void *regs)
698{
699#ifdef CONFIG_SMP
700 struct kgdb_state kgdb_var;
701 struct kgdb_state *ks = &kgdb_var;
702
703 memset(ks, 0, sizeof(struct kgdb_state));
704 ks->cpu = cpu;
705 ks->linux_regs = regs;
706
707 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
708 atomic_read(&kgdb_active) != -1 &&
709 atomic_read(&kgdb_active) != cpu) {
710 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
711 kgdb_cpu_enter(ks, regs);
712 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
713 return 0;
714 }
715#endif
716 return 1;
717}
718
719static void kgdb_console_write(struct console *co, const char *s,
720 unsigned count)
721{
722 unsigned long flags;
723
724	/* If we're debugging, or KGDB has not connected, don't try
725	 * to print. */
726 if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode)
727 return;
728
729 local_irq_save(flags);
730 gdbstub_msg_write(s, count);
731 local_irq_restore(flags);
732}
733
734static struct console kgdbcons = {
735 .name = "kgdb",
736 .write = kgdb_console_write,
737 .flags = CON_PRINTBUFFER | CON_ENABLED,
738 .index = -1,
739};
740
741#ifdef CONFIG_MAGIC_SYSRQ
742static void sysrq_handle_dbg(int key, struct tty_struct *tty)
743{
744 if (!dbg_io_ops) {
745 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
746 return;
747 }
748 if (!kgdb_connected) {
749#ifdef CONFIG_KGDB_KDB
750 if (!dbg_kdb_mode)
751 printk(KERN_CRIT "KGDB or $3#33 for KDB\n");
752#else
753 printk(KERN_CRIT "Entering KGDB\n");
754#endif
755 }
756
757 kgdb_breakpoint();
758}
759
760static struct sysrq_key_op sysrq_dbg_op = {
761 .handler = sysrq_handle_dbg,
762 .help_msg = "debug(G)",
763 .action_msg = "DEBUG",
764};
765#endif
766
767static int kgdb_panic_event(struct notifier_block *self,
768 unsigned long val,
769 void *data)
770{
771 if (dbg_kdb_mode)
772 kdb_printf("PANIC: %s\n", (char *)data);
773 kgdb_breakpoint();
774 return NOTIFY_DONE;
775}
776
777static struct notifier_block kgdb_panic_event_nb = {
778 .notifier_call = kgdb_panic_event,
779 .priority = INT_MAX,
780};
781
782void __weak kgdb_arch_late(void)
783{
784}
785
786void __init dbg_late_init(void)
787{
788 dbg_is_early = false;
789 if (kgdb_io_module_registered)
790 kgdb_arch_late();
791 kdb_init(KDB_INIT_FULL);
792}
793
794static void kgdb_register_callbacks(void)
795{
796 if (!kgdb_io_module_registered) {
797 kgdb_io_module_registered = 1;
798 kgdb_arch_init();
799 if (!dbg_is_early)
800 kgdb_arch_late();
801 atomic_notifier_chain_register(&panic_notifier_list,
802 &kgdb_panic_event_nb);
803#ifdef CONFIG_MAGIC_SYSRQ
804 register_sysrq_key('g', &sysrq_dbg_op);
805#endif
806 if (kgdb_use_con && !kgdb_con_registered) {
807 register_console(&kgdbcons);
808 kgdb_con_registered = 1;
809 }
810 }
811}
812
813static void kgdb_unregister_callbacks(void)
814{
815 /*
816 * When this routine is called KGDB should unregister from the
817 * panic handler and clean up, making sure it is not handling any
818 * break exceptions at the time.
819 */
820 if (kgdb_io_module_registered) {
821 kgdb_io_module_registered = 0;
822 atomic_notifier_chain_unregister(&panic_notifier_list,
823 &kgdb_panic_event_nb);
824 kgdb_arch_exit();
825#ifdef CONFIG_MAGIC_SYSRQ
826 unregister_sysrq_key('g', &sysrq_dbg_op);
827#endif
828 if (kgdb_con_registered) {
829 unregister_console(&kgdbcons);
830 kgdb_con_registered = 0;
831 }
832 }
833}
834
835/*
836 * There are times a tasklet needs to be used vs a compiled in
837 * break point so as to cause an exception outside a kgdb I/O module,
838 * such as is the case with kgdboe, where calling a breakpoint in the
839 * I/O driver itself would be fatal.
840 */
841static void kgdb_tasklet_bpt(unsigned long ing)
842{
843 kgdb_breakpoint();
844 atomic_set(&kgdb_break_tasklet_var, 0);
845}
846
847static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
848
849void kgdb_schedule_breakpoint(void)
850{
851 if (atomic_read(&kgdb_break_tasklet_var) ||
852 atomic_read(&kgdb_active) != -1 ||
853 atomic_read(&kgdb_setting_breakpoint))
854 return;
855 atomic_inc(&kgdb_break_tasklet_var);
856 tasklet_schedule(&kgdb_tasklet_breakpoint);
857}
858EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
859
860static void kgdb_initial_breakpoint(void)
861{
862 kgdb_break_asap = 0;
863
864 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
865 kgdb_breakpoint();
866}
867
868/**
869 * kgdb_register_io_module - register KGDB IO module
870 * @new_dbg_io_ops: the io ops vector
871 *
872 * Register it with the KGDB core.
873 */
874int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
875{
876 int err;
877
878 spin_lock(&kgdb_registration_lock);
879
880 if (dbg_io_ops) {
881 spin_unlock(&kgdb_registration_lock);
882
883 printk(KERN_ERR "kgdb: Another I/O driver is already "
884 "registered with KGDB.\n");
885 return -EBUSY;
886 }
887
888 if (new_dbg_io_ops->init) {
889 err = new_dbg_io_ops->init();
890 if (err) {
891 spin_unlock(&kgdb_registration_lock);
892 return err;
893 }
894 }
895
896 dbg_io_ops = new_dbg_io_ops;
897
898 spin_unlock(&kgdb_registration_lock);
899
900 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
901 new_dbg_io_ops->name);
902
903 /* Arm KGDB now. */
904 kgdb_register_callbacks();
905
906 if (kgdb_break_asap)
907 kgdb_initial_breakpoint();
908
909 return 0;
910}
911EXPORT_SYMBOL_GPL(kgdb_register_io_module);
912
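A minimal registration sketch, illustrative only: the driver name and callbacks are hypothetical, the member names are assumed from the dbg_io_ops usage in this file, and a real driver would normally also supply flush/init hooks.

	static int example_dbg_read_char(void)
	{
		return NO_POLL_CHAR;	/* pretend no character is available */
	}

	static void example_dbg_write_char(u8 c)
	{
		/* push 'c' out over the example transport here */
	}

	static struct kgdb_io example_dbg_io_ops = {
		.name		= "example_kgdbio",
		.read_char	= example_dbg_read_char,
		.write_char	= example_dbg_write_char,
	};

	/* ... later, e.g. from the driver's init code: */
	/* err = kgdb_register_io_module(&example_dbg_io_ops); */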
913/**
914 * kgdb_unregister_io_module - unregister KGDB IO module
915 * @old_dbg_io_ops: the io ops vector
916 *
917 * Unregister it with the KGDB core.
918 */
919void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
920{
921 BUG_ON(kgdb_connected);
922
923 /*
924 * KGDB is no longer able to communicate out, so
925 * unregister our callbacks and reset state.
926 */
927 kgdb_unregister_callbacks();
928
929 spin_lock(&kgdb_registration_lock);
930
931 WARN_ON_ONCE(dbg_io_ops != old_dbg_io_ops);
932 dbg_io_ops = NULL;
933
934 spin_unlock(&kgdb_registration_lock);
935
936 printk(KERN_INFO
937 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
938 old_dbg_io_ops->name);
939}
940EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
941
942int dbg_io_get_char(void)
943{
944 int ret = dbg_io_ops->read_char();
945 if (ret == NO_POLL_CHAR)
946 return -1;
947 if (!dbg_kdb_mode)
948 return ret;
949 if (ret == 127)
950 return 8;
951 return ret;
952}
953
954/**
955 * kgdb_breakpoint - generate breakpoint exception
956 *
957 * This function will generate a breakpoint exception. It is used at the
958 * beginning of a program to sync up with a debugger and can be used
959 * otherwise as a quick means to stop program execution and "break" into
960 * the debugger.
961 */
962void kgdb_breakpoint(void)
963{
964 atomic_inc(&kgdb_setting_breakpoint);
965 wmb(); /* Sync point before breakpoint */
966 arch_kgdb_breakpoint();
967 wmb(); /* Sync point after breakpoint */
968 atomic_dec(&kgdb_setting_breakpoint);
969}
970EXPORT_SYMBOL_GPL(kgdb_breakpoint);
971
972static int __init opt_kgdb_wait(char *str)
973{
974 kgdb_break_asap = 1;
975
976 kdb_init(KDB_INIT_EARLY);
977 if (kgdb_io_module_registered)
978 kgdb_initial_breakpoint();
979
980 return 0;
981}
982
983early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
new file mode 100644
index 000000000000..c5d753d80f67
--- /dev/null
+++ b/kernel/debug/debug_core.h
@@ -0,0 +1,81 @@
1/*
2 * Created by: Jason Wessel <jason.wessel@windriver.com>
3 *
4 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
5 *
6 * This file is licensed under the terms of the GNU General Public
7 * License version 2. This program is licensed "as is" without any
8 * warranty of any kind, whether express or implied.
9 */
10
11#ifndef _DEBUG_CORE_H_
12#define _DEBUG_CORE_H_
13/*
14 * These are the private implementation headers between the kernel
15 * debugger core and the debugger front end code.
16 */
17
18/* kernel debug core data structures */
19struct kgdb_state {
20 int ex_vector;
21 int signo;
22 int err_code;
23 int cpu;
24 int pass_exception;
25 unsigned long thr_query;
26 unsigned long threadid;
27 long kgdb_usethreadid;
28 struct pt_regs *linux_regs;
29};
30
31/* Exception state values */
32#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
33#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
34#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
35#define DCPU_SSTEP 0x8 /* CPU is single stepping */
36
37struct debuggerinfo_struct {
38 void *debuggerinfo;
39 struct task_struct *task;
40 int exception_state;
41 int ret_state;
42 int irq_depth;
43};
44
45extern struct debuggerinfo_struct kgdb_info[];
46
47/* kernel debug core break point routines */
48extern int dbg_remove_all_break(void);
49extern int dbg_set_sw_break(unsigned long addr);
50extern int dbg_remove_sw_break(unsigned long addr);
51extern int dbg_activate_sw_breakpoints(void);
52extern int dbg_deactivate_sw_breakpoints(void);
53
54/* polled character access to i/o module */
55extern int dbg_io_get_char(void);
56
57/* stub return value for switching between the gdbstub and kdb */
58#define DBG_PASS_EVENT -12345
59/* Switch from one cpu to another */
60#define DBG_SWITCH_CPU_EVENT -123456
61extern int dbg_switch_cpu;
62
63/* gdbstub interface functions */
64extern int gdb_serial_stub(struct kgdb_state *ks);
65extern void gdbstub_msg_write(const char *s, int len);
66
67/* gdbstub functions used for kdb <-> gdbstub transition */
68extern int gdbstub_state(struct kgdb_state *ks, char *cmd);
69extern int dbg_kdb_mode;
70
71#ifdef CONFIG_KGDB_KDB
72extern int kdb_stub(struct kgdb_state *ks);
73extern int kdb_parse(const char *cmdstr);
74#else /* ! CONFIG_KGDB_KDB */
75static inline int kdb_stub(struct kgdb_state *ks)
76{
77 return DBG_PASS_EVENT;
78}
79#endif /* CONFIG_KGDB_KDB */
80
81#endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
new file mode 100644
index 000000000000..e8fd6868682d
--- /dev/null
+++ b/kernel/debug/gdbstub.c
@@ -0,0 +1,1014 @@
1/*
2 * Kernel Debug Core
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30
31#include <linux/kernel.h>
32#include <linux/kgdb.h>
33#include <linux/kdb.h>
34#include <linux/reboot.h>
35#include <linux/uaccess.h>
36#include <asm/cacheflush.h>
37#include <asm/unaligned.h>
38#include "debug_core.h"
39
40#define KGDB_MAX_THREAD_QUERY 17
41
42/* Our I/O buffers. */
43static char remcom_in_buffer[BUFMAX];
44static char remcom_out_buffer[BUFMAX];
45
46/* Storage for the registers, in GDB format. */
47static unsigned long gdb_regs[(NUMREGBYTES +
48 sizeof(unsigned long) - 1) /
49 sizeof(unsigned long)];
50
51/*
52 * GDB remote protocol parser:
53 */
54
55static int hex(char ch)
56{
57 if ((ch >= 'a') && (ch <= 'f'))
58 return ch - 'a' + 10;
59 if ((ch >= '0') && (ch <= '9'))
60 return ch - '0';
61 if ((ch >= 'A') && (ch <= 'F'))
62 return ch - 'A' + 10;
63 return -1;
64}
65
66#ifdef CONFIG_KGDB_KDB
67static int gdbstub_read_wait(void)
68{
69 int ret = -1;
70 int i;
71
72 /* poll any additional I/O interfaces that are defined */
73 while (ret < 0)
74 for (i = 0; kdb_poll_funcs[i] != NULL; i++) {
75 ret = kdb_poll_funcs[i]();
76 if (ret > 0)
77 break;
78 }
79 return ret;
80}
81#else
82static int gdbstub_read_wait(void)
83{
84 int ret = dbg_io_ops->read_char();
85 while (ret == NO_POLL_CHAR)
86 ret = dbg_io_ops->read_char();
87 return ret;
88}
89#endif
90/* scan for the sequence $<data>#<checksum> */
91static void get_packet(char *buffer)
92{
93 unsigned char checksum;
94 unsigned char xmitcsum;
95 int count;
96 char ch;
97
98 do {
99 /*
100 * Spin and wait around for the start character, ignore all
101 * other characters:
102 */
103 while ((ch = (gdbstub_read_wait())) != '$')
104 /* nothing */;
105
106 kgdb_connected = 1;
107 checksum = 0;
108 xmitcsum = -1;
109
110 count = 0;
111
112 /*
113 * now, read until a # or end of buffer is found:
114 */
115 while (count < (BUFMAX - 1)) {
116 ch = gdbstub_read_wait();
117 if (ch == '#')
118 break;
119 checksum = checksum + ch;
120 buffer[count] = ch;
121 count = count + 1;
122 }
123 buffer[count] = 0;
124
125 if (ch == '#') {
126 xmitcsum = hex(gdbstub_read_wait()) << 4;
127 xmitcsum += hex(gdbstub_read_wait());
128
129 if (checksum != xmitcsum)
130 /* failed checksum */
131 dbg_io_ops->write_char('-');
132 else
133 /* successful transfer */
134 dbg_io_ops->write_char('+');
135 if (dbg_io_ops->flush)
136 dbg_io_ops->flush();
137 }
138 } while (checksum != xmitcsum);
139}
140
141/*
142 * Send the packet in buffer.
143 * Check for gdb connection if asked for.
144 */
145static void put_packet(char *buffer)
146{
147 unsigned char checksum;
148 int count;
149 char ch;
150
151 /*
152 * $<packet info>#<checksum>.
153 */
154 while (1) {
155 dbg_io_ops->write_char('$');
156 checksum = 0;
157 count = 0;
158
159 while ((ch = buffer[count])) {
160 dbg_io_ops->write_char(ch);
161 checksum += ch;
162 count++;
163 }
164
165 dbg_io_ops->write_char('#');
166 dbg_io_ops->write_char(hex_asc_hi(checksum));
167 dbg_io_ops->write_char(hex_asc_lo(checksum));
168 if (dbg_io_ops->flush)
169 dbg_io_ops->flush();
170
171 /* Now see what we get in reply. */
172 ch = gdbstub_read_wait();
173
174 if (ch == 3)
175 ch = gdbstub_read_wait();
176
177 /* If we get an ACK, we are done. */
178 if (ch == '+')
179 return;
180
181 /*
182 * If we get the start of another packet, this means
183 * that GDB is attempting to reconnect. We will NAK
184 * the packet being sent, and stop trying to send this
185 * packet.
186 */
187 if (ch == '$') {
188 dbg_io_ops->write_char('-');
189 if (dbg_io_ops->flush)
190 dbg_io_ops->flush();
191 return;
192 }
193 }
194}
195
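A worked example of the $<data>#<checksum> framing handled by get_packet()/put_packet() above; the checksum is the 8-bit sum of the payload bytes, sent as two hex digits:

/*
 * e.g. the reply "OK" is transmitted as  $OK#9a
 * ('O' + 'K' = 0x4f + 0x4b = 0x9a); the peer answers '+' to ack a good
 * checksum or '-' to request a retransmit.
 */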
196static char gdbmsgbuf[BUFMAX + 1];
197
198void gdbstub_msg_write(const char *s, int len)
199{
200 char *bufptr;
201 int wcount;
202 int i;
203
204 if (len == 0)
205 len = strlen(s);
206
207 /* 'O'utput */
208 gdbmsgbuf[0] = 'O';
209
210 /* Fill and send buffers... */
211 while (len > 0) {
212 bufptr = gdbmsgbuf + 1;
213
214 /* Calculate how many this time */
215 if ((len << 1) > (BUFMAX - 2))
216 wcount = (BUFMAX - 2) >> 1;
217 else
218 wcount = len;
219
220 /* Pack in hex chars */
221 for (i = 0; i < wcount; i++)
222 bufptr = pack_hex_byte(bufptr, s[i]);
223 *bufptr = '\0';
224
225 /* Move up */
226 s += wcount;
227 len -= wcount;
228
229 /* Write packet */
230 put_packet(gdbmsgbuf);
231 }
232}
233
234/*
235 * Convert the memory pointed to by mem into hex, placing result in
236 * buf. Return 0 on success, or a negative error code if the memory
237 * could not be read.
238 */
239int kgdb_mem2hex(char *mem, char *buf, int count)
240{
241 char *tmp;
242 int err;
243
244 /*
245 * We use the upper half of buf as an intermediate buffer for the
246 * raw memory copy. Hex conversion will work against this one.
247 */
248 tmp = buf + count;
249
250 err = probe_kernel_read(tmp, mem, count);
251 if (!err) {
252 while (count > 0) {
253 buf = pack_hex_byte(buf, *tmp);
254 tmp++;
255 count--;
256 }
257
258 *buf = 0;
259 }
260
261 return err;
262}
263
264/*
265 * Convert the hex array pointed to by buf into binary to be placed in
266 * mem. Return 0 on success, or a negative error code if the memory
267 * could not be written.
268 */
269int kgdb_hex2mem(char *buf, char *mem, int count)
270{
271 char *tmp_raw;
272 char *tmp_hex;
273
274 /*
275 * We use the upper half of buf as an intermediate buffer for the
276 * raw memory that is converted from hex.
277 */
278 tmp_raw = buf + count * 2;
279
280 tmp_hex = tmp_raw - 1;
281 while (tmp_hex >= buf) {
282 tmp_raw--;
283 *tmp_raw = hex(*tmp_hex--);
284 *tmp_raw |= hex(*tmp_hex--) << 4;
285 }
286
287 return probe_kernel_write(mem, tmp_raw, count);
288}
289
290/*
291 * While we find nice hex chars, build a long_val.
292 * Return number of chars processed.
293 */
294int kgdb_hex2long(char **ptr, unsigned long *long_val)
295{
296 int hex_val;
297 int num = 0;
298 int negate = 0;
299
300 *long_val = 0;
301
302 if (**ptr == '-') {
303 negate = 1;
304 (*ptr)++;
305 }
306 while (**ptr) {
307 hex_val = hex(**ptr);
308 if (hex_val < 0)
309 break;
310
311 *long_val = (*long_val << 4) | hex_val;
312 num++;
313 (*ptr)++;
314 }
315
316 if (negate)
317 *long_val = -*long_val;
318
319 return num;
320}
321
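A small worked example of the parser above (the input value is chosen for illustration):

/*
 * e.g. for *ptr == "7d,...": the call sets *long_val = 0x7d, returns 2,
 * and advances *ptr to the ',' -- which is exactly what write_mem_msg()
 * below relies on when it splits "addr,length".
 */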
322/*
323 * Copy the binary array pointed to by buf into mem, unescaping $, #,
324 * and 0x7d characters that were escaped with 0x7d. The input buf is
325 * overwritten with the result. Return -EFAULT on failure or 0 on success.
326 */
327static int kgdb_ebin2mem(char *buf, char *mem, int count)
328{
329 int size = 0;
330 char *c = buf;
331
332 while (count-- > 0) {
333 c[size] = *buf++;
334 if (c[size] == 0x7d)
335 c[size] = *buf++ ^ 0x20;
336 size++;
337 }
338
339 return probe_kernel_write(mem, c, size);
340}
341
342/* Write memory due to an 'M' or 'X' packet. */
343static int write_mem_msg(int binary)
344{
345 char *ptr = &remcom_in_buffer[1];
346 unsigned long addr;
347 unsigned long length;
348 int err;
349
350 if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
351 kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
352 if (binary)
353 err = kgdb_ebin2mem(ptr, (char *)addr, length);
354 else
355 err = kgdb_hex2mem(ptr, (char *)addr, length);
356 if (err)
357 return err;
358 if (CACHE_FLUSH_IS_SAFE)
359 flush_icache_range(addr, addr + length);
360 return 0;
361 }
362
363 return -EINVAL;
364}
365
366static void error_packet(char *pkt, int error)
367{
368 error = -error;
369 pkt[0] = 'E';
370 pkt[1] = hex_asc[(error / 10)];
371 pkt[2] = hex_asc[(error % 10)];
372 pkt[3] = '\0';
373}
374
375/*
376 * Thread ID accessors. We represent a flat TID space to GDB, where
377 * the per CPU idle threads (which under Linux all have PID 0) are
378 * remapped to negative TIDs.
379 */
380
381#define BUF_THREAD_ID_SIZE 16
382
383static char *pack_threadid(char *pkt, unsigned char *id)
384{
385 char *limit;
386
387 limit = pkt + BUF_THREAD_ID_SIZE;
388 while (pkt < limit)
389 pkt = pack_hex_byte(pkt, *id++);
390
391 return pkt;
392}
393
394static void int_to_threadref(unsigned char *id, int value)
395{
396 unsigned char *scan;
397 int i = 4;
398
399 scan = (unsigned char *)id;
400 while (i--)
401 *scan++ = 0;
402 put_unaligned_be32(value, scan);
403}
404
405static struct task_struct *getthread(struct pt_regs *regs, int tid)
406{
407 /*
408 * Non-positive TIDs are remapped to the cpu shadow information
409 */
410 if (tid == 0 || tid == -1)
411 tid = -atomic_read(&kgdb_active) - 2;
412 if (tid < -1 && tid > -NR_CPUS - 2) {
413 if (kgdb_info[-tid - 2].task)
414 return kgdb_info[-tid - 2].task;
415 else
416 return idle_task(-tid - 2);
417 }
418 if (tid <= 0) {
419 printk(KERN_ERR "KGDB: Internal thread select error\n");
420 dump_stack();
421 return NULL;
422 }
423
424 /*
425 * find_task_by_pid_ns() does not take the tasklist lock anymore
426 * but is nicely RCU locked - hence is a pretty resilient
427 * thing to use:
428 */
429 return find_task_by_pid_ns(tid, &init_pid_ns);
430}
431
432
433/*
434 * Remap normal tasks to their real PID,
435 * CPU shadow threads are mapped to -CPU - 2
436 */
437static inline int shadow_pid(int realpid)
438{
439 if (realpid)
440 return realpid;
441
442 return -raw_smp_processor_id() - 2;
443}
444
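A quick worked example of the remapping implemented by shadow_pid() and getthread() above:

/* e.g. PID 1234 stays 1234; the idle task on CPU 0 is reported to GDB
 * as -2, on CPU 1 as -3, and so on. */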
445/*
446 * All the functions that start with gdb_cmd are the various
447 * operations to implement the handlers for the gdbserial protocol
448 * where KGDB is communicating with an external debugger
449 */
450
451/* Handle the '?' status packets */
452static void gdb_cmd_status(struct kgdb_state *ks)
453{
454 /*
455 * We know that this packet is only sent
456 * during initial connect. So to be safe,
457 * we clear out our breakpoints now in case
458 * GDB is reconnecting.
459 */
460 dbg_remove_all_break();
461
462 remcom_out_buffer[0] = 'S';
463 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
464}
465
466/* Handle the 'g' get registers request */
467static void gdb_cmd_getregs(struct kgdb_state *ks)
468{
469 struct task_struct *thread;
470 void *local_debuggerinfo;
471 int i;
472
473 thread = kgdb_usethread;
474 if (!thread) {
475 thread = kgdb_info[ks->cpu].task;
476 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
477 } else {
478 local_debuggerinfo = NULL;
479 for_each_online_cpu(i) {
480 /*
481			 * Try to find the task on some other
482			 * or possibly this node. If we do not
483			 * find the matching task then we try
484			 * to approximate the results.
485 */
486 if (thread == kgdb_info[i].task)
487 local_debuggerinfo = kgdb_info[i].debuggerinfo;
488 }
489 }
490
491 /*
492 * All threads that don't have debuggerinfo should be
493 * in schedule() sleeping, since all other CPUs
494 * are in kgdb_wait, and thus have debuggerinfo.
495 */
496 if (local_debuggerinfo) {
497 pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
498 } else {
499 /*
500 * Pull stuff saved during switch_to; nothing
501 * else is accessible (or even particularly
502 * relevant).
503 *
504 * This should be enough for a stack trace.
505 */
506 sleeping_thread_to_gdb_regs(gdb_regs, thread);
507 }
508 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
509}
510
511/* Handle the 'G' set registers request */
512static void gdb_cmd_setregs(struct kgdb_state *ks)
513{
514 kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
515
516 if (kgdb_usethread && kgdb_usethread != current) {
517 error_packet(remcom_out_buffer, -EINVAL);
518 } else {
519 gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
520 strcpy(remcom_out_buffer, "OK");
521 }
522}
523
524/* Handle the 'm' memory read bytes */
525static void gdb_cmd_memread(struct kgdb_state *ks)
526{
527 char *ptr = &remcom_in_buffer[1];
528 unsigned long length;
529 unsigned long addr;
530 int err;
531
532 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
533 kgdb_hex2long(&ptr, &length) > 0) {
534 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
535 if (err)
536 error_packet(remcom_out_buffer, err);
537 } else {
538 error_packet(remcom_out_buffer, -EINVAL);
539 }
540}
541
542/* Handle the 'M' memory write bytes */
543static void gdb_cmd_memwrite(struct kgdb_state *ks)
544{
545 int err = write_mem_msg(0);
546
547 if (err)
548 error_packet(remcom_out_buffer, err);
549 else
550 strcpy(remcom_out_buffer, "OK");
551}
552
553/* Handle the 'X' memory binary write bytes */
554static void gdb_cmd_binwrite(struct kgdb_state *ks)
555{
556 int err = write_mem_msg(1);
557
558 if (err)
559 error_packet(remcom_out_buffer, err);
560 else
561 strcpy(remcom_out_buffer, "OK");
562}
563
564/* Handle the 'D' or 'k', detach or kill packets */
565static void gdb_cmd_detachkill(struct kgdb_state *ks)
566{
567 int error;
568
569 /* The detach case */
570 if (remcom_in_buffer[0] == 'D') {
571 error = dbg_remove_all_break();
572 if (error < 0) {
573 error_packet(remcom_out_buffer, error);
574 } else {
575 strcpy(remcom_out_buffer, "OK");
576 kgdb_connected = 0;
577 }
578 put_packet(remcom_out_buffer);
579 } else {
580 /*
581 * Assume the kill case, with no exit code checking,
582 * trying to force detach the debugger:
583 */
584 dbg_remove_all_break();
585 kgdb_connected = 0;
586 }
587}
588
589/* Handle the 'R' reboot packets */
590static int gdb_cmd_reboot(struct kgdb_state *ks)
591{
592 /* For now, only honor R0 */
593 if (strcmp(remcom_in_buffer, "R0") == 0) {
594 printk(KERN_CRIT "Executing emergency reboot\n");
595 strcpy(remcom_out_buffer, "OK");
596 put_packet(remcom_out_buffer);
597
598 /*
599 * Execution should not return from
600 * machine_emergency_restart()
601 */
602 machine_emergency_restart();
603 kgdb_connected = 0;
604
605 return 1;
606 }
607 return 0;
608}
609
610/* Handle the 'q' query packets */
611static void gdb_cmd_query(struct kgdb_state *ks)
612{
613 struct task_struct *g;
614 struct task_struct *p;
615 unsigned char thref[8];
616 char *ptr;
617 int i;
618 int cpu;
619 int finished = 0;
620
621 switch (remcom_in_buffer[1]) {
622 case 's':
623 case 'f':
624 if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10))
625 break;
626
627 i = 0;
628 remcom_out_buffer[0] = 'm';
629 ptr = remcom_out_buffer + 1;
630 if (remcom_in_buffer[1] == 'f') {
631 /* Each cpu is a shadow thread */
632 for_each_online_cpu(cpu) {
633 ks->thr_query = 0;
634 int_to_threadref(thref, -cpu - 2);
635 pack_threadid(ptr, thref);
636 ptr += BUF_THREAD_ID_SIZE;
637 *(ptr++) = ',';
638 i++;
639 }
640 }
641
642 do_each_thread(g, p) {
643 if (i >= ks->thr_query && !finished) {
644 int_to_threadref(thref, p->pid);
645 pack_threadid(ptr, thref);
646 ptr += BUF_THREAD_ID_SIZE;
647 *(ptr++) = ',';
648 ks->thr_query++;
649 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
650 finished = 1;
651 }
652 i++;
653 } while_each_thread(g, p);
654
655 *(--ptr) = '\0';
656 break;
657
658 case 'C':
659 /* Current thread id */
660 strcpy(remcom_out_buffer, "QC");
661 ks->threadid = shadow_pid(current->pid);
662 int_to_threadref(thref, ks->threadid);
663 pack_threadid(remcom_out_buffer + 2, thref);
664 break;
665 case 'T':
666 if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16))
667 break;
668
669 ks->threadid = 0;
670 ptr = remcom_in_buffer + 17;
671 kgdb_hex2long(&ptr, &ks->threadid);
672 if (!getthread(ks->linux_regs, ks->threadid)) {
673 error_packet(remcom_out_buffer, -EINVAL);
674 break;
675 }
676 if ((int)ks->threadid > 0) {
677 kgdb_mem2hex(getthread(ks->linux_regs,
678 ks->threadid)->comm,
679 remcom_out_buffer, 16);
680 } else {
681 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
682
683 sprintf(tmpstr, "shadowCPU%d",
684 (int)(-ks->threadid - 2));
685 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
686 }
687 break;
688#ifdef CONFIG_KGDB_KDB
689 case 'R':
690 if (strncmp(remcom_in_buffer, "qRcmd,", 6) == 0) {
691 int len = strlen(remcom_in_buffer + 6);
692
693 if ((len % 2) != 0) {
694 strcpy(remcom_out_buffer, "E01");
695 break;
696 }
697 kgdb_hex2mem(remcom_in_buffer + 6,
698 remcom_out_buffer, len);
699 len = len / 2;
700 remcom_out_buffer[len++] = 0;
701
702 kdb_parse(remcom_out_buffer);
703 strcpy(remcom_out_buffer, "OK");
704 }
705 break;
706#endif
707 }
708}
709
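For illustration, the qfThreadInfo reply built in the 's'/'f' case above is the letter 'm' followed by a comma-separated list of thread references, each one the hex encoding of the 8-byte value produced by int_to_threadref() (four zero bytes, then the ID as a big-endian 32-bit value). The 16-hex-character width below is an assumption that matches that 8-byte reference; this is a user-space sketch, not kernel code:

#include <stdio.h>
#include <string.h>

static void append_tid(char *buf, int tid)
{
	char id[18];

	/* four zero bytes, then the ID as big-endian 32 bits, hex encoded */
	snprintf(id, sizeof(id), "00000000%08x,", (unsigned int)tid);
	strcat(buf, id);
}

int main(void)
{
	char reply[128] = "m";

	append_tid(reply, -2);			/* shadow thread for CPU 0 */
	append_tid(reply, 1);			/* PID 1 */
	reply[strlen(reply) - 1] = '\0';	/* drop the trailing ',' */
	printf("%s\n", reply);			/* m00000000fffffffe,0000000000000001 */
	return 0;
}
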
710/* Handle the 'H' task query packets */
711static void gdb_cmd_task(struct kgdb_state *ks)
712{
713 struct task_struct *thread;
714 char *ptr;
715
716 switch (remcom_in_buffer[1]) {
717 case 'g':
718 ptr = &remcom_in_buffer[2];
719 kgdb_hex2long(&ptr, &ks->threadid);
720 thread = getthread(ks->linux_regs, ks->threadid);
721 if (!thread && ks->threadid > 0) {
722 error_packet(remcom_out_buffer, -EINVAL);
723 break;
724 }
725 kgdb_usethread = thread;
726 ks->kgdb_usethreadid = ks->threadid;
727 strcpy(remcom_out_buffer, "OK");
728 break;
729 case 'c':
730 ptr = &remcom_in_buffer[2];
731 kgdb_hex2long(&ptr, &ks->threadid);
732 if (!ks->threadid) {
733 kgdb_contthread = NULL;
734 } else {
735 thread = getthread(ks->linux_regs, ks->threadid);
736 if (!thread && ks->threadid > 0) {
737 error_packet(remcom_out_buffer, -EINVAL);
738 break;
739 }
740 kgdb_contthread = thread;
741 }
742 strcpy(remcom_out_buffer, "OK");
743 break;
744 }
745}
746
747/* Handle the 'T' thread query packets */
748static void gdb_cmd_thread(struct kgdb_state *ks)
749{
750 char *ptr = &remcom_in_buffer[1];
751 struct task_struct *thread;
752
753 kgdb_hex2long(&ptr, &ks->threadid);
754 thread = getthread(ks->linux_regs, ks->threadid);
755 if (thread)
756 strcpy(remcom_out_buffer, "OK");
757 else
758 error_packet(remcom_out_buffer, -EINVAL);
759}
760
761/* Handle the 'z' or 'Z' breakpoint remove or set packets */
762static void gdb_cmd_break(struct kgdb_state *ks)
763{
764 /*
765 * Since GDB 5.3 the remote protocol has defined '0' as a software
766 * breakpoint and '1' as a hardware breakpoint, so follow that.
767 */
768 char *bpt_type = &remcom_in_buffer[1];
769 char *ptr = &remcom_in_buffer[2];
770 unsigned long addr;
771 unsigned long length;
772 int error = 0;
773
774 if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
775 /* Unsupported */
776 if (*bpt_type > '4')
777 return;
778 } else {
779 if (*bpt_type != '0' && *bpt_type != '1')
780 /* Unsupported. */
781 return;
782 }
783
784 /*
785 * Test if this is a hardware breakpoint, and
786 * if we support it:
787 */
788 if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
789 /* Unsupported. */
790 return;
791
792 if (*(ptr++) != ',') {
793 error_packet(remcom_out_buffer, -EINVAL);
794 return;
795 }
796 if (!kgdb_hex2long(&ptr, &addr)) {
797 error_packet(remcom_out_buffer, -EINVAL);
798 return;
799 }
800 if (*(ptr++) != ',' ||
801 !kgdb_hex2long(&ptr, &length)) {
802 error_packet(remcom_out_buffer, -EINVAL);
803 return;
804 }
805
806 if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
807 error = dbg_set_sw_break(addr);
808 else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
809 error = dbg_remove_sw_break(addr);
810 else if (remcom_in_buffer[0] == 'Z')
811 error = arch_kgdb_ops.set_hw_breakpoint(addr,
812 (int)length, *bpt_type - '0');
813 else if (remcom_in_buffer[0] == 'z')
814 error = arch_kgdb_ops.remove_hw_breakpoint(addr,
815 (int) length, *bpt_type - '0');
816
817 if (error == 0)
818 strcpy(remcom_out_buffer, "OK");
819 else
820 error_packet(remcom_out_buffer, error);
821}
822
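The packets parsed above take the form Z<type>,<addr>,<length> to set a breakpoint and z<type>,<addr>,<length> to remove one, with the address and length in hex and type '0'/'1' selecting a software/hardware breakpoint. A hypothetical host-side encoder is sketched below purely as illustration; the kernel decodes such packets with kgdb_hex2long() as shown above:

#include <stdio.h>

static void encode_break(char *buf, size_t len, int set, int type,
			 unsigned long addr, unsigned long length)
{
	snprintf(buf, len, "%c%d,%lx,%lx",
		 set ? 'Z' : 'z', type, addr, length);
}

int main(void)
{
	char buf[64];

	encode_break(buf, sizeof(buf), 1, 0, 0xc01231f0UL, 1);
	printf("%s\n", buf);	/* Z0,c01231f0,1 - set a sw breakpoint */
	encode_break(buf, sizeof(buf), 0, 0, 0xc01231f0UL, 1);
	printf("%s\n", buf);	/* z0,c01231f0,1 - remove it again */
	return 0;
}
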
823/* Handle the 'C' signal / exception passing packets */
824static int gdb_cmd_exception_pass(struct kgdb_state *ks)
825{
826 /* C09 == pass exception
827 * C15 == detach kgdb, pass exception
828 */
829 if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
830
831 ks->pass_exception = 1;
832 remcom_in_buffer[0] = 'c';
833
834 } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
835
836 ks->pass_exception = 1;
837 remcom_in_buffer[0] = 'D';
838 dbg_remove_all_break();
839 kgdb_connected = 0;
840 return 1;
841
842 } else {
843 gdbstub_msg_write("KGDB only knows signal 9 (pass)"
844 " and 15 (pass and disconnect)\n"
845 "Executing a continue without signal passing\n", 0);
846 remcom_in_buffer[0] = 'c';
847 }
848
849 /* Indicate fall through */
850 return -1;
851}
852
853/*
854 * This function performs all gdbserial command processing
855 */
856int gdb_serial_stub(struct kgdb_state *ks)
857{
858 int error = 0;
859 int tmp;
860
861 /* Clear the out buffer. */
862 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
863
864 if (kgdb_connected) {
865 unsigned char thref[8];
866 char *ptr;
867
868 /* Reply to host that an exception has occurred */
869 ptr = remcom_out_buffer;
870 *ptr++ = 'T';
871 ptr = pack_hex_byte(ptr, ks->signo);
872 ptr += strlen(strcpy(ptr, "thread:"));
873 int_to_threadref(thref, shadow_pid(current->pid));
874 ptr = pack_threadid(ptr, thref);
875 *ptr++ = ';';
876 put_packet(remcom_out_buffer);
877 }
878
879 kgdb_usethread = kgdb_info[ks->cpu].task;
880 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
881 ks->pass_exception = 0;
882
883 while (1) {
884 error = 0;
885
886 /* Clear the out buffer. */
887 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
888
889 get_packet(remcom_in_buffer);
890
891 switch (remcom_in_buffer[0]) {
892 case '?': /* gdbserial status */
893 gdb_cmd_status(ks);
894 break;
895 case 'g': /* return the value of the CPU registers */
896 gdb_cmd_getregs(ks);
897 break;
898 case 'G': /* set the value of the CPU registers - return OK */
899 gdb_cmd_setregs(ks);
900 break;
901 case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
902 gdb_cmd_memread(ks);
903 break;
904 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
905 gdb_cmd_memwrite(ks);
906 break;
907 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
908 gdb_cmd_binwrite(ks);
909 break;
910 /* kill or detach. KGDB should treat this like a
911 * continue.
912 */
913 case 'D': /* Debugger detach */
914 case 'k': /* Debugger detach via kill */
915 gdb_cmd_detachkill(ks);
916 goto default_handle;
917 case 'R': /* Reboot */
918 if (gdb_cmd_reboot(ks))
919 goto default_handle;
920 break;
921 case 'q': /* query command */
922 gdb_cmd_query(ks);
923 break;
924 case 'H': /* task related */
925 gdb_cmd_task(ks);
926 break;
927 case 'T': /* Query thread status */
928 gdb_cmd_thread(ks);
929 break;
930 case 'z': /* Break point remove */
931 case 'Z': /* Break point set */
932 gdb_cmd_break(ks);
933 break;
934#ifdef CONFIG_KGDB_KDB
935 case '3': /* Escape back into kdb */
936 if (remcom_in_buffer[1] == '\0') {
937 gdb_cmd_detachkill(ks);
938 return DBG_PASS_EVENT;
939 }
940#endif
941 case 'C': /* Exception passing */
942 tmp = gdb_cmd_exception_pass(ks);
943 if (tmp > 0)
944 goto default_handle;
945 if (tmp == 0)
946 break;
947 /* Fall through on tmp < 0 */
948 case 'c': /* Continue packet */
949 case 's': /* Single step packet */
950 if (kgdb_contthread && kgdb_contthread != current) {
951 /* Can't switch threads in kgdb */
952 error_packet(remcom_out_buffer, -EINVAL);
953 break;
954 }
955 dbg_activate_sw_breakpoints();
956 /* Fall through to default processing */
957 default:
958default_handle:
959 error = kgdb_arch_handle_exception(ks->ex_vector,
960 ks->signo,
961 ks->err_code,
962 remcom_in_buffer,
963 remcom_out_buffer,
964 ks->linux_regs);
965 /*
966 * Leave cmd processing on error, detach,
967 * kill, continue, or single step.
968 */
969 if (error >= 0 || remcom_in_buffer[0] == 'D' ||
970 remcom_in_buffer[0] == 'k') {
971 error = 0;
972 goto kgdb_exit;
973 }
974
975 }
976
977 /* reply to the request */
978 put_packet(remcom_out_buffer);
979 }
980
981kgdb_exit:
982 if (ks->pass_exception)
983 error = 1;
984 return error;
985}
986
987int gdbstub_state(struct kgdb_state *ks, char *cmd)
988{
989 int error;
990
991 switch (cmd[0]) {
992 case 'e':
993 error = kgdb_arch_handle_exception(ks->ex_vector,
994 ks->signo,
995 ks->err_code,
996 remcom_in_buffer,
997 remcom_out_buffer,
998 ks->linux_regs);
999 return error;
1000 case 's':
1001 case 'c':
1002 strcpy(remcom_in_buffer, cmd);
1003 return 0;
1004 case '?':
1005 gdb_cmd_status(ks);
1006 break;
1007 case '\0':
1008 strcpy(remcom_out_buffer, "");
1009 break;
1010 }
1011 dbg_io_ops->write_char('+');
1012 put_packet(remcom_out_buffer);
1013 return 0;
1014}
diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore
new file mode 100644
index 000000000000..396d12eda9e8
--- /dev/null
+++ b/kernel/debug/kdb/.gitignore
@@ -0,0 +1 @@
gen-kdb_cmds.c
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile
new file mode 100644
index 000000000000..d4fc58f4b88d
--- /dev/null
+++ b/kernel/debug/kdb/Makefile
@@ -0,0 +1,25 @@
1# This file is subject to the terms and conditions of the GNU General Public
2# License. See the file "COPYING" in the main directory of this archive
3# for more details.
4#
5# Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
6# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
7#
8
9CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p')
10obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o
11obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o
12
13clean-files := gen-kdb_cmds.c
14
15quiet_cmd_gen-kdb = GENKDB $@
16 cmd_gen-kdb = $(AWK) 'BEGIN {print "\#include <linux/stddef.h>"; print "\#include <linux/init.h>"} \
17 /^\#/{next} \
18 /^[ \t]*$$/{next} \
19 {gsub(/"/, "\\\"", $$0); \
20 print "static __initdata char kdb_cmd" cmds++ "[] = \"" $$0 "\\n\";"} \
21 END {print "extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {"; for (i = 0; i < cmds; ++i) {print " kdb_cmd" i ","}; print(" NULL\n};");}' \
22 $(filter-out %/Makefile,$^) > $@#
23
24$(obj)/gen-kdb_cmds.c: $(src)/kdb_cmds $(src)/Makefile
25 $(call cmd,gen-kdb)
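For illustration, a non-comment kdb_cmds line such as "set LINES 10000" would make the awk rule above emit roughly the following C into gen-kdb_cmds.c: one kdb_cmdN[] string per input line, collected into the NULL-terminated kdb_cmds[] array that is replayed at kdb_init() time (see the kdb_cmds header further down):

#include <linux/stddef.h>
#include <linux/init.h>
static __initdata char kdb_cmd0[] = "set LINES 10000\n";
extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {
  kdb_cmd0,
  NULL
};
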
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
new file mode 100644
index 000000000000..75bd9b3ebbb7
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -0,0 +1,564 @@
1/*
2 * Kernel Debugger Architecture Independent Breakpoint Handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/string.h>
13#include <linux/kernel.h>
14#include <linux/init.h>
15#include <linux/kdb.h>
16#include <linux/kgdb.h>
17#include <linux/smp.h>
18#include <linux/sched.h>
19#include <linux/interrupt.h>
20#include "kdb_private.h"
21
22/*
23 * Table of kdb_breakpoints
24 */
25kdb_bp_t kdb_breakpoints[KDB_MAXBPT];
26
27static void kdb_setsinglestep(struct pt_regs *regs)
28{
29 KDB_STATE_SET(DOING_SS);
30}
31
32static char *kdb_rwtypes[] = {
33 "Instruction(i)",
34 "Instruction(Register)",
35 "Data Write",
36 "I/O",
37 "Data Access"
38};
39
40static char *kdb_bptype(kdb_bp_t *bp)
41{
42 if (bp->bp_type < 0 || bp->bp_type > 4)
43 return "";
44
45 return kdb_rwtypes[bp->bp_type];
46}
47
48static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp)
49{
50 int nextarg = *nextargp;
51 int diag;
52
53 bp->bph_length = 1;
54 if ((argc + 1) != nextarg) {
55 if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0)
56 bp->bp_type = BP_ACCESS_WATCHPOINT;
57 else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
58 bp->bp_type = BP_WRITE_WATCHPOINT;
59 else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0)
60 bp->bp_type = BP_HARDWARE_BREAKPOINT;
61 else
62 return KDB_ARGCOUNT;
63
64 bp->bph_length = 1;
65
66 nextarg++;
67
68 if ((argc + 1) != nextarg) {
69 unsigned long len;
70
71 diag = kdbgetularg((char *)argv[nextarg],
72 &len);
73 if (diag)
74 return diag;
75
76
77 if (len > 8)
78 return KDB_BADLENGTH;
79
80 bp->bph_length = len;
81 nextarg++;
82 }
83
84 if ((argc + 1) != nextarg)
85 return KDB_ARGCOUNT;
86 }
87
88 *nextargp = nextarg;
89 return 0;
90}
91
92static int _kdb_bp_remove(kdb_bp_t *bp)
93{
94 int ret = 1;
95 if (!bp->bp_installed)
96 return ret;
97 if (!bp->bp_type)
98 ret = dbg_remove_sw_break(bp->bp_addr);
99 else
100 ret = arch_kgdb_ops.remove_hw_breakpoint(bp->bp_addr,
101 bp->bph_length,
102 bp->bp_type);
103 if (ret == 0)
104 bp->bp_installed = 0;
105 return ret;
106}
107
108static void kdb_handle_bp(struct pt_regs *regs, kdb_bp_t *bp)
109{
110 if (KDB_DEBUG(BP))
111 kdb_printf("regs->ip = 0x%lx\n", instruction_pointer(regs));
112
113 /*
114 * Setup single step
115 */
116 kdb_setsinglestep(regs);
117
118 /*
119 * Reset delay attribute
120 */
121 bp->bp_delay = 0;
122 bp->bp_delayed = 1;
123}
124
125static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
126{
127 int ret;
128 /*
129 * Install the breakpoint, if it is not already installed.
130 */
131
132 if (KDB_DEBUG(BP))
133 kdb_printf("%s: bp_installed %d\n",
134 __func__, bp->bp_installed);
135 if (!KDB_STATE(SSBPT))
136 bp->bp_delay = 0;
137 if (bp->bp_installed)
138 return 1;
139 if (bp->bp_delay || (bp->bp_delayed && KDB_STATE(DOING_SS))) {
140 if (KDB_DEBUG(BP))
141 kdb_printf("%s: delayed bp\n", __func__);
142 kdb_handle_bp(regs, bp);
143 return 0;
144 }
145 if (!bp->bp_type)
146 ret = dbg_set_sw_break(bp->bp_addr);
147 else
148 ret = arch_kgdb_ops.set_hw_breakpoint(bp->bp_addr,
149 bp->bph_length,
150 bp->bp_type);
151 if (ret == 0) {
152 bp->bp_installed = 1;
153 } else {
154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
155 __func__, bp->bp_addr);
156 return 1;
157 }
158 return 0;
159}
160
161/*
162 * kdb_bp_install
163 *
164 * Install kdb_breakpoints prior to returning from the
165 * kernel debugger. This allows the kdb_breakpoints to be set
166 * upon functions that are used internally by kdb, such as
167 * printk(). This function is only called once per kdb session.
168 */
169void kdb_bp_install(struct pt_regs *regs)
170{
171 int i;
172
173 for (i = 0; i < KDB_MAXBPT; i++) {
174 kdb_bp_t *bp = &kdb_breakpoints[i];
175
176 if (KDB_DEBUG(BP)) {
177 kdb_printf("%s: bp %d bp_enabled %d\n",
178 __func__, i, bp->bp_enabled);
179 }
180 if (bp->bp_enabled)
181 _kdb_bp_install(regs, bp);
182 }
183}
184
185/*
186 * kdb_bp_remove
187 *
188 * Remove kdb_breakpoints upon entry to the kernel debugger.
189 *
190 * Parameters:
191 * None.
192 * Outputs:
193 * None.
194 * Returns:
195 * None.
196 * Locking:
197 * None.
198 * Remarks:
199 */
200void kdb_bp_remove(void)
201{
202 int i;
203
204 for (i = KDB_MAXBPT - 1; i >= 0; i--) {
205 kdb_bp_t *bp = &kdb_breakpoints[i];
206
207 if (KDB_DEBUG(BP)) {
208 kdb_printf("%s: bp %d bp_enabled %d\n",
209 __func__, i, bp->bp_enabled);
210 }
211 if (bp->bp_enabled)
212 _kdb_bp_remove(bp);
213 }
214}
215
216
217/*
218 * kdb_printbp
219 *
220 * Internal function to format and print a breakpoint entry.
221 *
222 * Parameters:
223 * None.
224 * Outputs:
225 * None.
226 * Returns:
227 * None.
228 * Locking:
229 * None.
230 * Remarks:
231 */
232
233static void kdb_printbp(kdb_bp_t *bp, int i)
234{
235 kdb_printf("%s ", kdb_bptype(bp));
236 kdb_printf("BP #%d at ", i);
237 kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT);
238
239 if (bp->bp_enabled)
240 kdb_printf("\n is enabled");
241 else
242 kdb_printf("\n is disabled");
243
244 kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n",
245 bp->bp_addr, bp->bp_type, bp->bp_installed);
246
247 kdb_printf("\n");
248}
249
250/*
251 * kdb_bp
252 *
253 * Handle the bp commands.
254 *
255 * [bp|bph] <addr-expression> [DATAR|DATAW]
256 *
257 * Parameters:
258 * argc Count of arguments in argv
259 * argv Space delimited command line arguments
260 * Outputs:
261 * None.
262 * Returns:
263 * Zero for success, a kdb diagnostic if failure.
264 * Locking:
265 * None.
266 * Remarks:
267 *
268 * bp Set breakpoint on all cpus. Only use hardware assist if needed.
269 * bph Set breakpoint on all cpus. Force use of a hardware register.
270 */
271
272static int kdb_bp(int argc, const char **argv)
273{
274 int i, bpno;
275 kdb_bp_t *bp, *bp_check;
276 int diag;
277 int free;
278 char *symname = NULL;
279 long offset = 0ul;
280 int nextarg;
281 kdb_bp_t template = {0};
282
283 if (argc == 0) {
284 /*
285 * Display breakpoint table
286 */
287 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT;
288 bpno++, bp++) {
289 if (bp->bp_free)
290 continue;
291 kdb_printbp(bp, bpno);
292 }
293
294 return 0;
295 }
296
297 nextarg = 1;
298 diag = kdbgetaddrarg(argc, argv, &nextarg, &template.bp_addr,
299 &offset, &symname);
300 if (diag)
301 return diag;
302 if (!template.bp_addr)
303 return KDB_BADINT;
304
305 /*
306 * Find an empty bp structure to allocate
307 */
308 free = KDB_MAXBPT;
309 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
310 if (bp->bp_free)
311 break;
312 }
313
314 if (bpno == KDB_MAXBPT)
315 return KDB_TOOMANYBPT;
316
317 if (strcmp(argv[0], "bph") == 0) {
318 template.bp_type = BP_HARDWARE_BREAKPOINT;
319 diag = kdb_parsebp(argc, argv, &nextarg, &template);
320 if (diag)
321 return diag;
322 } else {
323 template.bp_type = BP_BREAKPOINT;
324 }
325
326 /*
327 * Check for clashing breakpoints.
328 *
329 * Note, in this design we can't have hardware breakpoints
330 * enabled for both read and write on the same address.
331 */
332 for (i = 0, bp_check = kdb_breakpoints; i < KDB_MAXBPT;
333 i++, bp_check++) {
334 if (!bp_check->bp_free &&
335 bp_check->bp_addr == template.bp_addr) {
336 kdb_printf("You already have a breakpoint at "
337 kdb_bfd_vma_fmt0 "\n", template.bp_addr);
338 return KDB_DUPBPT;
339 }
340 }
341
342 template.bp_enabled = 1;
343
344 /*
345 * Actually allocate the breakpoint found earlier
346 */
347 *bp = template;
348 bp->bp_free = 0;
349
350 kdb_printbp(bp, bpno);
351
352 return 0;
353}
354
355/*
356 * kdb_bc
357 *
358 * Handles the 'bc', 'be', and 'bd' commands
359 *
360 * [bd|bc|be] <breakpoint-number>
361 * [bd|bc|be] *
362 *
363 * Parameters:
364 * argc Count of arguments in argv
365 * argv Space delimited command line arguments
366 * Outputs:
367 * None.
368 * Returns:
369 * Zero for success, a kdb diagnostic for failure
370 * Locking:
371 * None.
372 * Remarks:
373 */
374static int kdb_bc(int argc, const char **argv)
375{
376 unsigned long addr;
377 kdb_bp_t *bp = NULL;
378 int lowbp = KDB_MAXBPT;
379 int highbp = 0;
380 int done = 0;
381 int i;
382 int diag = 0;
383
384 int cmd; /* KDBCMD_B? */
385#define KDBCMD_BC 0
386#define KDBCMD_BE 1
387#define KDBCMD_BD 2
388
389 if (strcmp(argv[0], "be") == 0)
390 cmd = KDBCMD_BE;
391 else if (strcmp(argv[0], "bd") == 0)
392 cmd = KDBCMD_BD;
393 else
394 cmd = KDBCMD_BC;
395
396 if (argc != 1)
397 return KDB_ARGCOUNT;
398
399 if (strcmp(argv[1], "*") == 0) {
400 lowbp = 0;
401 highbp = KDB_MAXBPT;
402 } else {
403 diag = kdbgetularg(argv[1], &addr);
404 if (diag)
405 return diag;
406
407 /*
408 * For addresses less than the maximum breakpoint number,
409 * assume that the breakpoint number is desired.
410 */
411 if (addr < KDB_MAXBPT) {
412 bp = &kdb_breakpoints[addr];
413 lowbp = highbp = addr;
414 highbp++;
415 } else {
416 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT;
417 i++, bp++) {
418 if (bp->bp_addr == addr) {
419 lowbp = highbp = i;
420 highbp++;
421 break;
422 }
423 }
424 }
425 }
426
427 /*
428 * Now operate on the set of breakpoints matching the input
429 * criteria (either '*' for all, or an individual breakpoint).
430 */
431 for (bp = &kdb_breakpoints[lowbp], i = lowbp;
432 i < highbp;
433 i++, bp++) {
434 if (bp->bp_free)
435 continue;
436
437 done++;
438
439 switch (cmd) {
440 case KDBCMD_BC:
441 bp->bp_enabled = 0;
442
443 kdb_printf("Breakpoint %d at "
444 kdb_bfd_vma_fmt " cleared\n",
445 i, bp->bp_addr);
446
447 bp->bp_addr = 0;
448 bp->bp_free = 1;
449
450 break;
451 case KDBCMD_BE:
452 bp->bp_enabled = 1;
453
454 kdb_printf("Breakpoint %d at "
455 kdb_bfd_vma_fmt " enabled",
456 i, bp->bp_addr);
457
458 kdb_printf("\n");
459 break;
460 case KDBCMD_BD:
461 if (!bp->bp_enabled)
462 break;
463
464 bp->bp_enabled = 0;
465
466 kdb_printf("Breakpoint %d at "
467 kdb_bfd_vma_fmt " disabled\n",
468 i, bp->bp_addr);
469
470 break;
471 }
472 if (bp->bp_delay && (cmd == KDBCMD_BC || cmd == KDBCMD_BD)) {
473 bp->bp_delay = 0;
474 KDB_STATE_CLEAR(SSBPT);
475 }
476 }
477
478 return (!done) ? KDB_BPTNOTFOUND : 0;
479}
480
481/*
482 * kdb_ss
483 *
484 * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch)
485 * commands.
486 *
487 * ss
488 * ssb
489 *
490 * Parameters:
491 * argc Argument count
492 * argv Argument vector
493 * Outputs:
494 * None.
495 * Returns:
496 * KDB_CMD_SS[B] for success, a kdb error if failure.
497 * Locking:
498 * None.
499 * Remarks:
500 *
501 * Set the arch specific option to trigger a debug trap after the next
502 * instruction.
503 *
504 * For 'ssb', set the trace flag in the debug trap handler
505 * after printing the current insn and return directly without
506 * invoking the kdb command processor, until a branch instruction
507 * is encountered.
508 */
509
510static int kdb_ss(int argc, const char **argv)
511{
512 int ssb = 0;
513
514 ssb = (strcmp(argv[0], "ssb") == 0);
515 if (argc != 0)
516 return KDB_ARGCOUNT;
517 /*
518 * Set trace flag and go.
519 */
520 KDB_STATE_SET(DOING_SS);
521 if (ssb) {
522 KDB_STATE_SET(DOING_SSB);
523 return KDB_CMD_SSB;
524 }
525 return KDB_CMD_SS;
526}
527
528/* Initialize the breakpoint table and register breakpoint commands. */
529
530void __init kdb_initbptab(void)
531{
532 int i;
533 kdb_bp_t *bp;
534
535 /*
536 * First time initialization.
537 */
538 memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints));
539
540 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
541 bp->bp_free = 1;
542
543 kdb_register_repeat("bp", kdb_bp, "[<vaddr>]",
544 "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
545 kdb_register_repeat("bl", kdb_bp, "[<vaddr>]",
546 "Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
547 if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
548 kdb_register_repeat("bph", kdb_bp, "[<vaddr>]",
549 "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS);
550 kdb_register_repeat("bc", kdb_bc, "<bpnum>",
551 "Clear Breakpoint", 0, KDB_REPEAT_NONE);
552 kdb_register_repeat("be", kdb_bc, "<bpnum>",
553 "Enable Breakpoint", 0, KDB_REPEAT_NONE);
554 kdb_register_repeat("bd", kdb_bc, "<bpnum>",
555 "Disable Breakpoint", 0, KDB_REPEAT_NONE);
556
557 kdb_register_repeat("ss", kdb_ss, "",
558 "Single Step", 1, KDB_REPEAT_NO_ARGS);
559 kdb_register_repeat("ssb", kdb_ss, "",
560 "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS);
561 /*
562 * Architecture dependent initialization.
563 */
564}
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
new file mode 100644
index 000000000000..2f62fe85f16a
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -0,0 +1,210 @@
1/*
2 * Kernel Debugger Architecture Independent Stack Traceback
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/ctype.h>
13#include <linux/string.h>
14#include <linux/kernel.h>
15#include <linux/sched.h>
16#include <linux/kdb.h>
17#include <linux/nmi.h>
18#include <asm/system.h>
19#include "kdb_private.h"
20
21
22static void kdb_show_stack(struct task_struct *p, void *addr)
23{
24 int old_lvl = console_loglevel;
25 console_loglevel = 15;
26 kdb_trap_printk++;
27 kdb_set_current_task(p);
28 if (addr) {
29 show_stack((struct task_struct *)p, addr);
30 } else if (kdb_current_regs) {
31#ifdef CONFIG_X86
32 show_stack(p, &kdb_current_regs->sp);
33#else
34 show_stack(p, NULL);
35#endif
36 } else {
37 show_stack(p, NULL);
38 }
39 console_loglevel = old_lvl;
40 kdb_trap_printk--;
41}
42
43/*
44 * kdb_bt
45 *
46 * This function implements the 'bt' command. Print a stack
47 * traceback.
48 *
49 * bt [<address-expression>] (addr-exp is for alternate stacks)
50 * btp <pid> Kernel stack for <pid>
51 * btt <address-expression> Kernel stack for task structure at
52 * <address-expression>
53 * bta [DRSTCZEUIMA] All useful processes, optionally
54 * filtered by state
55 * btc [<cpu>] The current process on one cpu,
56 * default is all cpus
57 *
58 * bt <address-expression> refers to an address on the stack; that location
59 * is assumed to contain a return address.
60 *
61 * btt <address-expression> refers to the address of a struct task.
62 *
63 * Inputs:
64 * argc argument count
65 * argv argument vector
66 * Outputs:
67 * None.
68 * Returns:
69 * zero for success, a kdb diagnostic if error
70 * Locking:
71 * none.
72 * Remarks:
73 * Backtraces work best when the code uses frame pointers. But even
74 * without frame pointers we should get a reasonable trace.
75 *
76 * mds comes in handy when examining the stack to do a manual traceback or
77 * to get a starting point for bt <address-expression>.
78 */
79
80static int
81kdb_bt1(struct task_struct *p, unsigned long mask,
82 int argcount, int btaprompt)
83{
84 char buffer[2];
85 if (kdb_getarea(buffer[0], (unsigned long)p) ||
86 kdb_getarea(buffer[0], (unsigned long)(p+1)-1))
87 return KDB_BADADDR;
88 if (!kdb_task_state(p, mask))
89 return 0;
90 kdb_printf("Stack traceback for pid %d\n", p->pid);
91 kdb_ps1(p);
92 kdb_show_stack(p, NULL);
93 if (btaprompt) {
94 kdb_getstr(buffer, sizeof(buffer),
95 "Enter <q> to end, <cr> to continue:");
96 if (buffer[0] == 'q') {
97 kdb_printf("\n");
98 return 1;
99 }
100 }
101 touch_nmi_watchdog();
102 return 0;
103}
104
105int
106kdb_bt(int argc, const char **argv)
107{
108 int diag;
109 int argcount = 5;
110 int btaprompt = 1;
111 int nextarg;
112 unsigned long addr;
113 long offset;
114
115 kdbgetintenv("BTARGS", &argcount); /* Arguments to print */
116 kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each
117 * proc in bta */
118
119 if (strcmp(argv[0], "bta") == 0) {
120 struct task_struct *g, *p;
121 unsigned long cpu;
122 unsigned long mask = kdb_task_state_string(argc ? argv[1] :
123 NULL);
124 if (argc == 0)
125 kdb_ps_suppressed();
126 /* Run the active tasks first */
127 for_each_online_cpu(cpu) {
128 p = kdb_curr_task(cpu);
129 if (kdb_bt1(p, mask, argcount, btaprompt))
130 return 0;
131 }
132 /* Now the inactive tasks */
133 kdb_do_each_thread(g, p) {
134 if (task_curr(p))
135 continue;
136 if (kdb_bt1(p, mask, argcount, btaprompt))
137 return 0;
138 } kdb_while_each_thread(g, p);
139 } else if (strcmp(argv[0], "btp") == 0) {
140 struct task_struct *p;
141 unsigned long pid;
142 if (argc != 1)
143 return KDB_ARGCOUNT;
144 diag = kdbgetularg((char *)argv[1], &pid);
145 if (diag)
146 return diag;
147 p = find_task_by_pid_ns(pid, &init_pid_ns);
148 if (p) {
149 kdb_set_current_task(p);
150 return kdb_bt1(p, ~0UL, argcount, 0);
151 }
152 kdb_printf("No process with pid == %ld found\n", pid);
153 return 0;
154 } else if (strcmp(argv[0], "btt") == 0) {
155 if (argc != 1)
156 return KDB_ARGCOUNT;
157 diag = kdbgetularg((char *)argv[1], &addr);
158 if (diag)
159 return diag;
160 kdb_set_current_task((struct task_struct *)addr);
161 return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0);
162 } else if (strcmp(argv[0], "btc") == 0) {
163 unsigned long cpu = ~0;
164 struct task_struct *save_current_task = kdb_current_task;
165 char buf[80];
166 if (argc > 1)
167 return KDB_ARGCOUNT;
168 if (argc == 1) {
169 diag = kdbgetularg((char *)argv[1], &cpu);
170 if (diag)
171 return diag;
172 }
173 /* Recursive use of kdb_parse, do not use argv after
174 * this point */
175 argv = NULL;
176 if (cpu != ~0) {
177 if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
178 kdb_printf("no process for cpu %ld\n", cpu);
179 return 0;
180 }
181 sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
182 kdb_parse(buf);
183 return 0;
184 }
185 kdb_printf("btc: cpu status: ");
186 kdb_parse("cpu\n");
187 for_each_online_cpu(cpu) {
188 sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
189 kdb_parse(buf);
190 touch_nmi_watchdog();
191 }
192 kdb_set_current_task(save_current_task);
193 return 0;
194 } else {
195 if (argc) {
196 nextarg = 1;
197 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
198 &offset, NULL);
199 if (diag)
200 return diag;
201 kdb_show_stack(kdb_current_task, (void *)addr);
202 return 0;
203 } else {
204 return kdb_bt1(kdb_current_task, ~0UL, argcount, 0);
205 }
206 }
207
208 /* NOTREACHED */
209 return 0;
210}
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
new file mode 100644
index 000000000000..56c88e4db309
--- /dev/null
+++ b/kernel/debug/kdb/kdb_cmds
@@ -0,0 +1,35 @@
1# Initial commands for kdb, alter to suit your needs.
2# These commands are executed in kdb_init() context, no SMP, no
3# processes. Commands that require process data (including stack or
4# registers) are not reliable this early. set and bp commands should
5# be safe. Global breakpoint commands affect each cpu as it is booted.
6
7# Standard debugging information for first level support, just type dumpall
8# or dumpcpu or dumpcommon at the kdb prompt.
9
10defcmd dumpcommon "" "Common kdb debugging"
11 set BTAPROMPT 0
12 set LINES 10000
13 -summary
14 -cpu
15 -ps
16 -dmesg 600
17 -bt
18endefcmd
19
20defcmd dumpall "" "First line debugging"
21 set BTSYMARG 1
22 set BTARGS 9
23 pid R
24 -dumpcommon
25 -bta
26endefcmd
27
28defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
29 set BTSYMARG 1
30 set BTARGS 9
31 pid R
32 -dumpcommon
33 -btc
34endefcmd
35
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
new file mode 100644
index 000000000000..bf6e8270e957
--- /dev/null
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -0,0 +1,169 @@
1/*
2 * Created by: Jason Wessel <jason.wessel@windriver.com>
3 *
4 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
5 *
6 * This file is licensed under the terms of the GNU General Public
7 * License version 2. This program is licensed "as is" without any
8 * warranty of any kind, whether express or implied.
9 */
10
11#include <linux/kgdb.h>
12#include <linux/kdb.h>
13#include <linux/kdebug.h>
14#include "kdb_private.h"
15#include "../debug_core.h"
16
17/*
18 * KDB interface to KGDB internals
19 */
20get_char_func kdb_poll_funcs[] = {
21 dbg_io_get_char,
22 NULL,
23 NULL,
24 NULL,
25 NULL,
26 NULL,
27};
28EXPORT_SYMBOL_GPL(kdb_poll_funcs);
29
30int kdb_poll_idx = 1;
31EXPORT_SYMBOL_GPL(kdb_poll_idx);
32
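The exported poll table above is how additional polled input sources reach kdb: each entry is a get_char_func that returns the next character or -1 for "nothing available", and kdb_poll_idx counts the slots in use. A hypothetical driver-side registration is sketched below; the function names are made up and the declarations are assumed to be visible via <linux/kdb.h>:

#include <linux/kdb.h>

static int example_get_char(void)
{
	return -1;	/* -1 means no character is available right now */
}

static void example_register_kdb_poller(void)
{
	/* take the next free NULL slot in the table above */
	kdb_poll_funcs[kdb_poll_idx++] = example_get_char;
}
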
33int kdb_stub(struct kgdb_state *ks)
34{
35 int error = 0;
36 kdb_bp_t *bp;
37 unsigned long addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
38 kdb_reason_t reason = KDB_REASON_OOPS;
39 kdb_dbtrap_t db_result = KDB_DB_NOBPT;
40 int i;
41
42 if (KDB_STATE(REENTRY)) {
43 reason = KDB_REASON_SWITCH;
44 KDB_STATE_CLEAR(REENTRY);
45 addr = instruction_pointer(ks->linux_regs);
46 }
47 ks->pass_exception = 0;
48 if (atomic_read(&kgdb_setting_breakpoint))
49 reason = KDB_REASON_KEYBOARD;
50
51 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
52 if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
53 reason = KDB_REASON_BREAK;
54 db_result = KDB_DB_BPT;
55 if (addr != instruction_pointer(ks->linux_regs))
56 kgdb_arch_set_pc(ks->linux_regs, addr);
57 break;
58 }
59 }
60 if (reason == KDB_REASON_BREAK || reason == KDB_REASON_SWITCH) {
61 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
62 if (bp->bp_free)
63 continue;
64 if (bp->bp_addr == addr) {
65 bp->bp_delay = 1;
66 bp->bp_delayed = 1;
67 /*
68 * SSBPT is set when the kernel debugger must single step a
69 * task in order to re-establish an instruction breakpoint
70 * which uses the instruction replacement mechanism. It is
71 * cleared by any action that removes the need to single-step
72 * the breakpoint.
73 */
74 reason = KDB_REASON_BREAK;
75 db_result = KDB_DB_BPT;
76 KDB_STATE_SET(SSBPT);
77 break;
78 }
79 }
80 }
81
82 if (reason != KDB_REASON_BREAK && ks->ex_vector == 0 &&
83 ks->signo == SIGTRAP) {
84 reason = KDB_REASON_SSTEP;
85 db_result = KDB_DB_BPT;
86 }
87 /* Set initial kdb state variables */
88 KDB_STATE_CLEAR(KGDB_TRANS);
89 kdb_initial_cpu = ks->cpu;
90 kdb_current_task = kgdb_info[ks->cpu].task;
91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
92 /* Remove any breakpoints as needed by kdb and clear single step */
93 kdb_bp_remove();
94 KDB_STATE_CLEAR(DOING_SS);
95 KDB_STATE_CLEAR(DOING_SSB);
96 KDB_STATE_SET(PAGER);
97 /* zero out any offline cpu data */
98 for_each_present_cpu(i) {
99 if (!cpu_online(i)) {
100 kgdb_info[i].debuggerinfo = NULL;
101 kgdb_info[i].task = NULL;
102 }
103 }
104 if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) {
105 ks->pass_exception = 1;
106 KDB_FLAG_SET(CATASTROPHIC);
107 }
108 kdb_initial_cpu = ks->cpu;
109 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
110 KDB_STATE_CLEAR(SSBPT);
111 KDB_STATE_CLEAR(DOING_SS);
112 } else {
113 /* Start kdb main loop */
114 error = kdb_main_loop(KDB_REASON_ENTER, reason,
115 ks->err_code, db_result, ks->linux_regs);
116 }
117 /*
118 * Upon exit from the kdb main loop setup break points and restart
119 * the system based on the requested continue state
120 */
121 kdb_initial_cpu = -1;
122 kdb_current_task = NULL;
123 kdb_current_regs = NULL;
124 KDB_STATE_CLEAR(PAGER);
125 kdbnearsym_cleanup();
126 if (error == KDB_CMD_KGDB) {
127 if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) {
128 /*
129 * This interface glue allows kdb to transition into
130 * the gdb stub. To do this the '?' or '' gdb serial
131 * packet response is processed here, and then control
132 * is passed to the gdbstub.
133 */
134 if (KDB_STATE(DOING_KGDB))
135 gdbstub_state(ks, "?");
136 else
137 gdbstub_state(ks, "");
138 KDB_STATE_CLEAR(DOING_KGDB);
139 KDB_STATE_CLEAR(DOING_KGDB2);
140 }
141 return DBG_PASS_EVENT;
142 }
143 kdb_bp_install(ks->linux_regs);
144 dbg_activate_sw_breakpoints();
145 /* Set the exit state to a single step or a continue */
146 if (KDB_STATE(DOING_SS))
147 gdbstub_state(ks, "s");
148 else
149 gdbstub_state(ks, "c");
150
151 KDB_FLAG_CLEAR(CATASTROPHIC);
152
153 /* Invoke arch specific exception handling prior to system resume */
154 kgdb_info[ks->cpu].ret_state = gdbstub_state(ks, "e");
155 if (ks->pass_exception)
156 kgdb_info[ks->cpu].ret_state = 1;
157 if (error == KDB_CMD_CPU) {
158 KDB_STATE_SET(REENTRY);
159 /*
160 * Force clear the single step bit because kdb emulates this
161 * differently vs the gdbstub
162 */
163 kgdb_single_step = 0;
164 dbg_deactivate_sw_breakpoints();
165 return DBG_SWITCH_CPU_EVENT;
166 }
167 return kgdb_info[ks->cpu].ret_state;
168}
169
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
new file mode 100644
index 000000000000..c9b7f4f90bba
--- /dev/null
+++ b/kernel/debug/kdb/kdb_io.c
@@ -0,0 +1,826 @@
1/*
2 * Kernel Debugger Architecture Independent Console I/O handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/module.h>
13#include <linux/types.h>
14#include <linux/ctype.h>
15#include <linux/kernel.h>
16#include <linux/init.h>
17#include <linux/kdev_t.h>
18#include <linux/console.h>
19#include <linux/string.h>
20#include <linux/sched.h>
21#include <linux/smp.h>
22#include <linux/nmi.h>
23#include <linux/delay.h>
24#include <linux/kgdb.h>
25#include <linux/kdb.h>
26#include <linux/kallsyms.h>
27#include "kdb_private.h"
28
29#define CMD_BUFLEN 256
30char kdb_prompt_str[CMD_BUFLEN];
31
32int kdb_trap_printk;
33
34static void kgdb_transition_check(char *buffer)
35{
36 int slen = strlen(buffer);
37 if (strncmp(buffer, "$?#3f", slen) != 0 &&
38 strncmp(buffer, "$qSupported#37", slen) != 0 &&
39 strncmp(buffer, "+$qSupported#37", slen) != 0) {
40 KDB_STATE_SET(KGDB_TRANS);
41 kdb_printf("%s", buffer);
42 }
43}
44
45static int kdb_read_get_key(char *buffer, size_t bufsize)
46{
47#define ESCAPE_UDELAY 1000
48#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */
49 char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */
50 char *ped = escape_data;
51 int escape_delay = 0;
52 get_char_func *f, *f_escape = NULL;
53 int key;
54
55 for (f = &kdb_poll_funcs[0]; ; ++f) {
56 if (*f == NULL) {
57 /* Reset NMI watchdog once per poll loop */
58 touch_nmi_watchdog();
59 f = &kdb_poll_funcs[0];
60 }
61 if (escape_delay == 2) {
62 *ped = '\0';
63 ped = escape_data;
64 --escape_delay;
65 }
66 if (escape_delay == 1) {
67 key = *ped++;
68 if (!*ped)
69 --escape_delay;
70 break;
71 }
72 key = (*f)();
73 if (key == -1) {
74 if (escape_delay) {
75 udelay(ESCAPE_UDELAY);
76 --escape_delay;
77 }
78 continue;
79 }
80 if (bufsize <= 2) {
81 if (key == '\r')
82 key = '\n';
83 *buffer++ = key;
84 *buffer = '\0';
85 return -1;
86 }
87 if (escape_delay == 0 && key == '\e') {
88 escape_delay = ESCAPE_DELAY;
89 ped = escape_data;
90 f_escape = f;
91 }
92 if (escape_delay) {
93 *ped++ = key;
94 if (f_escape != f) {
95 escape_delay = 2;
96 continue;
97 }
98 if (ped - escape_data == 1) {
99 /* \e */
100 continue;
101 } else if (ped - escape_data == 2) {
102 /* \e<something> */
103 if (key != '[')
104 escape_delay = 2;
105 continue;
106 } else if (ped - escape_data == 3) {
107 /* \e[<something> */
108 int mapkey = 0;
109 switch (key) {
110 case 'A': /* \e[A, up arrow */
111 mapkey = 16;
112 break;
113 case 'B': /* \e[B, down arrow */
114 mapkey = 14;
115 break;
116 case 'C': /* \e[C, right arrow */
117 mapkey = 6;
118 break;
119 case 'D': /* \e[D, left arrow */
120 mapkey = 2;
121 break;
122 case '1': /* fall through */
123 case '3': /* fall through */
124 /* \e[<1,3,4>], may be home, del, end */
125 case '4':
126 mapkey = -1;
127 break;
128 }
129 if (mapkey != -1) {
130 if (mapkey > 0) {
131 escape_data[0] = mapkey;
132 escape_data[1] = '\0';
133 }
134 escape_delay = 2;
135 }
136 continue;
137 } else if (ped - escape_data == 4) {
138 /* \e[<1,3,4><something> */
139 int mapkey = 0;
140 if (key == '~') {
141 switch (escape_data[2]) {
142 case '1': /* \e[1~, home */
143 mapkey = 1;
144 break;
145 case '3': /* \e[3~, del */
146 mapkey = 4;
147 break;
148 case '4': /* \e[4~, end */
149 mapkey = 5;
150 break;
151 }
152 }
153 if (mapkey > 0) {
154 escape_data[0] = mapkey;
155 escape_data[1] = '\0';
156 }
157 escape_delay = 2;
158 continue;
159 }
160 }
161 break; /* A key to process */
162 }
163 return key;
164}
165
166/*
167 * kdb_read
168 *
169 * This function reads a string of characters, terminated by
170 * a newline, or by reaching the end of the supplied buffer,
171 * from the current kernel debugger console device.
172 * Parameters:
173 * buffer - Address of character buffer to receive input characters.
174 * bufsize - size, in bytes, of the character buffer
175 * Returns:
176 * Returns a pointer to the buffer containing the received
177 * character string. This string will be terminated by a
178 * newline character.
179 * Locking:
180 * No locks are required to be held upon entry to this
181 * function. It is not reentrant - it relies on the fact
182 * that kdb runs on only one "master debug" cpu at a time.
183 * Remarks:
184 *
185 * The buffer size must be >= 2. A buffer size of 2 means that the caller only
186 * wants a single key.
187 *
188 * An escape key could be the start of a vt100 control sequence such as \e[D
189 * (left arrow) or it could be a character in its own right. The standard
190 * method for detecting the difference is to wait for 2 seconds to see if there
191 * are any other characters. kdb is complicated by the lack of a timer service
192 * (interrupts are off), by multiple input sources and by the need to sometimes
193 * return after just one key. Escape sequence processing has to be done as
194 * states in the polling loop.
195 */
196
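For reference, these are the escape sequences that kdb_read_get_key() above collapses and the single-byte codes that the switch in kdb_read() then acts on. The table form is illustration only; the kernel keeps this logic inline in the polling loop:

struct kdb_esc_example {
	const char *seq;	/* bytes following the initial '\e' */
	int key;		/* code handed on to kdb_read() */
};

static const struct kdb_esc_example kdb_esc_example_map[] = {
	{ "[A",  16 },		/* up arrow */
	{ "[B",  14 },		/* down arrow */
	{ "[C",   6 },		/* right arrow */
	{ "[D",   2 },		/* left arrow */
	{ "[1~",  1 },		/* home */
	{ "[3~",  4 },		/* delete */
	{ "[4~",  5 },		/* end */
};
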
197static char *kdb_read(char *buffer, size_t bufsize)
198{
199 char *cp = buffer;
200 char *bufend = buffer+bufsize-2; /* Reserve space for newline
201 * and null byte */
202 char *lastchar;
203 char *p_tmp;
204 char tmp;
205 static char tmpbuffer[CMD_BUFLEN];
206 int len = strlen(buffer);
207 int len_tmp;
208 int tab = 0;
209 int count;
210 int i;
211 int diag, dtab_count;
212 int key;
213
214
215 diag = kdbgetintenv("DTABCOUNT", &dtab_count);
216 if (diag)
217 dtab_count = 30;
218
219 if (len > 0) {
220 cp += len;
221 if (*(buffer+len-1) == '\n')
222 cp--;
223 }
224
225 lastchar = cp;
226 *cp = '\0';
227 kdb_printf("%s", buffer);
228poll_again:
229 key = kdb_read_get_key(buffer, bufsize);
230 if (key == -1)
231 return buffer;
232 if (key != 9)
233 tab = 0;
234 switch (key) {
235 case 8: /* backspace */
236 if (cp > buffer) {
237 if (cp < lastchar) {
238 memcpy(tmpbuffer, cp, lastchar - cp);
239 memcpy(cp-1, tmpbuffer, lastchar - cp);
240 }
241 *(--lastchar) = '\0';
242 --cp;
243 kdb_printf("\b%s \r", cp);
244 tmp = *cp;
245 *cp = '\0';
246 kdb_printf(kdb_prompt_str);
247 kdb_printf("%s", buffer);
248 *cp = tmp;
249 }
250 break;
251 case 13: /* enter */
252 *lastchar++ = '\n';
253 *lastchar++ = '\0';
254 kdb_printf("\n");
255 return buffer;
256 case 4: /* Del */
257 if (cp < lastchar) {
258 memcpy(tmpbuffer, cp+1, lastchar - cp - 1);
259 memcpy(cp, tmpbuffer, lastchar - cp - 1);
260 *(--lastchar) = '\0';
261 kdb_printf("%s \r", cp);
262 tmp = *cp;
263 *cp = '\0';
264 kdb_printf(kdb_prompt_str);
265 kdb_printf("%s", buffer);
266 *cp = tmp;
267 }
268 break;
269 case 1: /* Home */
270 if (cp > buffer) {
271 kdb_printf("\r");
272 kdb_printf(kdb_prompt_str);
273 cp = buffer;
274 }
275 break;
276 case 5: /* End */
277 if (cp < lastchar) {
278 kdb_printf("%s", cp);
279 cp = lastchar;
280 }
281 break;
282 case 2: /* Left */
283 if (cp > buffer) {
284 kdb_printf("\b");
285 --cp;
286 }
287 break;
288 case 14: /* Down */
289 memset(tmpbuffer, ' ',
290 strlen(kdb_prompt_str) + (lastchar-buffer));
291 *(tmpbuffer+strlen(kdb_prompt_str) +
292 (lastchar-buffer)) = '\0';
293 kdb_printf("\r%s\r", tmpbuffer);
294 *lastchar = (char)key;
295 *(lastchar+1) = '\0';
296 return lastchar;
297 case 6: /* Right */
298 if (cp < lastchar) {
299 kdb_printf("%c", *cp);
300 ++cp;
301 }
302 break;
303 case 16: /* Up */
304 memset(tmpbuffer, ' ',
305 strlen(kdb_prompt_str) + (lastchar-buffer));
306 *(tmpbuffer+strlen(kdb_prompt_str) +
307 (lastchar-buffer)) = '\0';
308 kdb_printf("\r%s\r", tmpbuffer);
309 *lastchar = (char)key;
310 *(lastchar+1) = '\0';
311 return lastchar;
312 case 9: /* Tab */
313 if (tab < 2)
314 ++tab;
315 p_tmp = buffer;
316 while (*p_tmp == ' ')
317 p_tmp++;
318 if (p_tmp > cp)
319 break;
320 memcpy(tmpbuffer, p_tmp, cp-p_tmp);
321 *(tmpbuffer + (cp-p_tmp)) = '\0';
322 p_tmp = strrchr(tmpbuffer, ' ');
323 if (p_tmp)
324 ++p_tmp;
325 else
326 p_tmp = tmpbuffer;
327 len = strlen(p_tmp);
328 count = kallsyms_symbol_complete(p_tmp,
329 sizeof(tmpbuffer) -
330 (p_tmp - tmpbuffer));
331 if (tab == 2 && count > 0) {
332 kdb_printf("\n%d symbols are found.", count);
333 if (count > dtab_count) {
334 count = dtab_count;
335 kdb_printf(" But only first %d symbols will"
336 " be printed.\nYou can change the"
337 " environment variable DTABCOUNT.",
338 count);
339 }
340 kdb_printf("\n");
341 for (i = 0; i < count; i++) {
342 if (kallsyms_symbol_next(p_tmp, i) < 0)
343 break;
344 kdb_printf("%s ", p_tmp);
345 *(p_tmp + len) = '\0';
346 }
347 if (i >= dtab_count)
348 kdb_printf("...");
349 kdb_printf("\n");
350 kdb_printf(kdb_prompt_str);
351 kdb_printf("%s", buffer);
352 } else if (tab != 2 && count > 0) {
353 len_tmp = strlen(p_tmp);
354 strncpy(p_tmp+len_tmp, cp, lastchar-cp+1);
355 len_tmp = strlen(p_tmp);
356 strncpy(cp, p_tmp+len, len_tmp-len + 1);
357 len = len_tmp - len;
358 kdb_printf("%s", cp);
359 cp += len;
360 lastchar += len;
361 }
362 kdb_nextline = 1; /* reset output line number */
363 break;
364 default:
365 if (key >= 32 && lastchar < bufend) {
366 if (cp < lastchar) {
367 memcpy(tmpbuffer, cp, lastchar - cp);
368 memcpy(cp+1, tmpbuffer, lastchar - cp);
369 *++lastchar = '\0';
370 *cp = key;
371 kdb_printf("%s\r", cp);
372 ++cp;
373 tmp = *cp;
374 *cp = '\0';
375 kdb_printf(kdb_prompt_str);
376 kdb_printf("%s", buffer);
377 *cp = tmp;
378 } else {
379 *++lastchar = '\0';
380 *cp++ = key;
381 /* The kgdb transition check will hide
382 * printed characters if we think that
383 * kgdb is connecting, until the check
384 * fails */
385 if (!KDB_STATE(KGDB_TRANS))
386 kgdb_transition_check(buffer);
387 else
388 kdb_printf("%c", key);
389 }
390 /* Special escape to kgdb */
391 if (lastchar - buffer >= 5 &&
392 strcmp(lastchar - 5, "$?#3f") == 0) {
393 strcpy(buffer, "kgdb");
394 KDB_STATE_SET(DOING_KGDB);
395 return buffer;
396 }
397 if (lastchar - buffer >= 14 &&
398 strcmp(lastchar - 14, "$qSupported#37") == 0) {
399 strcpy(buffer, "kgdb");
400 KDB_STATE_SET(DOING_KGDB2);
401 return buffer;
402 }
403 }
404 break;
405 }
406 goto poll_again;
407}
408
409/*
410 * kdb_getstr
411 *
412 * Print the prompt string and read a command from the
413 * input device.
414 *
415 * Parameters:
416 * buffer Address of buffer to receive command
417 * bufsize Size of buffer in bytes
418 * prompt Pointer to string to use as prompt string
419 * Returns:
420 * Pointer to command buffer.
421 * Locking:
422 * None.
423 * Remarks:
424 * For SMP kernels, the processor number will be
425 * substituted for %d, %x or %o in the prompt.
426 */
427
428char *kdb_getstr(char *buffer, size_t bufsize, char *prompt)
429{
430 if (prompt && kdb_prompt_str != prompt)
431 strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
432 kdb_printf(kdb_prompt_str);
433 kdb_nextline = 1; /* Prompt and input resets line number */
434 return kdb_read(buffer, bufsize);
435}
436
437/*
438 * kdb_input_flush
439 *
440 * Get rid of any buffered console input.
441 *
442 * Parameters:
443 * none
444 * Returns:
445 * nothing
446 * Locking:
447 * none
448 * Remarks:
449 * Call this function whenever you want to flush input. If there is any
450 * outstanding input, it ignores all characters until there has been no
451 * data for approximately 1ms.
452 */
453
454static void kdb_input_flush(void)
455{
456 get_char_func *f;
457 int res;
458 int flush_delay = 1;
459 while (flush_delay) {
460 flush_delay--;
461empty:
462 touch_nmi_watchdog();
463 for (f = &kdb_poll_funcs[0]; *f; ++f) {
464 res = (*f)();
465 if (res != -1) {
466 flush_delay = 1;
467 goto empty;
468 }
469 }
470 if (flush_delay)
471 mdelay(1);
472 }
473}
474
475/*
476 * kdb_printf
477 *
478 * Print a string to the output device(s).
479 *
480 * Parameters:
481 * printf-like format and optional args.
482 * Returns:
483 * 0
484 * Locking:
485 * None.
486 * Remarks:
487 * use 'kdbcons->write()' to avoid polluting 'log_buf' with
488 * kdb output.
489 *
490 * If the user is doing a cmd args | grep srch
491 * then kdb_grepping_flag is set.
492 * In that case we need to accumulate full lines (ending in \n) before
493 * searching for the pattern.
494 */
495
496static char kdb_buffer[256]; /* A bit too big to go on stack */
497static char *next_avail = kdb_buffer;
498static int size_avail;
499static int suspend_grep;
500
501/*
502 * search arg1 to see if it contains arg2
503 * (kdb_main.c provides flags for ^pat and pat$)
504 *
505 * return 1 for found, 0 for not found
506 */
507static int kdb_search_string(char *searched, char *searchfor)
508{
509 char firstchar, *cp;
510 int len1, len2;
511
512 /* not counting the newline at the end of "searched" */
513 len1 = strlen(searched)-1;
514 len2 = strlen(searchfor);
515 if (len1 < len2)
516 return 0;
517 if (kdb_grep_leading && kdb_grep_trailing && len1 != len2)
518 return 0;
519 if (kdb_grep_leading) {
520 if (!strncmp(searched, searchfor, len2))
521 return 1;
522 } else if (kdb_grep_trailing) {
523 if (!strncmp(searched+len1-len2, searchfor, len2))
524 return 1;
525 } else {
526 firstchar = *searchfor;
527 cp = searched;
528 while ((cp = strchr(cp, firstchar))) {
529 if (!strncmp(cp, searchfor, len2))
530 return 1;
531 cp++;
532 }
533 }
534 return 0;
535}
536
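A stand-alone sketch of the same anchored-match rules applied by kdb_search_string() for "cmd | grep pattern", runnable in user space; it assumes the line has already had its trailing newline stripped, which is what the len1 adjustment above accounts for:

#include <stdio.h>
#include <string.h>

static int match(const char *line, const char *pat, int lead, int trail)
{
	size_t len1 = strlen(line);	/* newline already stripped here */
	size_t len2 = strlen(pat);

	if (len1 < len2)
		return 0;
	if (lead && trail)		/* ^pat$ : whole-line match */
		return len1 == len2 && !strncmp(line, pat, len2);
	if (lead)			/* ^pat  : prefix match */
		return !strncmp(line, pat, len2);
	if (trail)			/* pat$  : suffix match */
		return !strncmp(line + len1 - len2, pat, len2);
	return strstr(line, pat) != NULL;	/* plain substring */
}

int main(void)
{
	printf("%d %d %d\n",
	       match("kdb_printf+0x10", "kdb_", 1, 0),		/* 1: prefix  */
	       match("kdb_printf+0x10", "0x10", 0, 1),		/* 1: suffix  */
	       match("kdb_printf+0x10", "print", 0, 0));	/* 1: anywhere */
	return 0;
}
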
537int vkdb_printf(const char *fmt, va_list ap)
538{
539 int diag;
540 int linecount;
541 int logging, saved_loglevel = 0;
542 int saved_trap_printk;
543 int got_printf_lock = 0;
544 int retlen = 0;
545 int fnd, len;
546 char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
547 char *moreprompt = "more> ";
548 struct console *c = console_drivers;
549 static DEFINE_SPINLOCK(kdb_printf_lock);
550 unsigned long uninitialized_var(flags);
551
552 preempt_disable();
553 saved_trap_printk = kdb_trap_printk;
554 kdb_trap_printk = 0;
555
556 /* Serialize kdb_printf if multiple cpus try to write at once.
557 * But if any cpu goes recursive in kdb, just print the output,
558 * even if it is interleaved with any other text.
559 */
560 if (!KDB_STATE(PRINTF_LOCK)) {
561 KDB_STATE_SET(PRINTF_LOCK);
562 spin_lock_irqsave(&kdb_printf_lock, flags);
563 got_printf_lock = 1;
564 atomic_inc(&kdb_event);
565 } else {
566 __acquire(kdb_printf_lock);
567 }
568
569 diag = kdbgetintenv("LINES", &linecount);
570 if (diag || linecount <= 1)
571 linecount = 24;
572
573 diag = kdbgetintenv("LOGGING", &logging);
574 if (diag)
575 logging = 0;
576
577 if (!kdb_grepping_flag || suspend_grep) {
578 /* normally, every vsnprintf starts a new buffer */
579 next_avail = kdb_buffer;
580 size_avail = sizeof(kdb_buffer);
581 }
582 vsnprintf(next_avail, size_avail, fmt, ap);
583
584 /*
585 * If kdb_parse() found that the command was cmd xxx | grep yyy
586 * then kdb_grepping_flag is set, and kdb_grep_string contains yyy
587 *
588 * Accumulate the print data up to a newline before searching it.
589 * (vsnprintf does null-terminate the string that it generates)
590 */
591
592 /* skip the search if prints are temporarily unconditional */
593 if (!suspend_grep && kdb_grepping_flag) {
594 cp = strchr(kdb_buffer, '\n');
595 if (!cp) {
596 /*
597 * Special cases that do not end with a newline
598 * but should still be written without waiting
599 * for one:
600 * The "[nn]kdb> " prompt should appear at
601 * the front of the buffer.
602 *
603 * The "[nn]more " prompt (MOREPROMPT ->
604 * moreprompt) should also be written without a
605 * newline, but we print that ourselves, so we
606 * set the suspend_grep flag to make it unconditional.
607 *
608 */
609 if (next_avail == kdb_buffer) {
610 /*
611 * these should occur after a newline,
612 * so they will be at the front of the
613 * buffer
614 */
615 cp2 = kdb_buffer;
616 len = strlen(kdb_prompt_str);
617 if (!strncmp(cp2, kdb_prompt_str, len)) {
618 /*
619 * We're about to start a new
620 * command, so we can go back
621 * to normal mode.
622 */
623 kdb_grepping_flag = 0;
624 goto kdb_printit;
625 }
626 }
627 /* no newline; don't search/write the buffer
628 until one is there */
629 len = strlen(kdb_buffer);
630 next_avail = kdb_buffer + len;
631 size_avail = sizeof(kdb_buffer) - len;
632 goto kdb_print_out;
633 }
634
635 /*
636 * The newline is present; print through it or discard
637 * it, depending on the results of the search.
638 */
639 cp++; /* to byte after the newline */
640 replaced_byte = *cp; /* remember what/where it was */
641 cphold = cp;
642 *cp = '\0'; /* end the string for our search */
643
644 /*
645 * We now have a newline at the end of the string
646 * Only continue with this output if it contains the
647 * search string.
648 */
649 fnd = kdb_search_string(kdb_buffer, kdb_grep_string);
650 if (!fnd) {
651 /*
652 * At this point the complete line at the start
653 * of kdb_buffer can be discarded, as it does
654 * not contain what the user is looking for.
655 * Shift the buffer left.
656 */
657 *cphold = replaced_byte;
658 strcpy(kdb_buffer, cphold);
659 len = strlen(kdb_buffer);
660 next_avail = kdb_buffer + len;
661 size_avail = sizeof(kdb_buffer) - len;
662 goto kdb_print_out;
663 }
664 /*
665 * at this point the string is a full line and
666 * should be printed, up to the null.
667 */
668 }
669kdb_printit:
670
671 /*
672 * Write to all consoles.
673 */
674 retlen = strlen(kdb_buffer);
675 if (!dbg_kdb_mode && kgdb_connected) {
676 gdbstub_msg_write(kdb_buffer, retlen);
677 } else {
678 if (!dbg_io_ops->is_console) {
679 len = strlen(kdb_buffer);
680 cp = kdb_buffer;
681 while (len--) {
682 dbg_io_ops->write_char(*cp);
683 cp++;
684 }
685 }
686 while (c) {
687 c->write(c, kdb_buffer, retlen);
688 touch_nmi_watchdog();
689 c = c->next;
690 }
691 }
692 if (logging) {
693 saved_loglevel = console_loglevel;
694 console_loglevel = 0;
695 printk(KERN_INFO "%s", kdb_buffer);
696 }
697
698 if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n'))
699 kdb_nextline++;
700
701 /* check for having reached the LINES number of printed lines */
702 if (kdb_nextline == linecount) {
703 char buf1[16] = "";
704#if defined(CONFIG_SMP)
705 char buf2[32];
706#endif
707
708 /* Watch out for recursion here. Any routine that calls
709 * kdb_printf will come back through here. And kdb_read
710 * uses kdb_printf to echo on serial consoles ...
711 */
712 kdb_nextline = 1; /* In case of recursion */
713
714 /*
715 * Pause until cr.
716 */
717 moreprompt = kdbgetenv("MOREPROMPT");
718 if (moreprompt == NULL)
719 moreprompt = "more> ";
720
721#if defined(CONFIG_SMP)
722 if (strchr(moreprompt, '%')) {
723 sprintf(buf2, moreprompt, get_cpu());
724 put_cpu();
725 moreprompt = buf2;
726 }
727#endif
728
729 kdb_input_flush();
730 c = console_drivers;
731
732 if (!dbg_io_ops->is_console) {
733 len = strlen(moreprompt);
734 cp = moreprompt;
735 while (len--) {
736 dbg_io_ops->write_char(*cp);
737 cp++;
738 }
739 }
740 while (c) {
741 c->write(c, moreprompt, strlen(moreprompt));
742 touch_nmi_watchdog();
743 c = c->next;
744 }
745
746 if (logging)
747 printk("%s", moreprompt);
748
749 kdb_read(buf1, 2); /* '2' indicates to return
750 * immediately after getting one key. */
751 kdb_nextline = 1; /* Really set output line 1 */
752
753 /* empty and reset the buffer: */
754 kdb_buffer[0] = '\0';
755 next_avail = kdb_buffer;
756 size_avail = sizeof(kdb_buffer);
757 if ((buf1[0] == 'q') || (buf1[0] == 'Q')) {
758 /* user hit q or Q */
759 KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */
760 KDB_STATE_CLEAR(PAGER);
761 /* end of command output; back to normal mode */
762 kdb_grepping_flag = 0;
763 kdb_printf("\n");
764 } else if (buf1[0] == ' ') {
765 kdb_printf("\n");
766 suspend_grep = 1; /* for this recursion */
767 } else if (buf1[0] == '\n') {
768 kdb_nextline = linecount - 1;
769 kdb_printf("\r");
770 suspend_grep = 1; /* for this recursion */
771 } else if (buf1[0] && buf1[0] != '\n') {
772 /* user hit something other than enter */
773 suspend_grep = 1; /* for this recursion */
774 kdb_printf("\nOnly 'q' or 'Q' are processed at more "
775 "prompt, input ignored\n");
776 } else if (kdb_grepping_flag) {
777 /* user hit enter */
778 suspend_grep = 1; /* for this recursion */
779 kdb_printf("\n");
780 }
781 kdb_input_flush();
782 }
783
784 /*
785 * For grep searches, shift the printed string left.
786 * replaced_byte contains the character that was overwritten with
787 * the terminating null, and cphold points to the null.
788 * Then adjust the notion of available space in the buffer.
789 */
790 if (kdb_grepping_flag && !suspend_grep) {
791 *cphold = replaced_byte;
792 strcpy(kdb_buffer, cphold);
793 len = strlen(kdb_buffer);
794 next_avail = kdb_buffer + len;
795 size_avail = sizeof(kdb_buffer) - len;
796 }
797
798kdb_print_out:
799 suspend_grep = 0; /* end of what may have been a recursive call */
800 if (logging)
801 console_loglevel = saved_loglevel;
802 if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) {
803 got_printf_lock = 0;
804 spin_unlock_irqrestore(&kdb_printf_lock, flags);
805 KDB_STATE_CLEAR(PRINTF_LOCK);
806 atomic_dec(&kdb_event);
807 } else {
808 __release(kdb_printf_lock);
809 }
810 kdb_trap_printk = saved_trap_printk;
811 preempt_enable();
812 return retlen;
813}
814
815int kdb_printf(const char *fmt, ...)
816{
817 va_list ap;
818 int r;
819
820 va_start(ap, fmt);
821 r = vkdb_printf(fmt, ap);
822 va_end(ap);
823
824 return r;
825}
826
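/*
 * A minimal illustrative sketch, not part of the original kdb_io.c: it shows
 * how a kdb command handler typically emits output through kdb_printf().
 * Each newline advances the pager state above (kdb_nextline), so a long
 * listing stops automatically at the "more>" prompt, and 'q' there sets
 * CMD_INTERRUPT.  The handler name and loop bound are hypothetical.
 */
static int example_list_cmd(int argc, const char **argv)
{
	int i;

	for (i = 0; i < 64; i++) {
		/* one line per entry; the pager counts the '\n' */
		kdb_printf("entry %2d\n", i);
		if (KDB_FLAG(CMD_INTERRUPT))	/* user hit 'q' at more> */
			return 0;
	}
	return 0;
}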
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
new file mode 100644
index 000000000000..4bca634975c0
--- /dev/null
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -0,0 +1,212 @@
1/*
2 * Kernel Debugger Architecture Dependent Console I/O handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License.
6 *
7 * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
8 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
9 */
10
11#include <linux/kdb.h>
12#include <linux/keyboard.h>
13#include <linux/ctype.h>
14#include <linux/module.h>
15#include <linux/io.h>
16
17/* Keyboard Controller Registers on normal PCs. */
18
19#define KBD_STATUS_REG 0x64 /* Status register (R) */
20#define KBD_DATA_REG 0x60 /* Keyboard data register (R/W) */
21
22/* Status Register Bits */
23
24#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */
25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
26
27static int kbd_exists;
28
29/*
30 * Check if the keyboard controller has a keypress for us.
 31 * Some parts (Enter release, LED change) are still polled here (blocking),
32 * but hopefully they are all short.
33 */
34int kdb_get_kbd_char(void)
35{
36 int scancode, scanstatus;
37 static int shift_lock; /* CAPS LOCK state (0-off, 1-on) */
38 static int shift_key; /* Shift next keypress */
39 static int ctrl_key;
40 u_short keychar;
41
42 if (KDB_FLAG(NO_I8042) || KDB_FLAG(NO_VT_CONSOLE) ||
43 (inb(KBD_STATUS_REG) == 0xff && inb(KBD_DATA_REG) == 0xff)) {
44 kbd_exists = 0;
45 return -1;
46 }
47 kbd_exists = 1;
48
49 if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
50 return -1;
51
52 /*
53 * Fetch the scancode
54 */
55 scancode = inb(KBD_DATA_REG);
56 scanstatus = inb(KBD_STATUS_REG);
57
58 /*
59 * Ignore mouse events.
60 */
61 if (scanstatus & KBD_STAT_MOUSE_OBF)
62 return -1;
63
64 /*
65 * Ignore release, trigger on make
66 * (except for shift keys, where we want to
67 * keep the shift state so long as the key is
68 * held down).
69 */
70
71 if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) {
72 /*
73 * Next key may use shift table
74 */
75 if ((scancode & 0x80) == 0)
76 shift_key = 1;
77 else
78 shift_key = 0;
79 return -1;
80 }
81
82 if ((scancode&0x7f) == 0x1d) {
83 /*
84 * Left ctrl key
85 */
86 if ((scancode & 0x80) == 0)
87 ctrl_key = 1;
88 else
89 ctrl_key = 0;
90 return -1;
91 }
92
93 if ((scancode & 0x80) != 0)
94 return -1;
95
96 scancode &= 0x7f;
97
98 /*
99 * Translate scancode
100 */
101
102 if (scancode == 0x3a) {
103 /*
104 * Toggle caps lock
105 */
106 shift_lock ^= 1;
107
108#ifdef KDB_BLINK_LED
109 kdb_toggleled(0x4);
110#endif
111 return -1;
112 }
113
114 if (scancode == 0x0e) {
115 /*
116 * Backspace
117 */
118 return 8;
119 }
120
121 /* Special Key */
122 switch (scancode) {
123 case 0xF: /* Tab */
124 return 9;
125 case 0x53: /* Del */
126 return 4;
127 case 0x47: /* Home */
128 return 1;
129 case 0x4F: /* End */
130 return 5;
131 case 0x4B: /* Left */
132 return 2;
133 case 0x48: /* Up */
134 return 16;
135 case 0x50: /* Down */
136 return 14;
137 case 0x4D: /* Right */
138 return 6;
139 }
140
141 if (scancode == 0xe0)
142 return -1;
143
144 /*
145 * For Japanese 86/106 keyboards
146 * See comment in drivers/char/pc_keyb.c.
147 * - Masahiro Adegawa
148 */
149 if (scancode == 0x73)
150 scancode = 0x59;
151 else if (scancode == 0x7d)
152 scancode = 0x7c;
153
154 if (!shift_lock && !shift_key && !ctrl_key) {
155 keychar = plain_map[scancode];
156 } else if ((shift_lock || shift_key) && key_maps[1]) {
157 keychar = key_maps[1][scancode];
158 } else if (ctrl_key && key_maps[4]) {
159 keychar = key_maps[4][scancode];
160 } else {
161 keychar = 0x0020;
162 kdb_printf("Unknown state/scancode (%d)\n", scancode);
163 }
164 keychar &= 0x0fff;
165 if (keychar == '\t')
166 keychar = ' ';
167 switch (KTYP(keychar)) {
168 case KT_LETTER:
169 case KT_LATIN:
170 if (isprint(keychar))
171 break; /* printable characters */
172 /* drop through */
173 case KT_SPEC:
174 if (keychar == K_ENTER)
175 break;
176 /* drop through */
177 default:
178 return -1; /* ignore unprintables */
179 }
180
181 if ((scancode & 0x7f) == 0x1c) {
182 /*
183 * enter key. All done. Absorb the release scancode.
184 */
185 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
186 ;
187
188 /*
189 * Fetch the scancode
190 */
191 scancode = inb(KBD_DATA_REG);
192 scanstatus = inb(KBD_STATUS_REG);
193
194 while (scanstatus & KBD_STAT_MOUSE_OBF) {
195 scancode = inb(KBD_DATA_REG);
196 scanstatus = inb(KBD_STATUS_REG);
197 }
198
199 if (scancode != 0x9c) {
200 /*
201 * Wasn't an enter-release, why not?
202 */
203 kdb_printf("kdb: expected enter got 0x%x status 0x%x\n",
204 scancode, scanstatus);
205 }
206
207 return 13;
208 }
209
210 return keychar & 0xff;
211}
212EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
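/*
 * A minimal illustrative sketch, not part of the original kdb_keyboard.c:
 * a caller polls kdb_get_kbd_char() until it returns something other than
 * -1; the value is a translated key code (e.g. 13 for Enter, 8 for
 * Backspace, as returned above).  The helper name is hypothetical.
 */
static int example_wait_for_key(void)
{
	int key;

	do {
		key = kdb_get_kbd_char();	/* -1 means no key pending */
	} while (key == -1);

	return key;
}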
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
new file mode 100644
index 000000000000..ebe4a287419e
--- /dev/null
+++ b/kernel/debug/kdb/kdb_main.c
@@ -0,0 +1,2846 @@
1/*
2 * Kernel Debugger Architecture Independent Main Code
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com>
10 * Xscale (R) modifications copyright (C) 2003 Intel Corporation.
11 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
12 */
13
14#include <linux/ctype.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/reboot.h>
18#include <linux/sched.h>
19#include <linux/sysrq.h>
20#include <linux/smp.h>
21#include <linux/utsname.h>
22#include <linux/vmalloc.h>
23#include <linux/module.h>
24#include <linux/mm.h>
25#include <linux/init.h>
26#include <linux/kallsyms.h>
27#include <linux/kgdb.h>
28#include <linux/kdb.h>
29#include <linux/notifier.h>
30#include <linux/interrupt.h>
31#include <linux/delay.h>
32#include <linux/nmi.h>
33#include <linux/time.h>
34#include <linux/ptrace.h>
35#include <linux/sysctl.h>
36#include <linux/cpu.h>
37#include <linux/kdebug.h>
38#include <linux/proc_fs.h>
39#include <linux/uaccess.h>
40#include <linux/slab.h>
41#include "kdb_private.h"
42
43#define GREP_LEN 256
44char kdb_grep_string[GREP_LEN];
45int kdb_grepping_flag;
46EXPORT_SYMBOL(kdb_grepping_flag);
47int kdb_grep_leading;
48int kdb_grep_trailing;
49
50/*
51 * Kernel debugger state flags
52 */
53int kdb_flags;
54atomic_t kdb_event;
55
56/*
57 * kdb_lock protects updates to kdb_initial_cpu. Used to
58 * single thread processors through the kernel debugger.
59 */
60int kdb_initial_cpu = -1; /* cpu number that owns kdb */
61int kdb_nextline = 1;
62int kdb_state; /* General KDB state */
63
64struct task_struct *kdb_current_task;
65EXPORT_SYMBOL(kdb_current_task);
66struct pt_regs *kdb_current_regs;
67
68const char *kdb_diemsg;
69static int kdb_go_count;
70#ifdef CONFIG_KDB_CONTINUE_CATASTROPHIC
71static unsigned int kdb_continue_catastrophic =
72 CONFIG_KDB_CONTINUE_CATASTROPHIC;
73#else
74static unsigned int kdb_continue_catastrophic;
75#endif
76
77/* kdb_commands describes the available commands. */
78static kdbtab_t *kdb_commands;
79#define KDB_BASE_CMD_MAX 50
80static int kdb_max_commands = KDB_BASE_CMD_MAX;
81static kdbtab_t kdb_base_commands[50];
82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \
85 num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++)
86
87typedef struct _kdbmsg {
88 int km_diag; /* kdb diagnostic */
89 char *km_msg; /* Corresponding message text */
90} kdbmsg_t;
91
92#define KDBMSG(msgnum, text) \
93 { KDB_##msgnum, text }
94
95static kdbmsg_t kdbmsgs[] = {
96 KDBMSG(NOTFOUND, "Command Not Found"),
97 KDBMSG(ARGCOUNT, "Improper argument count, see usage."),
98 KDBMSG(BADWIDTH, "Illegal value for BYTESPERWORD use 1, 2, 4 or 8, "
99 "8 is only allowed on 64 bit systems"),
100 KDBMSG(BADRADIX, "Illegal value for RADIX use 8, 10 or 16"),
101 KDBMSG(NOTENV, "Cannot find environment variable"),
102 KDBMSG(NOENVVALUE, "Environment variable should have value"),
103 KDBMSG(NOTIMP, "Command not implemented"),
104 KDBMSG(ENVFULL, "Environment full"),
105 KDBMSG(ENVBUFFULL, "Environment buffer full"),
106 KDBMSG(TOOMANYBPT, "Too many breakpoints defined"),
107#ifdef CONFIG_CPU_XSCALE
108 KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"),
109#else
110 KDBMSG(TOOMANYDBREGS, "More breakpoints than db registers defined"),
111#endif
112 KDBMSG(DUPBPT, "Duplicate breakpoint address"),
113 KDBMSG(BPTNOTFOUND, "Breakpoint not found"),
114 KDBMSG(BADMODE, "Invalid IDMODE"),
115 KDBMSG(BADINT, "Illegal numeric value"),
116 KDBMSG(INVADDRFMT, "Invalid symbolic address format"),
117 KDBMSG(BADREG, "Invalid register name"),
118 KDBMSG(BADCPUNUM, "Invalid cpu number"),
119 KDBMSG(BADLENGTH, "Invalid length field"),
120 KDBMSG(NOBP, "No Breakpoint exists"),
121 KDBMSG(BADADDR, "Invalid address"),
122};
123#undef KDBMSG
124
125static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
126
127
128/*
129 * Initial environment. This is all kept static and local to
130 * this file. We don't want to rely on the memory allocation
131 * mechanisms in the kernel, so we use a very limited allocate-only
132 * heap for new and altered environment variables. The entire
133 * environment is limited to a fixed number of entries (add more
134 * to __env[] if required) and a fixed amount of heap (add more to
135 * KDB_ENVBUFSIZE if required).
136 */
137
138static char *__env[] = {
139#if defined(CONFIG_SMP)
140 "PROMPT=[%d]kdb> ",
141 "MOREPROMPT=[%d]more> ",
142#else
143 "PROMPT=kdb> ",
144 "MOREPROMPT=more> ",
145#endif
146 "RADIX=16",
147 "MDCOUNT=8", /* lines of md output */
148 "BTARGS=9", /* 9 possible args in bt */
149 KDB_PLATFORM_ENV,
150 "DTABCOUNT=30",
151 "NOSECT=1",
152 (char *)0,
153 (char *)0,
154 (char *)0,
155 (char *)0,
156 (char *)0,
157 (char *)0,
158 (char *)0,
159 (char *)0,
160 (char *)0,
161 (char *)0,
162 (char *)0,
163 (char *)0,
164 (char *)0,
165 (char *)0,
166 (char *)0,
167 (char *)0,
168 (char *)0,
169 (char *)0,
170 (char *)0,
171 (char *)0,
172 (char *)0,
173 (char *)0,
174 (char *)0,
175};
176
177static const int __nenv = (sizeof(__env) / sizeof(char *));
178
179struct task_struct *kdb_curr_task(int cpu)
180{
181 struct task_struct *p = curr_task(cpu);
182#ifdef _TIF_MCA_INIT
183 if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu))
184 p = krp->p;
185#endif
186 return p;
187}
188
189/*
190 * kdbgetenv - This function will return the character string value of
191 * an environment variable.
192 * Parameters:
193 * match A character string representing an environment variable.
194 * Returns:
195 * NULL No environment variable matches 'match'
196 * char* Pointer to string value of environment variable.
197 */
198char *kdbgetenv(const char *match)
199{
200 char **ep = __env;
201 int matchlen = strlen(match);
202 int i;
203
204 for (i = 0; i < __nenv; i++) {
205 char *e = *ep++;
206
207 if (!e)
208 continue;
209
210 if ((strncmp(match, e, matchlen) == 0)
211 && ((e[matchlen] == '\0')
212 || (e[matchlen] == '='))) {
213 char *cp = strchr(e, '=');
214 return cp ? ++cp : "";
215 }
216 }
217 return NULL;
218}
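/*
 * A minimal illustrative sketch, not part of the original file:
 * kdbgetenv() returns NULL when the variable is absent, "" when it is set
 * without a value, and the text after '=' otherwise, so callers usually
 * supply their own default.  The helper name is hypothetical.
 */
static const char *example_get_prompt(void)
{
	char *p = kdbgetenv("PROMPT");

	return (p && *p) ? p : "kdb> ";	/* fall back to a default prompt */
}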
219
220/*
221 * kdballocenv - This function is used to allocate bytes for
222 * environment entries.
223 * Parameters:
 224 * bytes The number of bytes of environment buffer space to allocate
 225 * Outputs:
 226 * None
 227 * Returns:
 228 * A pointer to the allocated space, or NULL if the buffer is exhausted.
229 * Remarks:
230 * We use a static environment buffer (envbuffer) to hold the values
231 * of dynamically generated environment variables (see kdb_set). Buffer
232 * space once allocated is never free'd, so over time, the amount of space
233 * (currently 512 bytes) will be exhausted if env variables are changed
234 * frequently.
235 */
236static char *kdballocenv(size_t bytes)
237{
238#define KDB_ENVBUFSIZE 512
239 static char envbuffer[KDB_ENVBUFSIZE];
240 static int envbufsize;
241 char *ep = NULL;
242
243 if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) {
244 ep = &envbuffer[envbufsize];
245 envbufsize += bytes;
246 }
247 return ep;
248}
249
250/*
251 * kdbgetulenv - This function will return the value of an unsigned
252 * long-valued environment variable.
253 * Parameters:
254 * match A character string representing a numeric value
255 * Outputs:
 256 * *value the unsigned long representation of the env variable 'match'
257 * Returns:
258 * Zero on success, a kdb diagnostic on failure.
259 */
260static int kdbgetulenv(const char *match, unsigned long *value)
261{
262 char *ep;
263
264 ep = kdbgetenv(match);
265 if (!ep)
266 return KDB_NOTENV;
267 if (strlen(ep) == 0)
268 return KDB_NOENVVALUE;
269
270 *value = simple_strtoul(ep, NULL, 0);
271
272 return 0;
273}
274
275/*
276 * kdbgetintenv - This function will return the value of an
277 * integer-valued environment variable.
278 * Parameters:
279 * match A character string representing an integer-valued env variable
280 * Outputs:
281 * *value the integer representation of the environment variable 'match'
282 * Returns:
283 * Zero on success, a kdb diagnostic on failure.
284 */
285int kdbgetintenv(const char *match, int *value)
286{
287 unsigned long val;
288 int diag;
289
290 diag = kdbgetulenv(match, &val);
291 if (!diag)
292 *value = (int) val;
293 return diag;
294}
295
296/*
297 * kdbgetularg - This function will convert a numeric string into an
298 * unsigned long value.
299 * Parameters:
300 * arg A character string representing a numeric value
301 * Outputs:
 302 * *value the unsigned long representation of arg.
303 * Returns:
304 * Zero on success, a kdb diagnostic on failure.
305 */
306int kdbgetularg(const char *arg, unsigned long *value)
307{
308 char *endp;
309 unsigned long val;
310
311 val = simple_strtoul(arg, &endp, 0);
312
313 if (endp == arg) {
314 /*
315 * Try base 16, for us folks too lazy to type the
316 * leading 0x...
317 */
318 val = simple_strtoul(arg, &endp, 16);
319 if (endp == arg)
320 return KDB_BADINT;
321 }
322
323 *value = val;
324
325 return 0;
326}
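/*
 * A minimal illustrative sketch, not part of the original file:
 * kdbgetularg() accepts the usual 0x/0 prefixes and, as a convenience,
 * retries bare strings as base 16, so "c000" parses the same as "0xc000".
 * A non-zero return is a kdb diagnostic (KDB_BADINT).  The helper name
 * is hypothetical.
 */
static int example_parse_addr(const char *arg)
{
	unsigned long addr;
	int diag;

	diag = kdbgetularg(arg, &addr);		/* "c000" == "0xc000" */
	if (diag)
		return diag;			/* KDB_BADINT */
	kdb_printf("parsed " kdb_machreg_fmt "\n", addr);
	return 0;
}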
327
328/*
329 * kdb_set - This function implements the 'set' command. Alter an
330 * existing environment variable or create a new one.
331 */
332int kdb_set(int argc, const char **argv)
333{
334 int i;
335 char *ep;
336 size_t varlen, vallen;
337
338 /*
339 * we can be invoked two ways:
340 * set var=value argv[1]="var", argv[2]="value"
341 * set var = value argv[1]="var", argv[2]="=", argv[3]="value"
342 * - if the latter, shift 'em down.
343 */
344 if (argc == 3) {
345 argv[2] = argv[3];
346 argc--;
347 }
348
349 if (argc != 2)
350 return KDB_ARGCOUNT;
351
352 /*
353 * Check for internal variables
354 */
355 if (strcmp(argv[1], "KDBDEBUG") == 0) {
356 unsigned int debugflags;
357 char *cp;
358
359 debugflags = simple_strtoul(argv[2], &cp, 0);
360 if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) {
361 kdb_printf("kdb: illegal debug flags '%s'\n",
362 argv[2]);
363 return 0;
364 }
365 kdb_flags = (kdb_flags &
366 ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT))
367 | (debugflags << KDB_DEBUG_FLAG_SHIFT);
368
369 return 0;
370 }
371
372 /*
373 * Tokenizer squashed the '=' sign. argv[1] is variable
374 * name, argv[2] = value.
375 */
376 varlen = strlen(argv[1]);
377 vallen = strlen(argv[2]);
378 ep = kdballocenv(varlen + vallen + 2);
379 if (ep == (char *)0)
380 return KDB_ENVBUFFULL;
381
382 sprintf(ep, "%s=%s", argv[1], argv[2]);
383
384 ep[varlen+vallen+1] = '\0';
385
386 for (i = 0; i < __nenv; i++) {
387 if (__env[i]
388 && ((strncmp(__env[i], argv[1], varlen) == 0)
389 && ((__env[i][varlen] == '\0')
390 || (__env[i][varlen] == '=')))) {
391 __env[i] = ep;
392 return 0;
393 }
394 }
395
396 /*
397 * Wasn't existing variable. Fit into slot.
398 */
399 for (i = 0; i < __nenv-1; i++) {
400 if (__env[i] == (char *)0) {
401 __env[i] = ep;
402 return 0;
403 }
404 }
405
406 return KDB_ENVFULL;
407}
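/*
 * A minimal illustrative sketch, not part of the original file: other kdb
 * code invokes kdb_set() directly by building an argv just as the parser
 * would, the same way kdb_local() below forces LOGGING on before an
 * automatic dump.  The wrapper name is hypothetical.
 */
static int example_enable_logging(void)
{
	const char *setargs[] = { "set", "LOGGING", "1" };

	/* argc counts the arguments after argv[0], as kdb_parse() does */
	return kdb_set(2, setargs);
}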
408
409static int kdb_check_regs(void)
410{
411 if (!kdb_current_regs) {
412 kdb_printf("No current kdb registers."
413 " You may need to select another task\n");
414 return KDB_BADREG;
415 }
416 return 0;
417}
418
419/*
420 * kdbgetaddrarg - This function is responsible for parsing an
421 * address-expression and returning the value of the expression,
422 * symbol name, and offset to the caller.
423 *
424 * The argument may consist of a numeric value (decimal or
 425 * hexadecimal), a symbol name, a register name (preceded by the
426 * percent sign), an environment variable with a numeric value
 427 * (preceded by a dollar sign) or a simple arithmetic expression
428 * consisting of a symbol name, +/-, and a numeric constant value
429 * (offset).
430 * Parameters:
431 * argc - count of arguments in argv
432 * argv - argument vector
433 * *nextarg - index to next unparsed argument in argv[]
434 * regs - Register state at time of KDB entry
435 * Outputs:
436 * *value - receives the value of the address-expression
437 * *offset - receives the offset specified, if any
438 * *name - receives the symbol name, if any
439 * *nextarg - index to next unparsed argument in argv[]
440 * Returns:
441 * zero is returned on success, a kdb diagnostic code is
442 * returned on error.
443 */
444int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
445 unsigned long *value, long *offset,
446 char **name)
447{
448 unsigned long addr;
449 unsigned long off = 0;
450 int positive;
451 int diag;
452 int found = 0;
453 char *symname;
454 char symbol = '\0';
455 char *cp;
456 kdb_symtab_t symtab;
457
458 /*
459 * Process arguments which follow the following syntax:
460 *
461 * symbol | numeric-address [+/- numeric-offset]
462 * %register
463 * $environment-variable
464 */
465
466 if (*nextarg > argc)
467 return KDB_ARGCOUNT;
468
469 symname = (char *)argv[*nextarg];
470
471 /*
472 * If there is no whitespace between the symbol
473 * or address and the '+' or '-' symbols, we
474 * remember the character and replace it with a
475 * null so the symbol/value can be properly parsed
476 */
477 cp = strpbrk(symname, "+-");
478 if (cp != NULL) {
479 symbol = *cp;
480 *cp++ = '\0';
481 }
482
483 if (symname[0] == '$') {
484 diag = kdbgetulenv(&symname[1], &addr);
485 if (diag)
486 return diag;
487 } else if (symname[0] == '%') {
488 diag = kdb_check_regs();
489 if (diag)
490 return diag;
491 /* Implement register values with % at a later time as it is
492 * arch optional.
493 */
494 return KDB_NOTIMP;
495 } else {
496 found = kdbgetsymval(symname, &symtab);
497 if (found) {
498 addr = symtab.sym_start;
499 } else {
500 diag = kdbgetularg(argv[*nextarg], &addr);
501 if (diag)
502 return diag;
503 }
504 }
505
506 if (!found)
507 found = kdbnearsym(addr, &symtab);
508
509 (*nextarg)++;
510
511 if (name)
512 *name = symname;
513 if (value)
514 *value = addr;
515 if (offset && name && *name)
516 *offset = addr - symtab.sym_start;
517
518 if ((*nextarg > argc)
519 && (symbol == '\0'))
520 return 0;
521
522 /*
523 * check for +/- and offset
524 */
525
526 if (symbol == '\0') {
527 if ((argv[*nextarg][0] != '+')
528 && (argv[*nextarg][0] != '-')) {
529 /*
530 * Not our argument. Return.
531 */
532 return 0;
533 } else {
534 positive = (argv[*nextarg][0] == '+');
535 (*nextarg)++;
536 }
537 } else
538 positive = (symbol == '+');
539
540 /*
541 * Now there must be an offset!
542 */
543 if ((*nextarg > argc)
544 && (symbol == '\0')) {
545 return KDB_INVADDRFMT;
546 }
547
548 if (!symbol) {
549 cp = (char *)argv[*nextarg];
550 (*nextarg)++;
551 }
552
553 diag = kdbgetularg(cp, &off);
554 if (diag)
555 return diag;
556
557 if (!positive)
558 off = -off;
559
560 if (offset)
561 *offset += off;
562
563 if (value)
564 *value += off;
565
566 return 0;
567}
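/*
 * A minimal illustrative sketch, not part of the original file: a command
 * handler hands its argv to kdbgetaddrarg() and gets back the resolved
 * value plus the symbol/offset decomposition, exactly as kdb_md() and
 * kdb_mm() below do.  The handler name is hypothetical.
 */
static int example_resolve(int argc, const char **argv)
{
	unsigned long addr;
	long offset = 0;
	char *name = NULL;
	int nextarg = 1;
	int diag;

	if (argc < 1)
		return KDB_ARGCOUNT;
	/* accepts forms such as "schedule", "schedule+0x10" or "0xc0100000" */
	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, &name);
	if (diag)
		return diag;
	kdb_printf("%s resolves to " kdb_machreg_fmt "\n", argv[1], addr);
	return 0;
}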
568
569static void kdb_cmderror(int diag)
570{
571 int i;
572
573 if (diag >= 0) {
574 kdb_printf("no error detected (diagnostic is %d)\n", diag);
575 return;
576 }
577
578 for (i = 0; i < __nkdb_err; i++) {
579 if (kdbmsgs[i].km_diag == diag) {
580 kdb_printf("diag: %d: %s\n", diag, kdbmsgs[i].km_msg);
581 return;
582 }
583 }
584
585 kdb_printf("Unknown diag %d\n", -diag);
586}
587
588/*
589 * kdb_defcmd, kdb_defcmd2 - This function implements the 'defcmd'
590 * command which defines one command as a set of other commands,
591 * terminated by endefcmd. kdb_defcmd processes the initial
592 * 'defcmd' command, kdb_defcmd2 is invoked from kdb_parse for
593 * the following commands until 'endefcmd'.
594 * Inputs:
595 * argc argument count
596 * argv argument vector
597 * Returns:
598 * zero for success, a kdb diagnostic if error
599 */
600struct defcmd_set {
601 int count;
602 int usable;
603 char *name;
604 char *usage;
605 char *help;
606 char **command;
607};
608static struct defcmd_set *defcmd_set;
609static int defcmd_set_count;
610static int defcmd_in_progress;
611
612/* Forward references */
613static int kdb_exec_defcmd(int argc, const char **argv);
614
615static int kdb_defcmd2(const char *cmdstr, const char *argv0)
616{
617 struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
618 char **save_command = s->command;
619 if (strcmp(argv0, "endefcmd") == 0) {
620 defcmd_in_progress = 0;
621 if (!s->count)
622 s->usable = 0;
623 if (s->usable)
624 kdb_register(s->name, kdb_exec_defcmd,
625 s->usage, s->help, 0);
626 return 0;
627 }
628 if (!s->usable)
629 return KDB_NOTIMP;
630 s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
631 if (!s->command) {
632 kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
633 cmdstr);
634 s->usable = 0;
635 return KDB_NOTIMP;
636 }
637 memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
638 s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB);
639 kfree(save_command);
640 return 0;
641}
642
643static int kdb_defcmd(int argc, const char **argv)
644{
645 struct defcmd_set *save_defcmd_set = defcmd_set, *s;
646 if (defcmd_in_progress) {
647 kdb_printf("kdb: nested defcmd detected, assuming missing "
648 "endefcmd\n");
649 kdb_defcmd2("endefcmd", "endefcmd");
650 }
651 if (argc == 0) {
652 int i;
653 for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) {
654 kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name,
655 s->usage, s->help);
656 for (i = 0; i < s->count; ++i)
657 kdb_printf("%s", s->command[i]);
658 kdb_printf("endefcmd\n");
659 }
660 return 0;
661 }
662 if (argc != 3)
663 return KDB_ARGCOUNT;
664 defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
665 GFP_KDB);
666 if (!defcmd_set) {
667 kdb_printf("Could not allocate new defcmd_set entry for %s\n",
668 argv[1]);
669 defcmd_set = save_defcmd_set;
670 return KDB_NOTIMP;
671 }
672 memcpy(defcmd_set, save_defcmd_set,
673 defcmd_set_count * sizeof(*defcmd_set));
674 kfree(save_defcmd_set);
675 s = defcmd_set + defcmd_set_count;
676 memset(s, 0, sizeof(*s));
677 s->usable = 1;
678 s->name = kdb_strdup(argv[1], GFP_KDB);
679 s->usage = kdb_strdup(argv[2], GFP_KDB);
680 s->help = kdb_strdup(argv[3], GFP_KDB);
681 if (s->usage[0] == '"') {
682 strcpy(s->usage, s->usage+1);
683 s->usage[strlen(s->usage)-1] = '\0';
684 }
685 if (s->help[0] == '"') {
686 strcpy(s->help, s->help+1);
687 s->help[strlen(s->help)-1] = '\0';
688 }
689 ++defcmd_set_count;
690 defcmd_in_progress = 1;
691 return 0;
692}
693
694/*
695 * kdb_exec_defcmd - Execute the set of commands associated with this
696 * defcmd name.
697 * Inputs:
698 * argc argument count
699 * argv argument vector
700 * Returns:
701 * zero for success, a kdb diagnostic if error
702 */
703static int kdb_exec_defcmd(int argc, const char **argv)
704{
705 int i, ret;
706 struct defcmd_set *s;
707 if (argc != 0)
708 return KDB_ARGCOUNT;
709 for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
710 if (strcmp(s->name, argv[0]) == 0)
711 break;
712 }
713 if (i == defcmd_set_count) {
714 kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
715 argv[0]);
716 return KDB_NOTIMP;
717 }
718 for (i = 0; i < s->count; ++i) {
719 /* Recursive use of kdb_parse, do not use argv after
720 * this point */
721 argv = NULL;
722 kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
723 ret = kdb_parse(s->command[i]);
724 if (ret)
725 return ret;
726 }
727 return 0;
728}
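/*
 * A minimal illustrative sketch, not part of the original file: feeding a
 * defcmd definition to kdb_parse() one line at a time shows the flow above.
 * The first line enters kdb_defcmd(), the body lines are captured by
 * kdb_defcmd2() while defcmd_in_progress is set, and "endefcmd" registers
 * the new command.  The macro name "dumpall" is made up for the example;
 * "lsmod" and "rd" are the commands defined later in this file.
 */
static void example_define_macro(void)
{
	kdb_parse("defcmd dumpall \"\" \"dump common state\"");
	kdb_parse("lsmod");		/* stored, not executed, by kdb_defcmd2() */
	kdb_parse("rd");
	kdb_parse("endefcmd");		/* registers "dumpall" via kdb_register() */
}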
729
730/* Command history */
731#define KDB_CMD_HISTORY_COUNT 32
732#define CMD_BUFLEN 200 /* kdb_printf: max printline
733 * size == 256 */
734static unsigned int cmd_head, cmd_tail;
735static unsigned int cmdptr;
736static char cmd_hist[KDB_CMD_HISTORY_COUNT][CMD_BUFLEN];
737static char cmd_cur[CMD_BUFLEN];
738
739/*
740 * The "str" argument may point to something like | grep xyz
741 */
742static void parse_grep(const char *str)
743{
744 int len;
745 char *cp = (char *)str, *cp2;
746
 747 /* sanity check: we should have been called with the '|' first */
748 if (*cp != '|')
749 return;
750 cp++;
751 while (isspace(*cp))
752 cp++;
753 if (strncmp(cp, "grep ", 5)) {
754 kdb_printf("invalid 'pipe', see grephelp\n");
755 return;
756 }
757 cp += 5;
758 while (isspace(*cp))
759 cp++;
760 cp2 = strchr(cp, '\n');
761 if (cp2)
762 *cp2 = '\0'; /* remove the trailing newline */
763 len = strlen(cp);
764 if (len == 0) {
765 kdb_printf("invalid 'pipe', see grephelp\n");
766 return;
767 }
768 /* now cp points to a nonzero length search string */
769 if (*cp == '"') {
 770 /* allow it to be "x y z" by removing the "'s - there must
771 be two of them */
772 cp++;
773 cp2 = strchr(cp, '"');
774 if (!cp2) {
775 kdb_printf("invalid quoted string, see grephelp\n");
776 return;
777 }
778 *cp2 = '\0'; /* end the string where the 2nd " was */
779 }
780 kdb_grep_leading = 0;
781 if (*cp == '^') {
782 kdb_grep_leading = 1;
783 cp++;
784 }
785 len = strlen(cp);
786 kdb_grep_trailing = 0;
787 if (*(cp+len-1) == '$') {
788 kdb_grep_trailing = 1;
789 *(cp+len-1) = '\0';
790 }
791 len = strlen(cp);
792 if (!len)
793 return;
794 if (len >= GREP_LEN) {
795 kdb_printf("search string too long\n");
796 return;
797 }
798 strcpy(kdb_grep_string, cp);
799 kdb_grepping_flag++;
800 return;
801}
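/*
 * A minimal illustrative sketch, not part of the original file: after
 * parse_grep() runs on the tail of a command such as 'lsmod | grep "^usb"',
 * the globals declared near the top of this file hold the decomposed
 * pattern ("usb", leading anchor set, no trailing anchor), and the
 * non-zero kdb_grepping_flag makes vkdb_printf() filter each completed
 * output line.  The helper name is hypothetical.
 */
static void example_show_grep_state(void)
{
	char pipe[] = "| grep \"^usb\"";	/* writable: parse_grep() edits in place */

	parse_grep(pipe);
	kdb_printf("pattern '%s' leading=%d trailing=%d grepping=%d\n",
		   kdb_grep_string, kdb_grep_leading, kdb_grep_trailing,
		   kdb_grepping_flag);
}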
802
803/*
804 * kdb_parse - Parse the command line, search the command table for a
805 * matching command and invoke the command function. This
806 * function may be called recursively, if it is, the second call
807 * will overwrite argv and cbuf. It is the caller's
808 * responsibility to save their argv if they recursively call
809 * kdb_parse().
810 * Parameters:
811 * cmdstr The input command line to be parsed.
812 * regs The registers at the time kdb was entered.
813 * Returns:
814 * Zero for success, a kdb diagnostic if failure.
815 * Remarks:
816 * Limited to 20 tokens.
817 *
818 * Real rudimentary tokenization. Basically only whitespace
 819 * is considered a token delimiter (but special consideration
820 * is taken of the '=' sign as used by the 'set' command).
821 *
822 * The algorithm used to tokenize the input string relies on
823 * there being at least one whitespace (or otherwise useless)
824 * character between tokens as the character immediately following
825 * the token is altered in-place to a null-byte to terminate the
826 * token string.
827 */
828
829#define MAXARGC 20
830
831int kdb_parse(const char *cmdstr)
832{
833 static char *argv[MAXARGC];
834 static int argc;
835 static char cbuf[CMD_BUFLEN+2];
836 char *cp;
837 char *cpp, quoted;
838 kdbtab_t *tp;
839 int i, escaped, ignore_errors = 0, check_grep;
840
841 /*
842 * First tokenize the command string.
843 */
844 cp = (char *)cmdstr;
845 kdb_grepping_flag = check_grep = 0;
846
847 if (KDB_FLAG(CMD_INTERRUPT)) {
848 /* Previous command was interrupted, newline must not
849 * repeat the command */
850 KDB_FLAG_CLEAR(CMD_INTERRUPT);
851 KDB_STATE_SET(PAGER);
852 argc = 0; /* no repeat */
853 }
854
855 if (*cp != '\n' && *cp != '\0') {
856 argc = 0;
857 cpp = cbuf;
858 while (*cp) {
859 /* skip whitespace */
860 while (isspace(*cp))
861 cp++;
862 if ((*cp == '\0') || (*cp == '\n') ||
863 (*cp == '#' && !defcmd_in_progress))
864 break;
865 /* special case: check for | grep pattern */
866 if (*cp == '|') {
867 check_grep++;
868 break;
869 }
870 if (cpp >= cbuf + CMD_BUFLEN) {
871 kdb_printf("kdb_parse: command buffer "
872 "overflow, command ignored\n%s\n",
873 cmdstr);
874 return KDB_NOTFOUND;
875 }
876 if (argc >= MAXARGC - 1) {
877 kdb_printf("kdb_parse: too many arguments, "
878 "command ignored\n%s\n", cmdstr);
879 return KDB_NOTFOUND;
880 }
881 argv[argc++] = cpp;
882 escaped = 0;
883 quoted = '\0';
884 /* Copy to next unquoted and unescaped
885 * whitespace or '=' */
886 while (*cp && *cp != '\n' &&
887 (escaped || quoted || !isspace(*cp))) {
888 if (cpp >= cbuf + CMD_BUFLEN)
889 break;
890 if (escaped) {
891 escaped = 0;
892 *cpp++ = *cp++;
893 continue;
894 }
895 if (*cp == '\\') {
896 escaped = 1;
897 ++cp;
898 continue;
899 }
900 if (*cp == quoted)
901 quoted = '\0';
902 else if (*cp == '\'' || *cp == '"')
903 quoted = *cp;
904 *cpp = *cp++;
905 if (*cpp == '=' && !quoted)
906 break;
907 ++cpp;
908 }
909 *cpp++ = '\0'; /* Squash a ws or '=' character */
910 }
911 }
912 if (!argc)
913 return 0;
914 if (check_grep)
915 parse_grep(cp);
916 if (defcmd_in_progress) {
917 int result = kdb_defcmd2(cmdstr, argv[0]);
918 if (!defcmd_in_progress) {
919 argc = 0; /* avoid repeat on endefcmd */
920 *(argv[0]) = '\0';
921 }
922 return result;
923 }
924 if (argv[0][0] == '-' && argv[0][1] &&
925 (argv[0][1] < '0' || argv[0][1] > '9')) {
926 ignore_errors = 1;
927 ++argv[0];
928 }
929
930 for_each_kdbcmd(tp, i) {
931 if (tp->cmd_name) {
932 /*
933 * If this command is allowed to be abbreviated,
934 * check to see if this is it.
935 */
936
937 if (tp->cmd_minlen
938 && (strlen(argv[0]) <= tp->cmd_minlen)) {
939 if (strncmp(argv[0],
940 tp->cmd_name,
941 tp->cmd_minlen) == 0) {
942 break;
943 }
944 }
945
946 if (strcmp(argv[0], tp->cmd_name) == 0)
947 break;
948 }
949 }
950
951 /*
952 * If we don't find a command by this name, see if the first
953 * few characters of this match any of the known commands.
954 * e.g., md1c20 should match md.
955 */
956 if (i == kdb_max_commands) {
957 for_each_kdbcmd(tp, i) {
958 if (tp->cmd_name) {
959 if (strncmp(argv[0],
960 tp->cmd_name,
961 strlen(tp->cmd_name)) == 0) {
962 break;
963 }
964 }
965 }
966 }
967
968 if (i < kdb_max_commands) {
969 int result;
970 KDB_STATE_SET(CMD);
971 result = (*tp->cmd_func)(argc-1, (const char **)argv);
972 if (result && ignore_errors && result > KDB_CMD_GO)
973 result = 0;
974 KDB_STATE_CLEAR(CMD);
975 switch (tp->cmd_repeat) {
976 case KDB_REPEAT_NONE:
977 argc = 0;
978 if (argv[0])
979 *(argv[0]) = '\0';
980 break;
981 case KDB_REPEAT_NO_ARGS:
982 argc = 1;
983 if (argv[1])
984 *(argv[1]) = '\0';
985 break;
986 case KDB_REPEAT_WITH_ARGS:
987 break;
988 }
989 return result;
990 }
991
992 /*
993 * If the input with which we were presented does not
994 * map to an existing command, attempt to parse it as an
995 * address argument and display the result. Useful for
996 * obtaining the address of a variable, or the nearest symbol
997 * to an address contained in a register.
998 */
999 {
1000 unsigned long value;
1001 char *name = NULL;
1002 long offset;
1003 int nextarg = 0;
1004
1005 if (kdbgetaddrarg(0, (const char **)argv, &nextarg,
1006 &value, &offset, &name)) {
1007 return KDB_NOTFOUND;
1008 }
1009
1010 kdb_printf("%s = ", argv[0]);
1011 kdb_symbol_print(value, NULL, KDB_SP_DEFAULT);
1012 kdb_printf("\n");
1013 return 0;
1014 }
1015}
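/*
 * A minimal illustrative sketch, not part of the original file: two
 * properties of kdb_parse() that are easy to miss are that a prefix match
 * lets "md1c20" reach the md handler, and that input matching no command
 * at all is treated as an address expression and printed symbolically.
 * "jiffies" is just an example of a well-known kernel symbol.
 */
static void example_parse(void)
{
	kdb_parse("md1c20 jiffies");	/* prefix-matches the "md" command */
	kdb_parse("jiffies");		/* no command: resolved via kdbgetaddrarg() */
}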
1016
1017
1018static int handle_ctrl_cmd(char *cmd)
1019{
1020#define CTRL_P 16
1021#define CTRL_N 14
1022
1023 /* initial situation */
1024 if (cmd_head == cmd_tail)
1025 return 0;
1026 switch (*cmd) {
1027 case CTRL_P:
1028 if (cmdptr != cmd_tail)
1029 cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT;
1030 strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
1031 return 1;
1032 case CTRL_N:
1033 if (cmdptr != cmd_head)
1034 cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT;
1035 strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
1036 return 1;
1037 }
1038 return 0;
1039}
1040
1041/*
1042 * kdb_reboot - This function implements the 'reboot' command. Reboot
 1043 * the system immediately, or loop forever on failure.
1044 */
1045static int kdb_reboot(int argc, const char **argv)
1046{
1047 emergency_restart();
1048 kdb_printf("Hmm, kdb_reboot did not reboot, spinning here\n");
1049 while (1)
1050 cpu_relax();
1051 /* NOTREACHED */
1052 return 0;
1053}
1054
1055static void kdb_dumpregs(struct pt_regs *regs)
1056{
1057 int old_lvl = console_loglevel;
1058 console_loglevel = 15;
1059 kdb_trap_printk++;
1060 show_regs(regs);
1061 kdb_trap_printk--;
1062 kdb_printf("\n");
1063 console_loglevel = old_lvl;
1064}
1065
1066void kdb_set_current_task(struct task_struct *p)
1067{
1068 kdb_current_task = p;
1069
1070 if (kdb_task_has_cpu(p)) {
1071 kdb_current_regs = KDB_TSKREGS(kdb_process_cpu(p));
1072 return;
1073 }
1074 kdb_current_regs = NULL;
1075}
1076
1077/*
1078 * kdb_local - The main code for kdb. This routine is invoked on a
1079 * specific processor, it is not global. The main kdb() routine
1080 * ensures that only one processor at a time is in this routine.
1081 * This code is called with the real reason code on the first
1082 * entry to a kdb session, thereafter it is called with reason
1083 * SWITCH, even if the user goes back to the original cpu.
1084 * Inputs:
1085 * reason The reason KDB was invoked
1086 * error The hardware-defined error code
1087 * regs The exception frame at time of fault/breakpoint.
1088 * db_result Result code from the break or debug point.
1089 * Returns:
 1090 * 0 KDB was invoked for an event for which it was not responsible
1091 * 1 KDB handled the event for which it was invoked.
1092 * KDB_CMD_GO User typed 'go'.
1093 * KDB_CMD_CPU User switched to another cpu.
1094 * KDB_CMD_SS Single step.
1095 * KDB_CMD_SSB Single step until branch.
1096 */
1097static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1098 kdb_dbtrap_t db_result)
1099{
1100 char *cmdbuf;
1101 int diag;
1102 struct task_struct *kdb_current =
1103 kdb_curr_task(raw_smp_processor_id());
1104
1105 KDB_DEBUG_STATE("kdb_local 1", reason);
1106 kdb_go_count = 0;
1107 if (reason == KDB_REASON_DEBUG) {
1108 /* special case below */
1109 } else {
1110 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
1111 kdb_current, kdb_current->pid);
1112#if defined(CONFIG_SMP)
1113 kdb_printf("on processor %d ", raw_smp_processor_id());
1114#endif
1115 }
1116
1117 switch (reason) {
1118 case KDB_REASON_DEBUG:
1119 {
1120 /*
1121 * If re-entering kdb after a single step
1122 * command, don't print the message.
1123 */
1124 switch (db_result) {
1125 case KDB_DB_BPT:
1126 kdb_printf("\nEntering kdb (0x%p, pid %d) ",
1127 kdb_current, kdb_current->pid);
1128#if defined(CONFIG_SMP)
1129 kdb_printf("on processor %d ", raw_smp_processor_id());
1130#endif
1131 kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
1132 instruction_pointer(regs));
1133 break;
1134 case KDB_DB_SSB:
1135 /*
1136 * In the midst of ssb command. Just return.
1137 */
1138 KDB_DEBUG_STATE("kdb_local 3", reason);
1139 return KDB_CMD_SSB; /* Continue with SSB command */
1140
1141 break;
1142 case KDB_DB_SS:
1143 break;
1144 case KDB_DB_SSBPT:
1145 KDB_DEBUG_STATE("kdb_local 4", reason);
1146 return 1; /* kdba_db_trap did the work */
1147 default:
1148 kdb_printf("kdb: Bad result from kdba_db_trap: %d\n",
1149 db_result);
1150 break;
1151 }
1152
1153 }
1154 break;
1155 case KDB_REASON_ENTER:
1156 if (KDB_STATE(KEYBOARD))
1157 kdb_printf("due to Keyboard Entry\n");
1158 else
1159 kdb_printf("due to KDB_ENTER()\n");
1160 break;
1161 case KDB_REASON_KEYBOARD:
1162 KDB_STATE_SET(KEYBOARD);
1163 kdb_printf("due to Keyboard Entry\n");
1164 break;
1165 case KDB_REASON_ENTER_SLAVE:
1166 /* drop through, slaves only get released via cpu switch */
1167 case KDB_REASON_SWITCH:
1168 kdb_printf("due to cpu switch\n");
1169 break;
1170 case KDB_REASON_OOPS:
1171 kdb_printf("Oops: %s\n", kdb_diemsg);
1172 kdb_printf("due to oops @ " kdb_machreg_fmt "\n",
1173 instruction_pointer(regs));
1174 kdb_dumpregs(regs);
1175 break;
1176 case KDB_REASON_NMI:
1177 kdb_printf("due to NonMaskable Interrupt @ "
1178 kdb_machreg_fmt "\n",
1179 instruction_pointer(regs));
1180 kdb_dumpregs(regs);
1181 break;
1182 case KDB_REASON_SSTEP:
1183 case KDB_REASON_BREAK:
1184 kdb_printf("due to %s @ " kdb_machreg_fmt "\n",
1185 reason == KDB_REASON_BREAK ?
1186 "Breakpoint" : "SS trap", instruction_pointer(regs));
1187 /*
1188 * Determine if this breakpoint is one that we
1189 * are interested in.
1190 */
1191 if (db_result != KDB_DB_BPT) {
1192 kdb_printf("kdb: error return from kdba_bp_trap: %d\n",
1193 db_result);
1194 KDB_DEBUG_STATE("kdb_local 6", reason);
1195 return 0; /* Not for us, dismiss it */
1196 }
1197 break;
1198 case KDB_REASON_RECURSE:
1199 kdb_printf("due to Recursion @ " kdb_machreg_fmt "\n",
1200 instruction_pointer(regs));
1201 break;
1202 default:
1203 kdb_printf("kdb: unexpected reason code: %d\n", reason);
1204 KDB_DEBUG_STATE("kdb_local 8", reason);
1205 return 0; /* Not for us, dismiss it */
1206 }
1207
1208 while (1) {
1209 /*
1210 * Initialize pager context.
1211 */
1212 kdb_nextline = 1;
1213 KDB_STATE_CLEAR(SUPPRESS);
1214
1215 cmdbuf = cmd_cur;
1216 *cmdbuf = '\0';
1217 *(cmd_hist[cmd_head]) = '\0';
1218
1219 if (KDB_FLAG(ONLY_DO_DUMP)) {
1220 /* kdb is off but a catastrophic error requires a dump.
1221 * Take the dump and reboot.
1222 * Turn on logging so the kdb output appears in the log
1223 * buffer in the dump.
1224 */
1225 const char *setargs[] = { "set", "LOGGING", "1" };
1226 kdb_set(2, setargs);
1227 kdb_reboot(0, NULL);
1228 /*NOTREACHED*/
1229 }
1230
1231do_full_getstr:
1232#if defined(CONFIG_SMP)
1233 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
1234 raw_smp_processor_id());
1235#else
1236 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"));
1237#endif
1238 if (defcmd_in_progress)
1239 strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
1240
1241 /*
1242 * Fetch command from keyboard
1243 */
1244 cmdbuf = kdb_getstr(cmdbuf, CMD_BUFLEN, kdb_prompt_str);
1245 if (*cmdbuf != '\n') {
1246 if (*cmdbuf < 32) {
1247 if (cmdptr == cmd_head) {
1248 strncpy(cmd_hist[cmd_head], cmd_cur,
1249 CMD_BUFLEN);
1250 *(cmd_hist[cmd_head] +
1251 strlen(cmd_hist[cmd_head])-1) = '\0';
1252 }
1253 if (!handle_ctrl_cmd(cmdbuf))
1254 *(cmd_cur+strlen(cmd_cur)-1) = '\0';
1255 cmdbuf = cmd_cur;
1256 goto do_full_getstr;
1257 } else {
1258 strncpy(cmd_hist[cmd_head], cmd_cur,
1259 CMD_BUFLEN);
1260 }
1261
1262 cmd_head = (cmd_head+1) % KDB_CMD_HISTORY_COUNT;
1263 if (cmd_head == cmd_tail)
1264 cmd_tail = (cmd_tail+1) % KDB_CMD_HISTORY_COUNT;
1265 }
1266
1267 cmdptr = cmd_head;
1268 diag = kdb_parse(cmdbuf);
1269 if (diag == KDB_NOTFOUND) {
1270 kdb_printf("Unknown kdb command: '%s'\n", cmdbuf);
1271 diag = 0;
1272 }
1273 if (diag == KDB_CMD_GO
1274 || diag == KDB_CMD_CPU
1275 || diag == KDB_CMD_SS
1276 || diag == KDB_CMD_SSB
1277 || diag == KDB_CMD_KGDB)
1278 break;
1279
1280 if (diag)
1281 kdb_cmderror(diag);
1282 }
1283 KDB_DEBUG_STATE("kdb_local 9", diag);
1284 return diag;
1285}
1286
1287
1288/*
1289 * kdb_print_state - Print the state data for the current processor
1290 * for debugging.
1291 * Inputs:
1292 * text Identifies the debug point
1293 * value Any integer value to be printed, e.g. reason code.
1294 */
1295void kdb_print_state(const char *text, int value)
1296{
1297 kdb_printf("state: %s cpu %d value %d initial %d state %x\n",
1298 text, raw_smp_processor_id(), value, kdb_initial_cpu,
1299 kdb_state);
1300}
1301
1302/*
1303 * kdb_main_loop - After initial setup and assignment of the
1304 * controlling cpu, all cpus are in this loop. One cpu is in
1305 * control and will issue the kdb prompt, the others will spin
1306 * until 'go' or cpu switch.
1307 *
1308 * To get a consistent view of the kernel stacks for all
1309 * processes, this routine is invoked from the main kdb code via
1310 * an architecture specific routine. kdba_main_loop is
1311 * responsible for making the kernel stacks consistent for all
 1312 * processes; there should be no difference between a blocked
1313 * process and a running process as far as kdb is concerned.
1314 * Inputs:
1315 * reason The reason KDB was invoked
1316 * error The hardware-defined error code
1317 * reason2 kdb's current reason code.
1318 * Initially error but can change
 1319 * according to kdb state.
1320 * db_result Result code from break or debug point.
1321 * regs The exception frame at time of fault/breakpoint.
1322 * should always be valid.
1323 * Returns:
 1324 * 0 KDB was invoked for an event for which it was not responsible
1325 * 1 KDB handled the event for which it was invoked.
1326 */
1327int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1328 kdb_dbtrap_t db_result, struct pt_regs *regs)
1329{
1330 int result = 1;
1331 /* Stay in kdb() until 'go', 'ss[b]' or an error */
1332 while (1) {
1333 /*
1334 * All processors except the one that is in control
1335 * will spin here.
1336 */
1337 KDB_DEBUG_STATE("kdb_main_loop 1", reason);
1338 while (KDB_STATE(HOLD_CPU)) {
1339 /* state KDB is turned off by kdb_cpu to see if the
1340 * other cpus are still live, each cpu in this loop
1341 * turns it back on.
1342 */
1343 if (!KDB_STATE(KDB))
1344 KDB_STATE_SET(KDB);
1345 }
1346
1347 KDB_STATE_CLEAR(SUPPRESS);
1348 KDB_DEBUG_STATE("kdb_main_loop 2", reason);
1349 if (KDB_STATE(LEAVING))
1350 break; /* Another cpu said 'go' */
1351 /* Still using kdb, this processor is in control */
1352 result = kdb_local(reason2, error, regs, db_result);
1353 KDB_DEBUG_STATE("kdb_main_loop 3", result);
1354
1355 if (result == KDB_CMD_CPU)
1356 break;
1357
1358 if (result == KDB_CMD_SS) {
1359 KDB_STATE_SET(DOING_SS);
1360 break;
1361 }
1362
1363 if (result == KDB_CMD_SSB) {
1364 KDB_STATE_SET(DOING_SS);
1365 KDB_STATE_SET(DOING_SSB);
1366 break;
1367 }
1368
1369 if (result == KDB_CMD_KGDB) {
1370 if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)))
1371 kdb_printf("Entering please attach debugger "
1372 "or use $D#44+ or $3#33\n");
1373 break;
1374 }
1375 if (result && result != 1 && result != KDB_CMD_GO)
1376 kdb_printf("\nUnexpected kdb_local return code %d\n",
1377 result);
1378 KDB_DEBUG_STATE("kdb_main_loop 4", reason);
1379 break;
1380 }
1381 if (KDB_STATE(DOING_SS))
1382 KDB_STATE_CLEAR(SSBPT);
1383
1384 return result;
1385}
1386
1387/*
1388 * kdb_mdr - This function implements the guts of the 'mdr', memory
1389 * read command.
1390 * mdr <addr arg>,<byte count>
1391 * Inputs:
1392 * addr Start address
1393 * count Number of bytes
1394 * Returns:
1395 * Always 0. Any errors are detected and printed by kdb_getarea.
1396 */
1397static int kdb_mdr(unsigned long addr, unsigned int count)
1398{
1399 unsigned char c;
1400 while (count--) {
1401 if (kdb_getarea(c, addr))
1402 return 0;
1403 kdb_printf("%02x", c);
1404 addr++;
1405 }
1406 kdb_printf("\n");
1407 return 0;
1408}
1409
1410/*
1411 * kdb_md - This function implements the 'md', 'md1', 'md2', 'md4',
1412 * 'md8' 'mdr' and 'mds' commands.
1413 *
1414 * md|mds [<addr arg> [<line count> [<radix>]]]
1415 * mdWcN [<addr arg> [<line count> [<radix>]]]
1416 * where W = is the width (1, 2, 4 or 8) and N is the count.
 1417 * e.g., md1c20 reads 20 bytes, 1 at a time.
1418 * mdr <addr arg>,<byte count>
1419 */
1420static void kdb_md_line(const char *fmtstr, unsigned long addr,
1421 int symbolic, int nosect, int bytesperword,
1422 int num, int repeat, int phys)
1423{
1424 /* print just one line of data */
1425 kdb_symtab_t symtab;
1426 char cbuf[32];
1427 char *c = cbuf;
1428 int i;
1429 unsigned long word;
1430
1431 memset(cbuf, '\0', sizeof(cbuf));
1432 if (phys)
1433 kdb_printf("phys " kdb_machreg_fmt0 " ", addr);
1434 else
1435 kdb_printf(kdb_machreg_fmt0 " ", addr);
1436
1437 for (i = 0; i < num && repeat--; i++) {
1438 if (phys) {
1439 if (kdb_getphysword(&word, addr, bytesperword))
1440 break;
1441 } else if (kdb_getword(&word, addr, bytesperword))
1442 break;
1443 kdb_printf(fmtstr, word);
1444 if (symbolic)
1445 kdbnearsym(word, &symtab);
1446 else
1447 memset(&symtab, 0, sizeof(symtab));
1448 if (symtab.sym_name) {
1449 kdb_symbol_print(word, &symtab, 0);
1450 if (!nosect) {
1451 kdb_printf("\n");
1452 kdb_printf(" %s %s "
1453 kdb_machreg_fmt " "
1454 kdb_machreg_fmt " "
1455 kdb_machreg_fmt, symtab.mod_name,
1456 symtab.sec_name, symtab.sec_start,
1457 symtab.sym_start, symtab.sym_end);
1458 }
1459 addr += bytesperword;
1460 } else {
1461 union {
1462 u64 word;
1463 unsigned char c[8];
1464 } wc;
1465 unsigned char *cp;
1466#ifdef __BIG_ENDIAN
1467 cp = wc.c + 8 - bytesperword;
1468#else
1469 cp = wc.c;
1470#endif
1471 wc.word = word;
1472#define printable_char(c) \
1473 ({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; })
1474 switch (bytesperword) {
1475 case 8:
1476 *c++ = printable_char(*cp++);
1477 *c++ = printable_char(*cp++);
1478 *c++ = printable_char(*cp++);
1479 *c++ = printable_char(*cp++);
1480 addr += 4;
1481 case 4:
1482 *c++ = printable_char(*cp++);
1483 *c++ = printable_char(*cp++);
1484 addr += 2;
1485 case 2:
1486 *c++ = printable_char(*cp++);
1487 addr++;
1488 case 1:
1489 *c++ = printable_char(*cp++);
1490 addr++;
1491 break;
1492 }
1493#undef printable_char
1494 }
1495 }
1496 kdb_printf("%*s %s\n", (int)((num-i)*(2*bytesperword + 1)+1),
1497 " ", cbuf);
1498}
1499
1500static int kdb_md(int argc, const char **argv)
1501{
1502 static unsigned long last_addr;
1503 static int last_radix, last_bytesperword, last_repeat;
1504 int radix = 16, mdcount = 8, bytesperword = KDB_WORD_SIZE, repeat;
1505 int nosect = 0;
1506 char fmtchar, fmtstr[64];
1507 unsigned long addr;
1508 unsigned long word;
1509 long offset = 0;
1510 int symbolic = 0;
1511 int valid = 0;
1512 int phys = 0;
1513
1514 kdbgetintenv("MDCOUNT", &mdcount);
1515 kdbgetintenv("RADIX", &radix);
1516 kdbgetintenv("BYTESPERWORD", &bytesperword);
1517
1518 /* Assume 'md <addr>' and start with environment values */
1519 repeat = mdcount * 16 / bytesperword;
1520
1521 if (strcmp(argv[0], "mdr") == 0) {
1522 if (argc != 2)
1523 return KDB_ARGCOUNT;
1524 valid = 1;
1525 } else if (isdigit(argv[0][2])) {
1526 bytesperword = (int)(argv[0][2] - '0');
1527 if (bytesperword == 0) {
1528 bytesperword = last_bytesperword;
1529 if (bytesperword == 0)
1530 bytesperword = 4;
1531 }
1532 last_bytesperword = bytesperword;
1533 repeat = mdcount * 16 / bytesperword;
1534 if (!argv[0][3])
1535 valid = 1;
1536 else if (argv[0][3] == 'c' && argv[0][4]) {
1537 char *p;
1538 repeat = simple_strtoul(argv[0] + 4, &p, 10);
1539 mdcount = ((repeat * bytesperword) + 15) / 16;
1540 valid = !*p;
1541 }
1542 last_repeat = repeat;
1543 } else if (strcmp(argv[0], "md") == 0)
1544 valid = 1;
1545 else if (strcmp(argv[0], "mds") == 0)
1546 valid = 1;
1547 else if (strcmp(argv[0], "mdp") == 0) {
1548 phys = valid = 1;
1549 }
1550 if (!valid)
1551 return KDB_NOTFOUND;
1552
1553 if (argc == 0) {
1554 if (last_addr == 0)
1555 return KDB_ARGCOUNT;
1556 addr = last_addr;
1557 radix = last_radix;
1558 bytesperword = last_bytesperword;
1559 repeat = last_repeat;
1560 mdcount = ((repeat * bytesperword) + 15) / 16;
1561 }
1562
1563 if (argc) {
1564 unsigned long val;
1565 int diag, nextarg = 1;
1566 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
1567 &offset, NULL);
1568 if (diag)
1569 return diag;
1570 if (argc > nextarg+2)
1571 return KDB_ARGCOUNT;
1572
1573 if (argc >= nextarg) {
1574 diag = kdbgetularg(argv[nextarg], &val);
1575 if (!diag) {
1576 mdcount = (int) val;
1577 repeat = mdcount * 16 / bytesperword;
1578 }
1579 }
1580 if (argc >= nextarg+1) {
1581 diag = kdbgetularg(argv[nextarg+1], &val);
1582 if (!diag)
1583 radix = (int) val;
1584 }
1585 }
1586
1587 if (strcmp(argv[0], "mdr") == 0)
1588 return kdb_mdr(addr, mdcount);
1589
1590 switch (radix) {
1591 case 10:
1592 fmtchar = 'd';
1593 break;
1594 case 16:
1595 fmtchar = 'x';
1596 break;
1597 case 8:
1598 fmtchar = 'o';
1599 break;
1600 default:
1601 return KDB_BADRADIX;
1602 }
1603
1604 last_radix = radix;
1605
1606 if (bytesperword > KDB_WORD_SIZE)
1607 return KDB_BADWIDTH;
1608
1609 switch (bytesperword) {
1610 case 8:
1611 sprintf(fmtstr, "%%16.16l%c ", fmtchar);
1612 break;
1613 case 4:
1614 sprintf(fmtstr, "%%8.8l%c ", fmtchar);
1615 break;
1616 case 2:
1617 sprintf(fmtstr, "%%4.4l%c ", fmtchar);
1618 break;
1619 case 1:
1620 sprintf(fmtstr, "%%2.2l%c ", fmtchar);
1621 break;
1622 default:
1623 return KDB_BADWIDTH;
1624 }
1625
1626 last_repeat = repeat;
1627 last_bytesperword = bytesperword;
1628
1629 if (strcmp(argv[0], "mds") == 0) {
1630 symbolic = 1;
1631 /* Do not save these changes as last_*, they are temporary mds
1632 * overrides.
1633 */
1634 bytesperword = KDB_WORD_SIZE;
1635 repeat = mdcount;
1636 kdbgetintenv("NOSECT", &nosect);
1637 }
1638
1639 /* Round address down modulo BYTESPERWORD */
1640
1641 addr &= ~(bytesperword-1);
1642
1643 while (repeat > 0) {
1644 unsigned long a;
1645 int n, z, num = (symbolic ? 1 : (16 / bytesperword));
1646
1647 if (KDB_FLAG(CMD_INTERRUPT))
1648 return 0;
1649 for (a = addr, z = 0; z < repeat; a += bytesperword, ++z) {
1650 if (phys) {
1651 if (kdb_getphysword(&word, a, bytesperword)
1652 || word)
1653 break;
1654 } else if (kdb_getword(&word, a, bytesperword) || word)
1655 break;
1656 }
1657 n = min(num, repeat);
1658 kdb_md_line(fmtstr, addr, symbolic, nosect, bytesperword,
1659 num, repeat, phys);
1660 addr += bytesperword * n;
1661 repeat -= n;
1662 z = (z + num - 1) / num;
1663 if (z > 2) {
1664 int s = num * (z-2);
1665 kdb_printf(kdb_machreg_fmt0 "-" kdb_machreg_fmt0
1666 " zero suppressed\n",
1667 addr, addr + bytesperword * s - 1);
1668 addr += bytesperword * s;
1669 repeat -= s;
1670 }
1671 }
1672 last_addr = addr;
1673
1674 return 0;
1675}
1676
1677/*
1678 * kdb_mm - This function implements the 'mm' command.
1679 * mm address-expression new-value
1680 * Remarks:
 1681 * mm works on machine words, mmW works on W-byte values.
1682 */
1683static int kdb_mm(int argc, const char **argv)
1684{
1685 int diag;
1686 unsigned long addr;
1687 long offset = 0;
1688 unsigned long contents;
1689 int nextarg;
1690 int width;
1691
1692 if (argv[0][2] && !isdigit(argv[0][2]))
1693 return KDB_NOTFOUND;
1694
1695 if (argc < 2)
1696 return KDB_ARGCOUNT;
1697
1698 nextarg = 1;
1699 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
1700 if (diag)
1701 return diag;
1702
1703 if (nextarg > argc)
1704 return KDB_ARGCOUNT;
1705 diag = kdbgetaddrarg(argc, argv, &nextarg, &contents, NULL, NULL);
1706 if (diag)
1707 return diag;
1708
1709 if (nextarg != argc + 1)
1710 return KDB_ARGCOUNT;
1711
1712 width = argv[0][2] ? (argv[0][2] - '0') : (KDB_WORD_SIZE);
1713 diag = kdb_putword(addr, contents, width);
1714 if (diag)
1715 return diag;
1716
1717 kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n", addr, contents);
1718
1719 return 0;
1720}
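/*
 * A minimal illustrative sketch, not part of the original file: kdb_mm()
 * boils down to a kdb_putword() of 'width' bytes, and reading the value
 * back with kdb_getword() (as kdb_md_line() does) is the natural way to
 * confirm the store.  The address and value here are placeholders and the
 * helper name is hypothetical.
 */
static int example_poke_and_verify(unsigned long addr, unsigned long val)
{
	unsigned long readback;
	int diag;

	diag = kdb_putword(addr, val, KDB_WORD_SIZE);
	if (diag)
		return diag;
	if (!kdb_getword(&readback, addr, KDB_WORD_SIZE))
		kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n",
			   addr, readback);
	return 0;
}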
1721
1722/*
1723 * kdb_go - This function implements the 'go' command.
1724 * go [address-expression]
1725 */
1726static int kdb_go(int argc, const char **argv)
1727{
1728 unsigned long addr;
1729 int diag;
1730 int nextarg;
1731 long offset;
1732
1733 if (argc == 1) {
1734 if (raw_smp_processor_id() != kdb_initial_cpu) {
1735 kdb_printf("go <address> must be issued from the "
1736 "initial cpu, do cpu %d first\n",
1737 kdb_initial_cpu);
1738 return KDB_ARGCOUNT;
1739 }
1740 nextarg = 1;
1741 diag = kdbgetaddrarg(argc, argv, &nextarg,
1742 &addr, &offset, NULL);
1743 if (diag)
1744 return diag;
1745 } else if (argc) {
1746 return KDB_ARGCOUNT;
1747 }
1748
1749 diag = KDB_CMD_GO;
1750 if (KDB_FLAG(CATASTROPHIC)) {
1751 kdb_printf("Catastrophic error detected\n");
1752 kdb_printf("kdb_continue_catastrophic=%d, ",
1753 kdb_continue_catastrophic);
1754 if (kdb_continue_catastrophic == 0 && kdb_go_count++ == 0) {
1755 kdb_printf("type go a second time if you really want "
1756 "to continue\n");
1757 return 0;
1758 }
1759 if (kdb_continue_catastrophic == 2) {
1760 kdb_printf("forcing reboot\n");
1761 kdb_reboot(0, NULL);
1762 }
1763 kdb_printf("attempting to continue\n");
1764 }
1765 return diag;
1766}
1767
1768/*
1769 * kdb_rd - This function implements the 'rd' command.
1770 */
1771static int kdb_rd(int argc, const char **argv)
1772{
1773 int diag = kdb_check_regs();
1774 if (diag)
1775 return diag;
1776
1777 kdb_dumpregs(kdb_current_regs);
1778 return 0;
1779}
1780
1781/*
1782 * kdb_rm - This function implements the 'rm' (register modify) command.
1783 * rm register-name new-contents
1784 * Remarks:
1785 * Currently doesn't allow modification of control or
1786 * debug registers.
1787 */
1788static int kdb_rm(int argc, const char **argv)
1789{
1790 int diag;
1791 int ind = 0;
1792 unsigned long contents;
1793
1794 if (argc != 2)
1795 return KDB_ARGCOUNT;
1796 /*
1797 * Allow presence or absence of leading '%' symbol.
1798 */
1799 if (argv[1][0] == '%')
1800 ind = 1;
1801
1802 diag = kdbgetularg(argv[2], &contents);
1803 if (diag)
1804 return diag;
1805
1806 diag = kdb_check_regs();
1807 if (diag)
1808 return diag;
1809 kdb_printf("ERROR: Register set currently not implemented\n");
1810 return 0;
1811}
1812
1813#if defined(CONFIG_MAGIC_SYSRQ)
1814/*
1815 * kdb_sr - This function implements the 'sr' (SYSRQ key) command
1816 * which interfaces to the soi-disant MAGIC SYSRQ functionality.
1817 * sr <magic-sysrq-code>
1818 */
1819static int kdb_sr(int argc, const char **argv)
1820{
1821 if (argc != 1)
1822 return KDB_ARGCOUNT;
1823 kdb_trap_printk++;
1824 __handle_sysrq(*argv[1], NULL, 0);
1825 kdb_trap_printk--;
1826
1827 return 0;
1828}
1829#endif /* CONFIG_MAGIC_SYSRQ */
1830
1831/*
1832 * kdb_ef - This function implements the 'regs' (display exception
1833 * frame) command. This command takes an address and expects to
1834 * find an exception frame at that address, formats and prints
1835 * it.
1836 * regs address-expression
1837 * Remarks:
1838 * Not done yet.
1839 */
1840static int kdb_ef(int argc, const char **argv)
1841{
1842 int diag;
1843 unsigned long addr;
1844 long offset;
1845 int nextarg;
1846
1847 if (argc != 1)
1848 return KDB_ARGCOUNT;
1849
1850 nextarg = 1;
1851 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
1852 if (diag)
1853 return diag;
1854 show_regs((struct pt_regs *)addr);
1855 return 0;
1856}
1857
1858#if defined(CONFIG_MODULES)
1859/*
1860 * kdb_lsmod - This function implements the 'lsmod' command. Lists
1861 * currently loaded kernel modules.
1862 * Mostly taken from userland lsmod.
1863 */
1864static int kdb_lsmod(int argc, const char **argv)
1865{
1866 struct module *mod;
1867
1868 if (argc != 0)
1869 return KDB_ARGCOUNT;
1870
1871 kdb_printf("Module Size modstruct Used by\n");
1872 list_for_each_entry(mod, kdb_modules, list) {
1873
1874 kdb_printf("%-20s%8u 0x%p ", mod->name,
1875 mod->core_size, (void *)mod);
1876#ifdef CONFIG_MODULE_UNLOAD
1877 kdb_printf("%4d ", module_refcount(mod));
1878#endif
1879 if (mod->state == MODULE_STATE_GOING)
1880 kdb_printf(" (Unloading)");
1881 else if (mod->state == MODULE_STATE_COMING)
1882 kdb_printf(" (Loading)");
1883 else
1884 kdb_printf(" (Live)");
1885 kdb_printf(" 0x%p", mod->module_core);
1886
1887#ifdef CONFIG_MODULE_UNLOAD
1888 {
1889 struct module_use *use;
1890 kdb_printf(" [ ");
1891 list_for_each_entry(use, &mod->source_list,
1892 source_list)
1893 kdb_printf("%s ", use->target->name);
1894 kdb_printf("]\n");
1895 }
1896#endif
1897 }
1898
1899 return 0;
1900}
1901
1902#endif /* CONFIG_MODULES */
1903
1904/*
1905 * kdb_env - This function implements the 'env' command. Display the
1906 * current environment variables.
1907 */
1908
1909static int kdb_env(int argc, const char **argv)
1910{
1911 int i;
1912
1913 for (i = 0; i < __nenv; i++) {
1914 if (__env[i])
1915 kdb_printf("%s\n", __env[i]);
1916 }
1917
1918 if (KDB_DEBUG(MASK))
1919 kdb_printf("KDBFLAGS=0x%x\n", kdb_flags);
1920
1921 return 0;
1922}
1923
1924#ifdef CONFIG_PRINTK
1925/*
1926 * kdb_dmesg - This function implements the 'dmesg' command to display
1927 * the contents of the syslog buffer.
1928 * dmesg [lines] [adjust]
1929 */
1930static int kdb_dmesg(int argc, const char **argv)
1931{
1932 char *syslog_data[4], *start, *end, c = '\0', *p;
1933 int diag, logging, logsize, lines = 0, adjust = 0, n;
1934
1935 if (argc > 2)
1936 return KDB_ARGCOUNT;
1937 if (argc) {
1938 char *cp;
1939 lines = simple_strtol(argv[1], &cp, 0);
1940 if (*cp)
1941 lines = 0;
1942 if (argc > 1) {
1943 adjust = simple_strtoul(argv[2], &cp, 0);
1944 if (*cp || adjust < 0)
1945 adjust = 0;
1946 }
1947 }
1948
1949 /* disable LOGGING if set */
1950 diag = kdbgetintenv("LOGGING", &logging);
1951 if (!diag && logging) {
1952 const char *setargs[] = { "set", "LOGGING", "0" };
1953 kdb_set(2, setargs);
1954 }
1955
1956 /* syslog_data[0,1] physical start, end+1. syslog_data[2,3]
1957 * logical start, end+1. */
1958 kdb_syslog_data(syslog_data);
1959 if (syslog_data[2] == syslog_data[3])
1960 return 0;
1961 logsize = syslog_data[1] - syslog_data[0];
1962 start = syslog_data[2];
1963 end = syslog_data[3];
1964#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
1965 for (n = 0, p = start; p < end; ++p) {
1966 c = *KDB_WRAP(p);
1967 if (c == '\n')
1968 ++n;
1969 }
1970 if (c != '\n')
1971 ++n;
1972 if (lines < 0) {
1973 if (adjust >= n)
1974 kdb_printf("buffer only contains %d lines, nothing "
1975 "printed\n", n);
1976 else if (adjust - lines >= n)
1977 kdb_printf("buffer only contains %d lines, last %d "
1978 "lines printed\n", n, n - adjust);
1979 if (adjust) {
1980 for (; start < end && adjust; ++start) {
1981 if (*KDB_WRAP(start) == '\n')
1982 --adjust;
1983 }
1984 if (start < end)
1985 ++start;
1986 }
1987 for (p = start; p < end && lines; ++p) {
1988 if (*KDB_WRAP(p) == '\n')
1989 ++lines;
1990 }
1991 end = p;
1992 } else if (lines > 0) {
1993 int skip = n - (adjust + lines);
1994 if (adjust >= n) {
1995 kdb_printf("buffer only contains %d lines, "
1996 "nothing printed\n", n);
1997 skip = n;
1998 } else if (skip < 0) {
1999 lines += skip;
2000 skip = 0;
2001 kdb_printf("buffer only contains %d lines, first "
2002 "%d lines printed\n", n, lines);
2003 }
2004 for (; start < end && skip; ++start) {
2005 if (*KDB_WRAP(start) == '\n')
2006 --skip;
2007 }
2008 for (p = start; p < end && lines; ++p) {
2009 if (*KDB_WRAP(p) == '\n')
2010 --lines;
2011 }
2012 end = p;
2013 }
2014 /* Do a line at a time (max 200 chars) to reduce protocol overhead */
2015 c = '\n';
2016 while (start != end) {
2017 char buf[201];
2018 p = buf;
2019 if (KDB_FLAG(CMD_INTERRUPT))
2020 return 0;
2021 while (start < end && (c = *KDB_WRAP(start)) &&
2022 (p - buf) < sizeof(buf)-1) {
2023 ++start;
2024 *p++ = c;
2025 if (c == '\n')
2026 break;
2027 }
2028 *p = '\0';
2029 kdb_printf("%s", buf);
2030 }
2031 if (c != '\n')
2032 kdb_printf("\n");
2033
2034 return 0;
2035}
2036#endif /* CONFIG_PRINTK */
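
The KDB_WRAP() arithmetic above folds a logical offset back into the physical syslog buffer so the loop can keep walking past the wrap point. A minimal user-space sketch of the same modulo trick, not part of this patch, using a made-up 16-byte ring in place of the real log buffer:

#include <stdio.h>

int main(void)
{
	/* Toy ring standing in for the syslog buffer: 16 physical bytes. */
	static const char ring[] = "ABCDEFGHIJKLMNOP";
	long logsize = sizeof(ring) - 1;	/* 16 data bytes, ignore the NUL */
	long logical = 20;			/* logical offset, 4 bytes past the end */
	long physical = logical % logsize;	/* folds back inside, like KDB_WRAP() */

	/* Prints: logical offset 20 wraps to physical offset 4 ('E') */
	printf("logical offset %ld wraps to physical offset %ld ('%c')\n",
	       logical, physical, ring[physical]);
	return 0;
}
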
2037/*
2038 * kdb_cpu - This function implements the 'cpu' command.
2039 * cpu [<cpunum>]
2040 * Returns:
2041 * KDB_CMD_CPU for success, a kdb diagnostic if error
2042 */
2043static void kdb_cpu_status(void)
2044{
2045 int i, start_cpu, first_print = 1;
2046 char state, prev_state = '?';
2047
2048 kdb_printf("Currently on cpu %d\n", raw_smp_processor_id());
2049 kdb_printf("Available cpus: ");
2050 for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
2051 if (!cpu_online(i)) {
2052 state = 'F'; /* cpu is offline */
2053 } else {
2054 state = ' '; /* cpu is responding to kdb */
2055 if (kdb_task_state_char(KDB_TSK(i)) == 'I')
2056 state = 'I'; /* idle task */
2057 }
2058 if (state != prev_state) {
2059 if (prev_state != '?') {
2060 if (!first_print)
2061 kdb_printf(", ");
2062 first_print = 0;
2063 kdb_printf("%d", start_cpu);
2064 if (start_cpu < i-1)
2065 kdb_printf("-%d", i-1);
2066 if (prev_state != ' ')
2067 kdb_printf("(%c)", prev_state);
2068 }
2069 prev_state = state;
2070 start_cpu = i;
2071 }
2072 }
2073 /* print the trailing cpus, ignoring them if they are all offline */
2074 if (prev_state != 'F') {
2075 if (!first_print)
2076 kdb_printf(", ");
2077 kdb_printf("%d", start_cpu);
2078 if (start_cpu < i-1)
2079 kdb_printf("-%d", i-1);
2080 if (prev_state != ' ')
2081 kdb_printf("(%c)", prev_state);
2082 }
2083 kdb_printf("\n");
2084}
2085
2086static int kdb_cpu(int argc, const char **argv)
2087{
2088 unsigned long cpunum;
2089 int diag;
2090
2091 if (argc == 0) {
2092 kdb_cpu_status();
2093 return 0;
2094 }
2095
2096 if (argc != 1)
2097 return KDB_ARGCOUNT;
2098
2099 diag = kdbgetularg(argv[1], &cpunum);
2100 if (diag)
2101 return diag;
2102
2103 /*
2104 * Validate cpunum
2105 */
 2106	if ((cpunum >= NR_CPUS) || !cpu_online(cpunum))
2107 return KDB_BADCPUNUM;
2108
2109 dbg_switch_cpu = cpunum;
2110
2111 /*
2112 * Switch to other cpu
2113 */
2114 return KDB_CMD_CPU;
2115}
2116
2117/* The user may not realize that ps/bta with no parameters does not print idle
2118 * or sleeping system daemon processes, so tell them how many were suppressed.
2119 */
2120void kdb_ps_suppressed(void)
2121{
2122 int idle = 0, daemon = 0;
2123 unsigned long mask_I = kdb_task_state_string("I"),
2124 mask_M = kdb_task_state_string("M");
2125 unsigned long cpu;
2126 const struct task_struct *p, *g;
2127 for_each_online_cpu(cpu) {
2128 p = kdb_curr_task(cpu);
2129 if (kdb_task_state(p, mask_I))
2130 ++idle;
2131 }
2132 kdb_do_each_thread(g, p) {
2133 if (kdb_task_state(p, mask_M))
2134 ++daemon;
2135 } kdb_while_each_thread(g, p);
2136 if (idle || daemon) {
2137 if (idle)
2138 kdb_printf("%d idle process%s (state I)%s\n",
2139 idle, idle == 1 ? "" : "es",
2140 daemon ? " and " : "");
2141 if (daemon)
2142 kdb_printf("%d sleeping system daemon (state M) "
2143 "process%s", daemon,
2144 daemon == 1 ? "" : "es");
2145 kdb_printf(" suppressed,\nuse 'ps A' to see all.\n");
2146 }
2147}
2148
2149/*
2150 * kdb_ps - This function implements the 'ps' command which shows a
2151 * list of the active processes.
2152 * ps [DRSTCZEUIMA] All processes, optionally filtered by state
2153 */
2154void kdb_ps1(const struct task_struct *p)
2155{
2156 int cpu;
2157 unsigned long tmp;
2158
2159 if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
2160 return;
2161
2162 cpu = kdb_process_cpu(p);
2163 kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n",
2164 (void *)p, p->pid, p->parent->pid,
2165 kdb_task_has_cpu(p), kdb_process_cpu(p),
2166 kdb_task_state_char(p),
2167 (void *)(&p->thread),
2168 p == kdb_curr_task(raw_smp_processor_id()) ? '*' : ' ',
2169 p->comm);
2170 if (kdb_task_has_cpu(p)) {
2171 if (!KDB_TSK(cpu)) {
2172 kdb_printf(" Error: no saved data for this cpu\n");
2173 } else {
2174 if (KDB_TSK(cpu) != p)
2175 kdb_printf(" Error: does not match running "
2176 "process table (0x%p)\n", KDB_TSK(cpu));
2177 }
2178 }
2179}
2180
2181static int kdb_ps(int argc, const char **argv)
2182{
2183 struct task_struct *g, *p;
2184 unsigned long mask, cpu;
2185
2186 if (argc == 0)
2187 kdb_ps_suppressed();
2188 kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n",
2189 (int)(2*sizeof(void *))+2, "Task Addr",
2190 (int)(2*sizeof(void *))+2, "Thread");
2191 mask = kdb_task_state_string(argc ? argv[1] : NULL);
2192 /* Run the active tasks first */
2193 for_each_online_cpu(cpu) {
2194 if (KDB_FLAG(CMD_INTERRUPT))
2195 return 0;
2196 p = kdb_curr_task(cpu);
2197 if (kdb_task_state(p, mask))
2198 kdb_ps1(p);
2199 }
2200 kdb_printf("\n");
2201 /* Now the real tasks */
2202 kdb_do_each_thread(g, p) {
2203 if (KDB_FLAG(CMD_INTERRUPT))
2204 return 0;
2205 if (kdb_task_state(p, mask))
2206 kdb_ps1(p);
2207 } kdb_while_each_thread(g, p);
2208
2209 return 0;
2210}
2211
2212/*
2213 * kdb_pid - This function implements the 'pid' command which switches
2214 * the currently active process.
2215 * pid [<pid> | R]
2216 */
2217static int kdb_pid(int argc, const char **argv)
2218{
2219 struct task_struct *p;
2220 unsigned long val;
2221 int diag;
2222
2223 if (argc > 1)
2224 return KDB_ARGCOUNT;
2225
2226 if (argc) {
2227 if (strcmp(argv[1], "R") == 0) {
2228 p = KDB_TSK(kdb_initial_cpu);
2229 } else {
2230 diag = kdbgetularg(argv[1], &val);
2231 if (diag)
2232 return KDB_BADINT;
2233
2234 p = find_task_by_pid_ns((pid_t)val, &init_pid_ns);
2235 if (!p) {
2236 kdb_printf("No task with pid=%d\n", (pid_t)val);
2237 return 0;
2238 }
2239 }
2240 kdb_set_current_task(p);
2241 }
2242 kdb_printf("KDB current process is %s(pid=%d)\n",
2243 kdb_current_task->comm,
2244 kdb_current_task->pid);
2245
2246 return 0;
2247}
2248
2249/*
2250 * kdb_ll - This function implements the 'll' command which follows a
2251 * linked list and executes an arbitrary command for each
2252 * element.
2253 */
2254static int kdb_ll(int argc, const char **argv)
2255{
2256 int diag;
2257 unsigned long addr;
2258 long offset = 0;
2259 unsigned long va;
2260 unsigned long linkoffset;
2261 int nextarg;
2262 const char *command;
2263
2264 if (argc != 3)
2265 return KDB_ARGCOUNT;
2266
2267 nextarg = 1;
2268 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
2269 if (diag)
2270 return diag;
2271
2272 diag = kdbgetularg(argv[2], &linkoffset);
2273 if (diag)
2274 return diag;
2275
2276 /*
 2277	 * Use the starting address as the first element in the
 2278	 * list, and assume that the list ends with a null
 2279	 * pointer.
2280 */
2281
2282 va = addr;
2283 command = kdb_strdup(argv[3], GFP_KDB);
2284 if (!command) {
2285 kdb_printf("%s: cannot duplicate command\n", __func__);
2286 return 0;
2287 }
2288 /* Recursive use of kdb_parse, do not use argv after this point */
2289 argv = NULL;
2290
2291 while (va) {
2292 char buf[80];
2293
2294 if (KDB_FLAG(CMD_INTERRUPT))
2295 return 0;
2296
2297 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2298 diag = kdb_parse(buf);
2299 if (diag)
2300 return diag;
2301
2302 addr = va + linkoffset;
2303 if (kdb_getword(&va, addr, sizeof(va)))
2304 return 0;
2305 }
2306 kfree(command);
2307
2308 return 0;
2309}
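
To make the 'll' arguments concrete: <linkoffset> is simply the byte offset of the next pointer inside each node, and <cmd> is run with every node address appended. A hedged in-kernel sketch, with a made-up node type, of how that offset would be computed; only the kdb invocation in the comment touches code from this patch:

#include <linux/kernel.h>
#include <linux/stddef.h>

/* Hypothetical node type whose instances are chained through 'next'. */
struct demo_node {
	unsigned long payload;
	struct demo_node *next;
};

/* With the head node's address in hand, one could then type inside kdb:
 *
 *	ll <address-of-first-node> <link-offset> md
 *
 * which runs 'md' on every node until a NULL next pointer ends the walk. */
void demo_print_link_offset(void)
{
	printk(KERN_INFO "ll link offset = %zu\n",
	       offsetof(struct demo_node, next));
}
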
2310
2311static int kdb_kgdb(int argc, const char **argv)
2312{
2313 return KDB_CMD_KGDB;
2314}
2315
2316/*
2317 * kdb_help - This function implements the 'help' and '?' commands.
2318 */
2319static int kdb_help(int argc, const char **argv)
2320{
2321 kdbtab_t *kt;
2322 int i;
2323
2324 kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description");
2325 kdb_printf("-----------------------------"
2326 "-----------------------------\n");
2327 for_each_kdbcmd(kt, i) {
2328 if (kt->cmd_name)
2329 kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
2330 kt->cmd_usage, kt->cmd_help);
2331 if (KDB_FLAG(CMD_INTERRUPT))
2332 return 0;
2333 }
2334 return 0;
2335}
2336
2337/*
2338 * kdb_kill - This function implements the 'kill' commands.
2339 */
2340static int kdb_kill(int argc, const char **argv)
2341{
2342 long sig, pid;
2343 char *endp;
2344 struct task_struct *p;
2345 struct siginfo info;
2346
2347 if (argc != 2)
2348 return KDB_ARGCOUNT;
2349
2350 sig = simple_strtol(argv[1], &endp, 0);
2351 if (*endp)
2352 return KDB_BADINT;
2353 if (sig >= 0) {
 2354		kdb_printf("Invalid signal parameter: <-signal>\n");
2355 return 0;
2356 }
2357 sig = -sig;
2358
2359 pid = simple_strtol(argv[2], &endp, 0);
2360 if (*endp)
2361 return KDB_BADINT;
2362 if (pid <= 0) {
 2363		kdb_printf("Process ID must be larger than 0.\n");
2364 return 0;
2365 }
2366
2367 /* Find the process. */
2368 p = find_task_by_pid_ns(pid, &init_pid_ns);
2369 if (!p) {
 2370		kdb_printf("The specified process was not found.\n");
2371 return 0;
2372 }
2373 p = p->group_leader;
2374 info.si_signo = sig;
2375 info.si_errno = 0;
2376 info.si_code = SI_USER;
2377 info.si_pid = pid; /* same capabilities as process being signalled */
2378 info.si_uid = 0; /* kdb has root authority */
2379 kdb_send_sig_info(p, &info);
2380 return 0;
2381}
2382
2383struct kdb_tm {
2384 int tm_sec; /* seconds */
2385 int tm_min; /* minutes */
2386 int tm_hour; /* hours */
2387 int tm_mday; /* day of the month */
2388 int tm_mon; /* month */
2389 int tm_year; /* year */
2390};
2391
2392static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
2393{
2394 /* This will work from 1970-2099, 2100 is not a leap year */
2395 static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31,
2396 31, 30, 31, 30, 31 };
2397 memset(tm, 0, sizeof(*tm));
2398 tm->tm_sec = tv->tv_sec % (24 * 60 * 60);
2399 tm->tm_mday = tv->tv_sec / (24 * 60 * 60) +
2400 (2 * 365 + 1); /* shift base from 1970 to 1968 */
2401 tm->tm_min = tm->tm_sec / 60 % 60;
2402 tm->tm_hour = tm->tm_sec / 60 / 60;
2403 tm->tm_sec = tm->tm_sec % 60;
2404 tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1));
2405 tm->tm_mday %= (4*365+1);
2406 mon_day[1] = 29;
2407 while (tm->tm_mday >= mon_day[tm->tm_mon]) {
2408 tm->tm_mday -= mon_day[tm->tm_mon];
2409 if (++tm->tm_mon == 12) {
2410 tm->tm_mon = 0;
2411 ++tm->tm_year;
2412 mon_day[1] = 28;
2413 }
2414 }
2415 ++tm->tm_mday;
2416}
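
As a quick check of the arithmetic above, take tv_sec = 0 (the 1970 epoch): the shift to a 1968 base gives tm_mday = 731, 731 / 1461 selects the zeroth four-year cycle so tm_year starts at 68, and the month loop then peels off all of 1968 (366 days, since mon_day[1] starts at 29) and all of 1969 (365 days, after the December rollover drops it to 28), leaving day 0 of year 70. The final ++tm_mday yields 1970-01-01 00:00:00, as expected.
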
2417
2418/*
2419 * Most of this code has been lifted from kernel/timer.c::sys_sysinfo().
2420 * I cannot call that code directly from kdb, it has an unconditional
2421 * cli()/sti() and calls routines that take locks which can stop the debugger.
2422 */
2423static void kdb_sysinfo(struct sysinfo *val)
2424{
2425 struct timespec uptime;
2426 do_posix_clock_monotonic_gettime(&uptime);
2427 memset(val, 0, sizeof(*val));
2428 val->uptime = uptime.tv_sec;
2429 val->loads[0] = avenrun[0];
2430 val->loads[1] = avenrun[1];
2431 val->loads[2] = avenrun[2];
2432 val->procs = nr_threads-1;
2433 si_meminfo(val);
2434
2435 return;
2436}
2437
2438/*
2439 * kdb_summary - This function implements the 'summary' command.
2440 */
2441static int kdb_summary(int argc, const char **argv)
2442{
2443 struct kdb_tm tm;
2444 struct sysinfo val;
2445
2446 if (argc)
2447 return KDB_ARGCOUNT;
2448
2449 kdb_printf("sysname %s\n", init_uts_ns.name.sysname);
2450 kdb_printf("release %s\n", init_uts_ns.name.release);
2451 kdb_printf("version %s\n", init_uts_ns.name.version);
2452 kdb_printf("machine %s\n", init_uts_ns.name.machine);
2453 kdb_printf("nodename %s\n", init_uts_ns.name.nodename);
2454 kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
2455 kdb_printf("ccversion %s\n", __stringify(CCVERSION));
2456
2457 kdb_gmtime(&xtime, &tm);
2458 kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
2459 "tz_minuteswest %d\n",
2460 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
2461 tm.tm_hour, tm.tm_min, tm.tm_sec,
2462 sys_tz.tz_minuteswest);
2463
2464 kdb_sysinfo(&val);
2465 kdb_printf("uptime ");
2466 if (val.uptime > (24*60*60)) {
2467 int days = val.uptime / (24*60*60);
2468 val.uptime %= (24*60*60);
2469 kdb_printf("%d day%s ", days, days == 1 ? "" : "s");
2470 }
2471 kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
2472
2473 /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */
2474
2475#define LOAD_INT(x) ((x) >> FSHIFT)
2476#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
2477 kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n",
2478 LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),
2479 LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),
2480 LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));
2481#undef LOAD_INT
2482#undef LOAD_FRAC
2483 /* Display in kilobytes */
2484#define K(x) ((x) << (PAGE_SHIFT - 10))
2485 kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
2486 "Buffers: %8lu kB\n",
2487 val.totalram, val.freeram, val.bufferram);
2488 return 0;
2489}
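
The load averages printed above are fixed-point values from avenrun[]; assuming the usual FSHIFT of 11 (so FIXED_1 == 2048), LOAD_INT()/LOAD_FRAC() split one of them into an integer part and a two-digit fraction. A small user-space check of that conversion, not part of this patch:

#include <stdio.h>

#define FSHIFT		11			/* assumed fixed-point shift */
#define FIXED_1		(1 << FSHIFT)
#define LOAD_INT(x)	((x) >> FSHIFT)
#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
	unsigned long load = 3 * FIXED_1 / 2;	/* a load of exactly 1.50 */

	/* Prints "1.50": 3072 >> 11 == 1, (1024 * 100) >> 11 == 50 */
	printf("%lu.%02lu\n", LOAD_INT(load), LOAD_FRAC(load));
	return 0;
}
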
2490
2491/*
2492 * kdb_per_cpu - This function implements the 'per_cpu' command.
2493 */
2494static int kdb_per_cpu(int argc, const char **argv)
2495{
2496 char buf[256], fmtstr[64];
2497 kdb_symtab_t symtab;
2498 cpumask_t suppress = CPU_MASK_NONE;
2499 int cpu, diag;
2500 unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL;
2501
2502 if (argc < 1 || argc > 3)
2503 return KDB_ARGCOUNT;
2504
2505 snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]);
2506 if (!kdbgetsymval(buf, &symtab)) {
2507 kdb_printf("%s is not a per_cpu variable\n", argv[1]);
2508 return KDB_BADADDR;
2509 }
2510 if (argc >= 2) {
2511 diag = kdbgetularg(argv[2], &bytesperword);
2512 if (diag)
2513 return diag;
2514 }
2515 if (!bytesperword)
2516 bytesperword = KDB_WORD_SIZE;
2517 else if (bytesperword > KDB_WORD_SIZE)
2518 return KDB_BADWIDTH;
2519 sprintf(fmtstr, "%%0%dlx ", (int)(2*bytesperword));
2520 if (argc >= 3) {
2521 diag = kdbgetularg(argv[3], &whichcpu);
2522 if (diag)
2523 return diag;
2524 if (!cpu_online(whichcpu)) {
2525 kdb_printf("cpu %ld is not online\n", whichcpu);
2526 return KDB_BADCPUNUM;
2527 }
2528 }
2529
 2530	/* Most architectures use __per_cpu_offset[cpu], some use
 2531	 * __per_cpu_offset(cpu); !SMP builds have no __per_cpu_offset at all.
 2532	 */
2533#ifdef __per_cpu_offset
2534#define KDB_PCU(cpu) __per_cpu_offset(cpu)
2535#else
2536#ifdef CONFIG_SMP
2537#define KDB_PCU(cpu) __per_cpu_offset[cpu]
2538#else
2539#define KDB_PCU(cpu) 0
2540#endif
2541#endif
2542
2543 for_each_online_cpu(cpu) {
2544 if (whichcpu != ~0UL && whichcpu != cpu)
2545 continue;
2546 addr = symtab.sym_start + KDB_PCU(cpu);
2547 diag = kdb_getword(&val, addr, bytesperword);
2548 if (diag) {
2549 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
2550 "read, diag=%d\n", cpu, addr, diag);
2551 continue;
2552 }
2553#ifdef CONFIG_SMP
2554 if (!val) {
2555 cpu_set(cpu, suppress);
2556 continue;
2557 }
2558#endif /* CONFIG_SMP */
2559 kdb_printf("%5d ", cpu);
2560 kdb_md_line(fmtstr, addr,
2561 bytesperword == KDB_WORD_SIZE,
2562 1, bytesperword, 1, 1, 0);
2563 }
2564 if (cpus_weight(suppress) == 0)
2565 return 0;
2566 kdb_printf("Zero suppressed cpu(s):");
2567 for (cpu = first_cpu(suppress); cpu < num_possible_cpus();
2568 cpu = next_cpu(cpu, suppress)) {
2569 kdb_printf(" %d", cpu);
2570 if (cpu == num_possible_cpus() - 1 ||
2571 next_cpu(cpu, suppress) != cpu + 1)
2572 continue;
2573 while (cpu < num_possible_cpus() &&
2574 next_cpu(cpu, suppress) == cpu + 1)
2575 ++cpu;
2576 kdb_printf("-%d", cpu);
2577 }
2578 kdb_printf("\n");
2579
2580#undef KDB_PCU
2581
2582 return 0;
2583}
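
In other words, the command reads symtab.sym_start + KDB_PCU(cpu) for each selected cpu. As a usage illustration (the symbol name is only an assumption about this tree), 'per_cpu runqueues 8 1' would display the first eight bytes of per_cpu__runqueues on cpu 1 alone, provided the build is 64-bit so that a width of 8 does not exceed KDB_WORD_SIZE.
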
2584
2585/*
2586 * display help for the use of cmd | grep pattern
2587 */
2588static int kdb_grep_help(int argc, const char **argv)
2589{
2590 kdb_printf("Usage of cmd args | grep pattern:\n");
2591 kdb_printf(" Any command's output may be filtered through an ");
2592 kdb_printf("emulated 'pipe'.\n");
2593 kdb_printf(" 'grep' is just a key word.\n");
2594 kdb_printf(" The pattern may include a very limited set of "
2595 "metacharacters:\n");
2596 kdb_printf(" pattern or ^pattern or pattern$ or ^pattern$\n");
2597 kdb_printf(" And if there are spaces in the pattern, you may "
2598 "quote it:\n");
2599 kdb_printf(" \"pat tern\" or \"^pat tern\" or \"pat tern$\""
2600 " or \"^pat tern$\"\n");
2601 return 0;
2602}
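
As a usage note grounded in the help text above: 'ps | grep init' filters the ps listing down to matching lines, and 'dmesg 100 | grep "^Free"' keeps only lines that begin with that word; anything fancier than the four anchored forms listed above is not recognized by the emulated pipe.
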
2603
2604/*
2605 * kdb_register_repeat - This function is used to register a kernel
2606 * debugger command.
2607 * Inputs:
2608 * cmd Command name
2609 * func Function to execute the command
2610 * usage A simple usage string showing arguments
2611 * help A simple help string describing command
2612 * repeat Does the command auto repeat on enter?
2613 * Returns:
2614 * zero for success, one if a duplicate command.
2615 */
2616#define kdb_command_extend 50 /* arbitrary */
2617int kdb_register_repeat(char *cmd,
2618 kdb_func_t func,
2619 char *usage,
2620 char *help,
2621 short minlen,
2622 kdb_repeat_t repeat)
2623{
2624 int i;
2625 kdbtab_t *kp;
2626
2627 /*
2628 * Brute force method to determine duplicates
2629 */
2630 for_each_kdbcmd(kp, i) {
2631 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2632 kdb_printf("Duplicate kdb command registered: "
2633 "%s, func %p help %s\n", cmd, func, help);
2634 return 1;
2635 }
2636 }
2637
2638 /*
2639 * Insert command into first available location in table
2640 */
2641 for_each_kdbcmd(kp, i) {
2642 if (kp->cmd_name == NULL)
2643 break;
2644 }
2645
2646 if (i >= kdb_max_commands) {
2647 kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX +
2648 kdb_command_extend) * sizeof(*new), GFP_KDB);
2649 if (!new) {
2650 kdb_printf("Could not allocate new kdb_command "
2651 "table\n");
2652 return 1;
2653 }
2654 if (kdb_commands) {
2655 memcpy(new, kdb_commands,
2656 kdb_max_commands * sizeof(*new));
2657 kfree(kdb_commands);
2658 }
2659 memset(new + kdb_max_commands, 0,
2660 kdb_command_extend * sizeof(*new));
2661 kdb_commands = new;
2662 kp = kdb_commands + kdb_max_commands;
2663 kdb_max_commands += kdb_command_extend;
2664 }
2665
2666 kp->cmd_name = cmd;
2667 kp->cmd_func = func;
2668 kp->cmd_usage = usage;
2669 kp->cmd_help = help;
2670 kp->cmd_flags = 0;
2671 kp->cmd_minlen = minlen;
2672 kp->cmd_repeat = repeat;
2673
2674 return 0;
2675}
2676
2677/*
2678 * kdb_register - Compatibility register function for commands that do
2679 * not need to specify a repeat state. Equivalent to
2680 * kdb_register_repeat with KDB_REPEAT_NONE.
2681 * Inputs:
2682 * cmd Command name
2683 * func Function to execute the command
2684 * usage A simple usage string showing arguments
2685 * help A simple help string describing command
2686 * Returns:
2687 * zero for success, one if a duplicate command.
2688 */
2689int kdb_register(char *cmd,
2690 kdb_func_t func,
2691 char *usage,
2692 char *help,
2693 short minlen)
2694{
2695 return kdb_register_repeat(cmd, func, usage, help, minlen,
2696 KDB_REPEAT_NONE);
2697}
2698
2699/*
2700 * kdb_unregister - This function is used to unregister a kernel
2701 * debugger command. It is generally called when a module which
2702 * implements kdb commands is unloaded.
2703 * Inputs:
2704 * cmd Command name
2705 * Returns:
2706 * zero for success, one command not registered.
2707 */
2708int kdb_unregister(char *cmd)
2709{
2710 int i;
2711 kdbtab_t *kp;
2712
2713 /*
2714 * find the command.
2715 */
2716 for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) {
2717 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2718 kp->cmd_name = NULL;
2719 return 0;
2720 }
2721 }
2722
2723 /* Couldn't find it. */
2724 return 1;
2725}
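
The comment block in kdb_private.h later in this patch labels kdb_register()/kdb_unregister() as the interface for kernel loadable modules. A hedged sketch of a module-defined command follows; the command name and handler are invented, the prototypes are mirrored from kdb_private.h rather than pulled from a public header (which header, if any, exports them to out-of-tree modules may vary by tree), and whether the symbols are actually EXPORT_SYMBOL'ed is not shown in this hunk:

#include <linux/module.h>
#include <linux/init.h>

/* Mirrored from the externs in kdb_private.h; an in-tree user would
 * include the appropriate kdb header instead of redeclaring them. */
typedef int (*kdb_func_t)(int, const char **);
extern int kdb_printf(const char *, ...);
extern int kdb_register(char *, kdb_func_t, char *, char *, short);
extern int kdb_unregister(char *);

/* Hypothetical handler for a 'hello' command that ignores its arguments. */
static int kdb_hello(int argc, const char **argv)
{
	kdb_printf("hello from a module-defined kdb command\n");
	return 0;
}

static int __init kdb_hello_init(void)
{
	/* kdb_register() returns 0 on success, 1 on a duplicate name. */
	return kdb_register("hello", kdb_hello, "", "Print a greeting", 0);
}

static void __exit kdb_hello_exit(void)
{
	kdb_unregister("hello");
}

module_init(kdb_hello_init);
module_exit(kdb_hello_exit);
MODULE_LICENSE("GPL");
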
2726
2727/* Initialize the kdb command table. */
2728static void __init kdb_inittab(void)
2729{
2730 int i;
2731 kdbtab_t *kp;
2732
2733 for_each_kdbcmd(kp, i)
2734 kp->cmd_name = NULL;
2735
2736 kdb_register_repeat("md", kdb_md, "<vaddr>",
2737 "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
2738 KDB_REPEAT_NO_ARGS);
2739 kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>",
2740 "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS);
2741 kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>",
2742 "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS);
2743 kdb_register_repeat("mds", kdb_md, "<vaddr>",
2744 "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS);
2745 kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>",
2746 "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS);
2747 kdb_register_repeat("go", kdb_go, "[<vaddr>]",
2748 "Continue Execution", 1, KDB_REPEAT_NONE);
2749 kdb_register_repeat("rd", kdb_rd, "",
2750 "Display Registers", 0, KDB_REPEAT_NONE);
2751 kdb_register_repeat("rm", kdb_rm, "<reg> <contents>",
2752 "Modify Registers", 0, KDB_REPEAT_NONE);
2753 kdb_register_repeat("ef", kdb_ef, "<vaddr>",
2754 "Display exception frame", 0, KDB_REPEAT_NONE);
2755 kdb_register_repeat("bt", kdb_bt, "[<vaddr>]",
2756 "Stack traceback", 1, KDB_REPEAT_NONE);
2757 kdb_register_repeat("btp", kdb_bt, "<pid>",
2758 "Display stack for process <pid>", 0, KDB_REPEAT_NONE);
2759 kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]",
2760 "Display stack all processes", 0, KDB_REPEAT_NONE);
2761 kdb_register_repeat("btc", kdb_bt, "",
2762 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
2763 kdb_register_repeat("btt", kdb_bt, "<vaddr>",
2764 "Backtrace process given its struct task address", 0,
2765 KDB_REPEAT_NONE);
2766 kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>",
2767 "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE);
2768 kdb_register_repeat("env", kdb_env, "",
2769 "Show environment variables", 0, KDB_REPEAT_NONE);
2770 kdb_register_repeat("set", kdb_set, "",
2771 "Set environment variables", 0, KDB_REPEAT_NONE);
2772 kdb_register_repeat("help", kdb_help, "",
2773 "Display Help Message", 1, KDB_REPEAT_NONE);
2774 kdb_register_repeat("?", kdb_help, "",
2775 "Display Help Message", 0, KDB_REPEAT_NONE);
2776 kdb_register_repeat("cpu", kdb_cpu, "<cpunum>",
2777 "Switch to new cpu", 0, KDB_REPEAT_NONE);
2778 kdb_register_repeat("kgdb", kdb_kgdb, "",
2779 "Enter kgdb mode", 0, KDB_REPEAT_NONE);
2780 kdb_register_repeat("ps", kdb_ps, "[<flags>|A]",
2781 "Display active task list", 0, KDB_REPEAT_NONE);
2782 kdb_register_repeat("pid", kdb_pid, "<pidnum>",
2783 "Switch to another task", 0, KDB_REPEAT_NONE);
2784 kdb_register_repeat("reboot", kdb_reboot, "",
2785 "Reboot the machine immediately", 0, KDB_REPEAT_NONE);
2786#if defined(CONFIG_MODULES)
2787 kdb_register_repeat("lsmod", kdb_lsmod, "",
2788 "List loaded kernel modules", 0, KDB_REPEAT_NONE);
2789#endif
2790#if defined(CONFIG_MAGIC_SYSRQ)
2791 kdb_register_repeat("sr", kdb_sr, "<key>",
2792 "Magic SysRq key", 0, KDB_REPEAT_NONE);
2793#endif
2794#if defined(CONFIG_PRINTK)
 2795	kdb_register_repeat("dmesg", kdb_dmesg, "[lines] [adjust]",
2796 "Display syslog buffer", 0, KDB_REPEAT_NONE);
2797#endif
2798 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
2799 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
2800 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
2801 "Send a signal to a process", 0, KDB_REPEAT_NONE);
2802 kdb_register_repeat("summary", kdb_summary, "",
2803 "Summarize the system", 4, KDB_REPEAT_NONE);
 2804	kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
2805 "Display per_cpu variables", 3, KDB_REPEAT_NONE);
2806 kdb_register_repeat("grephelp", kdb_grep_help, "",
2807 "Display help on | grep", 0, KDB_REPEAT_NONE);
2808}
2809
2810/* Execute any commands defined in kdb_cmds. */
2811static void __init kdb_cmd_init(void)
2812{
2813 int i, diag;
2814 for (i = 0; kdb_cmds[i]; ++i) {
2815 diag = kdb_parse(kdb_cmds[i]);
2816 if (diag)
2817 kdb_printf("kdb command %s failed, kdb diag %d\n",
2818 kdb_cmds[i], diag);
2819 }
2820 if (defcmd_in_progress) {
2821 kdb_printf("Incomplete 'defcmd' set, forcing endefcmd\n");
2822 kdb_parse("endefcmd");
2823 }
2824}
2825
 2826/* Initialize kdb_printf, breakpoint tables and kdb state */
2827void __init kdb_init(int lvl)
2828{
2829 static int kdb_init_lvl = KDB_NOT_INITIALIZED;
2830 int i;
2831
2832 if (kdb_init_lvl == KDB_INIT_FULL || lvl <= kdb_init_lvl)
2833 return;
2834 for (i = kdb_init_lvl; i < lvl; i++) {
2835 switch (i) {
2836 case KDB_NOT_INITIALIZED:
2837 kdb_inittab(); /* Initialize Command Table */
2838 kdb_initbptab(); /* Initialize Breakpoints */
2839 break;
2840 case KDB_INIT_EARLY:
2841 kdb_cmd_init(); /* Build kdb_cmds tables */
2842 break;
2843 }
2844 }
2845 kdb_init_lvl = lvl;
2846}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
new file mode 100644
index 000000000000..97d3ba69775d
--- /dev/null
+++ b/kernel/debug/kdb/kdb_private.h
@@ -0,0 +1,300 @@
1#ifndef _KDBPRIVATE_H
2#define _KDBPRIVATE_H
3
4/*
5 * Kernel Debugger Architecture Independent Private Headers
6 *
7 * This file is subject to the terms and conditions of the GNU General Public
8 * License. See the file "COPYING" in the main directory of this archive
9 * for more details.
10 *
11 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
12 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
13 */
14
15#include <linux/kgdb.h>
16#include "../debug_core.h"
17
18/* Kernel Debugger Error codes. Must not overlap with command codes. */
19#define KDB_NOTFOUND (-1)
20#define KDB_ARGCOUNT (-2)
21#define KDB_BADWIDTH (-3)
22#define KDB_BADRADIX (-4)
23#define KDB_NOTENV (-5)
24#define KDB_NOENVVALUE (-6)
25#define KDB_NOTIMP (-7)
26#define KDB_ENVFULL (-8)
27#define KDB_ENVBUFFULL (-9)
28#define KDB_TOOMANYBPT (-10)
29#define KDB_TOOMANYDBREGS (-11)
30#define KDB_DUPBPT (-12)
31#define KDB_BPTNOTFOUND (-13)
32#define KDB_BADMODE (-14)
33#define KDB_BADINT (-15)
34#define KDB_INVADDRFMT (-16)
35#define KDB_BADREG (-17)
36#define KDB_BADCPUNUM (-18)
37#define KDB_BADLENGTH (-19)
38#define KDB_NOBP (-20)
39#define KDB_BADADDR (-21)
40
41/* Kernel Debugger Command codes. Must not overlap with error codes. */
42#define KDB_CMD_GO (-1001)
43#define KDB_CMD_CPU (-1002)
44#define KDB_CMD_SS (-1003)
45#define KDB_CMD_SSB (-1004)
46#define KDB_CMD_KGDB (-1005)
47#define KDB_CMD_KGDB2 (-1006)
48
49/* Internal debug flags */
50#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
51#define KDB_DEBUG_FLAG_BB_SUMM 0x0004 /* Basic block analysis, summary only */
52#define KDB_DEBUG_FLAG_AR 0x0008 /* Activation record, generic */
53#define KDB_DEBUG_FLAG_ARA 0x0010 /* Activation record, arch specific */
54#define KDB_DEBUG_FLAG_BB 0x0020 /* All basic block analysis */
55#define KDB_DEBUG_FLAG_STATE 0x0040 /* State flags */
56#define KDB_DEBUG_FLAG_MASK 0xffff /* All debug flags */
57#define KDB_DEBUG_FLAG_SHIFT 16 /* Shift factor for dbflags */
58
59#define KDB_DEBUG(flag) (kdb_flags & \
60 (KDB_DEBUG_FLAG_##flag << KDB_DEBUG_FLAG_SHIFT))
61#define KDB_DEBUG_STATE(text, value) if (KDB_DEBUG(STATE)) \
62 kdb_print_state(text, value)
63
64#if BITS_PER_LONG == 32
65
66#define KDB_PLATFORM_ENV "BYTESPERWORD=4"
67
68#define kdb_machreg_fmt "0x%lx"
69#define kdb_machreg_fmt0 "0x%08lx"
70#define kdb_bfd_vma_fmt "0x%lx"
71#define kdb_bfd_vma_fmt0 "0x%08lx"
72#define kdb_elfw_addr_fmt "0x%x"
73#define kdb_elfw_addr_fmt0 "0x%08x"
74#define kdb_f_count_fmt "%d"
75
76#elif BITS_PER_LONG == 64
77
78#define KDB_PLATFORM_ENV "BYTESPERWORD=8"
79
80#define kdb_machreg_fmt "0x%lx"
81#define kdb_machreg_fmt0 "0x%016lx"
82#define kdb_bfd_vma_fmt "0x%lx"
83#define kdb_bfd_vma_fmt0 "0x%016lx"
84#define kdb_elfw_addr_fmt "0x%x"
85#define kdb_elfw_addr_fmt0 "0x%016x"
86#define kdb_f_count_fmt "%ld"
87
88#endif
89
90/*
91 * KDB_MAXBPT describes the total number of breakpoints
 92 * supported by this architecture.
93 */
94#define KDB_MAXBPT 16
95
96/* Maximum number of arguments to a function */
97#define KDB_MAXARGS 16
98
99typedef enum {
100 KDB_REPEAT_NONE = 0, /* Do not repeat this command */
101 KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */
102 KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */
103} kdb_repeat_t;
104
105typedef int (*kdb_func_t)(int, const char **);
106
107/* Symbol table format returned by kallsyms. */
108typedef struct __ksymtab {
109 unsigned long value; /* Address of symbol */
110 const char *mod_name; /* Module containing symbol or
111 * "kernel" */
112 unsigned long mod_start;
113 unsigned long mod_end;
114 const char *sec_name; /* Section containing symbol */
115 unsigned long sec_start;
116 unsigned long sec_end;
117 const char *sym_name; /* Full symbol name, including
118 * any version */
119 unsigned long sym_start;
120 unsigned long sym_end;
121 } kdb_symtab_t;
122extern int kallsyms_symbol_next(char *prefix_name, int flag);
123extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
124
125/* Exported Symbols for kernel loadable modules to use. */
126extern int kdb_register(char *, kdb_func_t, char *, char *, short);
127extern int kdb_register_repeat(char *, kdb_func_t, char *, char *,
128 short, kdb_repeat_t);
129extern int kdb_unregister(char *);
130
131extern int kdb_getarea_size(void *, unsigned long, size_t);
132extern int kdb_putarea_size(unsigned long, void *, size_t);
133
134/*
135 * Like get_user and put_user, kdb_getarea and kdb_putarea take variable
136 * names, not pointers. The underlying *_size functions take pointers.
137 */
138#define kdb_getarea(x, addr) kdb_getarea_size(&(x), addr, sizeof((x)))
139#define kdb_putarea(addr, x) kdb_putarea_size(addr, &(x), sizeof((x)))
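
A short, hypothetical illustration of the distinction drawn just above: kdb_getarea() takes the destination variable itself, while kdb_getarea_size() takes a pointer plus an explicit length. The helper below is not part of this patch and assumes these declarations are in scope:

/* Illustrative only: read one word and a small raw block from a
 * previously validated kernel address 'addr'. */
static int demo_read(unsigned long addr)
{
	unsigned long word;
	char raw[16];
	int diag;

	diag = kdb_getarea(word, addr);		/* pass the variable, not &word */
	if (diag)
		return diag;
	diag = kdb_getarea_size(raw, addr, sizeof(raw));	/* pointer + size */
	if (diag)
		return diag;
	kdb_printf("first word at 0x%lx is 0x%lx\n", addr, word);
	return 0;
}
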
140
141extern int kdb_getphysword(unsigned long *word,
142 unsigned long addr, size_t size);
143extern int kdb_getword(unsigned long *, unsigned long, size_t);
144extern int kdb_putword(unsigned long, unsigned long, size_t);
145
146extern int kdbgetularg(const char *, unsigned long *);
147extern int kdb_set(int, const char **);
148extern char *kdbgetenv(const char *);
149extern int kdbgetintenv(const char *, int *);
150extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
151 long *, char **);
152extern int kdbgetsymval(const char *, kdb_symtab_t *);
153extern int kdbnearsym(unsigned long, kdb_symtab_t *);
154extern void kdbnearsym_cleanup(void);
155extern char *kdb_strdup(const char *str, gfp_t type);
156extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
157
158/* Routine for debugging the debugger state. */
159extern void kdb_print_state(const char *, int);
160
161extern int kdb_state;
162#define KDB_STATE_KDB 0x00000001 /* Cpu is inside kdb */
163#define KDB_STATE_LEAVING 0x00000002 /* Cpu is leaving kdb */
164#define KDB_STATE_CMD 0x00000004 /* Running a kdb command */
165#define KDB_STATE_KDB_CONTROL 0x00000008 /* This cpu is under
166 * kdb control */
167#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
168#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
169#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command,
170 * DOING_SS is also set */
171#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
172 * after one ss, independent of
173 * DOING_SS */
174#define KDB_STATE_REENTRY 0x00000100 /* Valid re-entry into kdb */
175#define KDB_STATE_SUPPRESS 0x00000200 /* Suppress error messages */
176#define KDB_STATE_PAGER 0x00000400 /* pager is available */
177#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
178 * back to initial cpu */
179#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
180#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
181#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
182#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been
183 * adjusted */
184#define KDB_STATE_GO1 0x00010000 /* go only releases one cpu */
185#define KDB_STATE_KEYBOARD 0x00020000 /* kdb entered via
186 * keyboard on this cpu */
187#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
188#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
189#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */
190#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
191#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
192 * specific use */
193
194#define KDB_STATE(flag) (kdb_state & KDB_STATE_##flag)
195#define KDB_STATE_SET(flag) ((void)(kdb_state |= KDB_STATE_##flag))
196#define KDB_STATE_CLEAR(flag) ((void)(kdb_state &= ~KDB_STATE_##flag))
197
198extern int kdb_nextline; /* Current number of lines displayed */
199
200typedef struct _kdb_bp {
201 unsigned long bp_addr; /* Address breakpoint is present at */
202 unsigned int bp_free:1; /* This entry is available */
203 unsigned int bp_enabled:1; /* Breakpoint is active in register */
204 unsigned int bp_type:4; /* Uses hardware register */
205 unsigned int bp_installed:1; /* Breakpoint is installed */
206 unsigned int bp_delay:1; /* Do delayed bp handling */
207 unsigned int bp_delayed:1; /* Delayed breakpoint */
208 unsigned int bph_length; /* HW break length */
209} kdb_bp_t;
210
211#ifdef CONFIG_KGDB_KDB
212extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */];
213
214/* The KDB shell command table */
215typedef struct _kdbtab {
216 char *cmd_name; /* Command name */
217 kdb_func_t cmd_func; /* Function to execute command */
218 char *cmd_usage; /* Usage String for this command */
219 char *cmd_help; /* Help message for this command */
220 short cmd_flags; /* Parsing flags */
221 short cmd_minlen; /* Minimum legal # command
222 * chars required */
223 kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */
224} kdbtab_t;
225
226extern int kdb_bt(int, const char **); /* KDB display back trace */
227
228/* KDB breakpoint management functions */
229extern void kdb_initbptab(void);
230extern void kdb_bp_install(struct pt_regs *);
231extern void kdb_bp_remove(void);
232
233typedef enum {
234 KDB_DB_BPT, /* Breakpoint */
235 KDB_DB_SS, /* Single-step trap */
236 KDB_DB_SSB, /* Single step to branch */
237 KDB_DB_SSBPT, /* Single step over breakpoint */
238 KDB_DB_NOBPT /* Spurious breakpoint */
239} kdb_dbtrap_t;
240
241extern int kdb_main_loop(kdb_reason_t, kdb_reason_t,
242 int, kdb_dbtrap_t, struct pt_regs *);
243
244/* Miscellaneous functions and data areas */
245extern int kdb_grepping_flag;
246extern char kdb_grep_string[];
247extern int kdb_grep_leading;
248extern int kdb_grep_trailing;
249extern char *kdb_cmds[];
250extern void kdb_syslog_data(char *syslog_data[]);
251extern unsigned long kdb_task_state_string(const char *);
252extern char kdb_task_state_char (const struct task_struct *);
253extern unsigned long kdb_task_state(const struct task_struct *p,
254 unsigned long mask);
255extern void kdb_ps_suppressed(void);
256extern void kdb_ps1(const struct task_struct *p);
257extern void kdb_print_nameval(const char *name, unsigned long val);
258extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
259extern void kdb_meminfo_proc_show(void);
260extern const char *kdb_walk_kallsyms(loff_t *pos);
261extern char *kdb_getstr(char *, size_t, char *);
262
263/* Defines for kdb_symbol_print */
264#define KDB_SP_SPACEB 0x0001 /* Space before string */
265#define KDB_SP_SPACEA 0x0002 /* Space after string */
266#define KDB_SP_PAREN 0x0004 /* Parenthesis around string */
267#define KDB_SP_VALUE 0x0008 /* Print the value of the address */
268#define KDB_SP_SYMSIZE 0x0010 /* Print the size of the symbol */
269#define KDB_SP_NEWLINE 0x0020 /* Newline after string */
270#define KDB_SP_DEFAULT (KDB_SP_VALUE|KDB_SP_PAREN)
271
272#define KDB_TSK(cpu) kgdb_info[cpu].task
273#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo
274
275extern struct task_struct *kdb_curr_task(int);
276
277#define kdb_task_has_cpu(p) (task_curr(p))
278
279/* Simplify coexistence with NPTL */
280#define kdb_do_each_thread(g, p) do_each_thread(g, p)
281#define kdb_while_each_thread(g, p) while_each_thread(g, p)
282
283#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
284
285extern void *debug_kmalloc(size_t size, gfp_t flags);
286extern void debug_kfree(void *);
287extern void debug_kusage(void);
288
289extern void kdb_set_current_task(struct task_struct *);
290extern struct task_struct *kdb_current_task;
291#ifdef CONFIG_MODULES
292extern struct list_head *kdb_modules;
293#endif /* CONFIG_MODULES */
294
295extern char kdb_prompt_str[];
296
297#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
298
299#endif /* CONFIG_KGDB_KDB */
300#endif /* !_KDBPRIVATE_H */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
new file mode 100644
index 000000000000..45344d5c53dd
--- /dev/null
+++ b/kernel/debug/kdb/kdb_support.c
@@ -0,0 +1,927 @@
1/*
2 * Kernel Debugger Architecture Independent Support Functions
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 * 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net>
11 */
12
13#include <stdarg.h>
14#include <linux/types.h>
15#include <linux/sched.h>
16#include <linux/mm.h>
17#include <linux/kallsyms.h>
18#include <linux/stddef.h>
19#include <linux/vmalloc.h>
20#include <linux/ptrace.h>
21#include <linux/module.h>
22#include <linux/highmem.h>
23#include <linux/hardirq.h>
24#include <linux/delay.h>
25#include <linux/uaccess.h>
26#include <linux/kdb.h>
27#include <linux/slab.h>
28#include "kdb_private.h"
29
30/*
31 * kdbgetsymval - Return the address of the given symbol.
32 *
33 * Parameters:
34 * symname Character string containing symbol name
35 * symtab Structure to receive results
36 * Returns:
37 * 0 Symbol not found, symtab zero filled
38 * 1 Symbol mapped to module/symbol/section, data in symtab
39 */
40int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
41{
42 if (KDB_DEBUG(AR))
43 kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname,
44 symtab);
45 memset(symtab, 0, sizeof(*symtab));
46 symtab->sym_start = kallsyms_lookup_name(symname);
47 if (symtab->sym_start) {
48 if (KDB_DEBUG(AR))
49 kdb_printf("kdbgetsymval: returns 1, "
50 "symtab->sym_start=0x%lx\n",
51 symtab->sym_start);
52 return 1;
53 }
54 if (KDB_DEBUG(AR))
55 kdb_printf("kdbgetsymval: returns 0\n");
56 return 0;
57}
58EXPORT_SYMBOL(kdbgetsymval);
59
60static char *kdb_name_table[100]; /* arbitrary size */
61
62/*
63 * kdbnearsym - Return the name of the symbol with the nearest address
64 * less than 'addr'.
65 *
66 * Parameters:
67 * addr Address to check for symbol near
68 * symtab Structure to receive results
69 * Returns:
70 * 0 No sections contain this address, symtab zero filled
71 * 1 Address mapped to module/symbol/section, data in symtab
72 * Remarks:
73 * 2.6 kallsyms has a "feature" where it unpacks the name into a
74 * string. If that string is reused before the caller expects it
75 * then the caller sees its string change without warning. To
76 * avoid cluttering up the main kdb code with lots of kdb_strdup,
77 * tests and kfree calls, kdbnearsym maintains an LRU list of the
78 * last few unique strings. The list is sized large enough to
 79 * hold all active strings; no kdb caller of kdbnearsym makes more
 80 * than ~20 subsequent calls before using a saved value.
81 */
82int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
83{
84 int ret = 0;
85 unsigned long symbolsize;
86 unsigned long offset;
87#define knt1_size 128 /* must be >= kallsyms table size */
88 char *knt1 = NULL;
89
90 if (KDB_DEBUG(AR))
91 kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab);
92 memset(symtab, 0, sizeof(*symtab));
93
94 if (addr < 4096)
95 goto out;
96 knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
97 if (!knt1) {
98 kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n",
99 addr);
100 goto out;
101 }
 102	symtab->sym_name = kallsyms_lookup(addr, &symbolsize, &offset,
103 (char **)(&symtab->mod_name), knt1);
104 if (offset > 8*1024*1024) {
105 symtab->sym_name = NULL;
106 addr = offset = symbolsize = 0;
107 }
108 symtab->sym_start = addr - offset;
109 symtab->sym_end = symtab->sym_start + symbolsize;
110 ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0';
111
112 if (ret) {
113 int i;
114 /* Another 2.6 kallsyms "feature". Sometimes the sym_name is
115 * set but the buffer passed into kallsyms_lookup is not used,
116 * so it contains garbage. The caller has to work out which
117 * buffer needs to be saved.
118 *
119 * What was Rusty smoking when he wrote that code?
120 */
121 if (symtab->sym_name != knt1) {
122 strncpy(knt1, symtab->sym_name, knt1_size);
123 knt1[knt1_size-1] = '\0';
124 }
125 for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
126 if (kdb_name_table[i] &&
127 strcmp(kdb_name_table[i], knt1) == 0)
128 break;
129 }
130 if (i >= ARRAY_SIZE(kdb_name_table)) {
131 debug_kfree(kdb_name_table[0]);
132 memcpy(kdb_name_table, kdb_name_table+1,
133 sizeof(kdb_name_table[0]) *
134 (ARRAY_SIZE(kdb_name_table)-1));
135 } else {
136 debug_kfree(knt1);
137 knt1 = kdb_name_table[i];
138 memcpy(kdb_name_table+i, kdb_name_table+i+1,
139 sizeof(kdb_name_table[0]) *
140 (ARRAY_SIZE(kdb_name_table)-i-1));
141 }
142 i = ARRAY_SIZE(kdb_name_table) - 1;
143 kdb_name_table[i] = knt1;
144 symtab->sym_name = kdb_name_table[i];
145 knt1 = NULL;
146 }
147
148 if (symtab->mod_name == NULL)
149 symtab->mod_name = "kernel";
150 if (KDB_DEBUG(AR))
151 kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
152 "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret,
153 symtab->sym_start, symtab->mod_name, symtab->sym_name,
154 symtab->sym_name);
155
156out:
157 debug_kfree(knt1);
158 return ret;
159}
160
161void kdbnearsym_cleanup(void)
162{
163 int i;
164 for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
165 if (kdb_name_table[i]) {
166 debug_kfree(kdb_name_table[i]);
167 kdb_name_table[i] = NULL;
168 }
169 }
170}
171
172static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1];
173
174/*
175 * kallsyms_symbol_complete
176 *
177 * Parameters:
178 * prefix_name prefix of a symbol name to lookup
179 * max_len maximum length that can be returned
180 * Returns:
181 * Number of symbols which match the given prefix.
182 * Notes:
183 * prefix_name is changed to contain the longest unique prefix that
184 * starts with this prefix (tab completion).
185 */
186int kallsyms_symbol_complete(char *prefix_name, int max_len)
187{
188 loff_t pos = 0;
189 int prefix_len = strlen(prefix_name), prev_len = 0;
190 int i, number = 0;
191 const char *name;
192
193 while ((name = kdb_walk_kallsyms(&pos))) {
194 if (strncmp(name, prefix_name, prefix_len) == 0) {
195 strcpy(ks_namebuf, name);
196 /* Work out the longest name that matches the prefix */
197 if (++number == 1) {
198 prev_len = min_t(int, max_len-1,
199 strlen(ks_namebuf));
200 memcpy(ks_namebuf_prev, ks_namebuf, prev_len);
201 ks_namebuf_prev[prev_len] = '\0';
202 continue;
203 }
204 for (i = 0; i < prev_len; i++) {
205 if (ks_namebuf[i] != ks_namebuf_prev[i]) {
206 prev_len = i;
207 ks_namebuf_prev[i] = '\0';
208 break;
209 }
210 }
211 }
212 }
213 if (prev_len > prefix_len)
214 memcpy(prefix_name, ks_namebuf_prev, prev_len+1);
215 return number;
216}
217
218/*
219 * kallsyms_symbol_next
220 *
221 * Parameters:
222 * prefix_name prefix of a symbol name to lookup
223 * flag 0 means search from the head, 1 means continue search.
224 * Returns:
225 * 1 if a symbol matches the given prefix.
226 * 0 if no string found
227 */
228int kallsyms_symbol_next(char *prefix_name, int flag)
229{
230 int prefix_len = strlen(prefix_name);
231 static loff_t pos;
232 const char *name;
233
234 if (!flag)
235 pos = 0;
236
237 while ((name = kdb_walk_kallsyms(&pos))) {
238 if (strncmp(name, prefix_name, prefix_len) == 0) {
239 strncpy(prefix_name, name, strlen(name)+1);
240 return 1;
241 }
242 }
243 return 0;
244}
245
246/*
247 * kdb_symbol_print - Standard method for printing a symbol name and offset.
248 * Inputs:
249 * addr Address to be printed.
250 * symtab Address of symbol data, if NULL this routine does its
251 * own lookup.
252 * punc Punctuation for string, bit field.
253 * Remarks:
254 * The string and its punctuation is only printed if the address
255 * is inside the kernel, except that the value is always printed
256 * when requested.
257 */
258void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p,
259 unsigned int punc)
260{
261 kdb_symtab_t symtab, *symtab_p2;
262 if (symtab_p) {
263 symtab_p2 = (kdb_symtab_t *)symtab_p;
264 } else {
265 symtab_p2 = &symtab;
266 kdbnearsym(addr, symtab_p2);
267 }
268 if (!(symtab_p2->sym_name || (punc & KDB_SP_VALUE)))
269 return;
270 if (punc & KDB_SP_SPACEB)
271 kdb_printf(" ");
272 if (punc & KDB_SP_VALUE)
273 kdb_printf(kdb_machreg_fmt0, addr);
274 if (symtab_p2->sym_name) {
275 if (punc & KDB_SP_VALUE)
276 kdb_printf(" ");
277 if (punc & KDB_SP_PAREN)
278 kdb_printf("(");
279 if (strcmp(symtab_p2->mod_name, "kernel"))
280 kdb_printf("[%s]", symtab_p2->mod_name);
281 kdb_printf("%s", symtab_p2->sym_name);
282 if (addr != symtab_p2->sym_start)
283 kdb_printf("+0x%lx", addr - symtab_p2->sym_start);
284 if (punc & KDB_SP_SYMSIZE)
285 kdb_printf("/0x%lx",
286 symtab_p2->sym_end - symtab_p2->sym_start);
287 if (punc & KDB_SP_PAREN)
288 kdb_printf(")");
289 }
290 if (punc & KDB_SP_SPACEA)
291 kdb_printf(" ");
292 if (punc & KDB_SP_NEWLINE)
293 kdb_printf("\n");
294}
295
296/*
297 * kdb_strdup - kdb equivalent of strdup, for disasm code.
298 * Inputs:
299 * str The string to duplicate.
300 * type Flags to kmalloc for the new string.
301 * Returns:
302 * Address of the new string, NULL if storage could not be allocated.
303 * Remarks:
304 * This is not in lib/string.c because it uses kmalloc which is not
305 * available when string.o is used in boot loaders.
306 */
307char *kdb_strdup(const char *str, gfp_t type)
308{
309 int n = strlen(str)+1;
310 char *s = kmalloc(n, type);
311 if (!s)
312 return NULL;
313 return strcpy(s, str);
314}
315
316/*
317 * kdb_getarea_size - Read an area of data. The kdb equivalent of
318 * copy_from_user, with kdb messages for invalid addresses.
319 * Inputs:
320 * res Pointer to the area to receive the result.
321 * addr Address of the area to copy.
322 * size Size of the area.
323 * Returns:
324 * 0 for success, < 0 for error.
325 */
326int kdb_getarea_size(void *res, unsigned long addr, size_t size)
327{
328 int ret = probe_kernel_read((char *)res, (char *)addr, size);
329 if (ret) {
330 if (!KDB_STATE(SUPPRESS)) {
331 kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
332 KDB_STATE_SET(SUPPRESS);
333 }
334 ret = KDB_BADADDR;
335 } else {
336 KDB_STATE_CLEAR(SUPPRESS);
337 }
338 return ret;
339}
340
341/*
342 * kdb_putarea_size - Write an area of data. The kdb equivalent of
343 * copy_to_user, with kdb messages for invalid addresses.
344 * Inputs:
345 * addr Address of the area to write to.
346 * res Pointer to the area holding the data.
347 * size Size of the area.
348 * Returns:
349 * 0 for success, < 0 for error.
350 */
351int kdb_putarea_size(unsigned long addr, void *res, size_t size)
352{
353 int ret = probe_kernel_read((char *)addr, (char *)res, size);
354 if (ret) {
355 if (!KDB_STATE(SUPPRESS)) {
356 kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
357 KDB_STATE_SET(SUPPRESS);
358 }
359 ret = KDB_BADADDR;
360 } else {
361 KDB_STATE_CLEAR(SUPPRESS);
362 }
363 return ret;
364}
365
366/*
367 * kdb_getphys - Read data from a physical address. Validate the
368 * address is in range, use kmap_atomic() to get data
369 * similar to kdb_getarea() - but for phys addresses
370 * Inputs:
371 * res Pointer to the word to receive the result
372 * addr Physical address of the area to copy
373 * size Size of the area
374 * Returns:
375 * 0 for success, < 0 for error.
376 */
377static int kdb_getphys(void *res, unsigned long addr, size_t size)
378{
379 unsigned long pfn;
380 void *vaddr;
381 struct page *page;
382
383 pfn = (addr >> PAGE_SHIFT);
384 if (!pfn_valid(pfn))
385 return 1;
386 page = pfn_to_page(pfn);
387 vaddr = kmap_atomic(page, KM_KDB);
388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
389 kunmap_atomic(vaddr, KM_KDB);
390
391 return 0;
392}
393
394/*
395 * kdb_getphysword
396 * Inputs:
397 * word Pointer to the word to receive the result.
398 * addr Address of the area to copy.
399 * size Size of the area.
400 * Returns:
401 * 0 for success, < 0 for error.
402 */
403int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
404{
405 int diag;
406 __u8 w1;
407 __u16 w2;
408 __u32 w4;
409 __u64 w8;
410 *word = 0; /* Default value if addr or size is invalid */
411
412 switch (size) {
413 case 1:
414 diag = kdb_getphys(&w1, addr, sizeof(w1));
415 if (!diag)
416 *word = w1;
417 break;
418 case 2:
419 diag = kdb_getphys(&w2, addr, sizeof(w2));
420 if (!diag)
421 *word = w2;
422 break;
423 case 4:
424 diag = kdb_getphys(&w4, addr, sizeof(w4));
425 if (!diag)
426 *word = w4;
427 break;
428 case 8:
429 if (size <= sizeof(*word)) {
430 diag = kdb_getphys(&w8, addr, sizeof(w8));
431 if (!diag)
432 *word = w8;
433 break;
434 }
435 /* drop through */
436 default:
437 diag = KDB_BADWIDTH;
438 kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
439 }
440 return diag;
441}
442
443/*
444 * kdb_getword - Read a binary value. Unlike kdb_getarea, this treats
445 * data as numbers.
446 * Inputs:
447 * word Pointer to the word to receive the result.
448 * addr Address of the area to copy.
449 * size Size of the area.
450 * Returns:
451 * 0 for success, < 0 for error.
452 */
453int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
454{
455 int diag;
456 __u8 w1;
457 __u16 w2;
458 __u32 w4;
459 __u64 w8;
460 *word = 0; /* Default value if addr or size is invalid */
461 switch (size) {
462 case 1:
463 diag = kdb_getarea(w1, addr);
464 if (!diag)
465 *word = w1;
466 break;
467 case 2:
468 diag = kdb_getarea(w2, addr);
469 if (!diag)
470 *word = w2;
471 break;
472 case 4:
473 diag = kdb_getarea(w4, addr);
474 if (!diag)
475 *word = w4;
476 break;
477 case 8:
478 if (size <= sizeof(*word)) {
479 diag = kdb_getarea(w8, addr);
480 if (!diag)
481 *word = w8;
482 break;
483 }
484 /* drop through */
485 default:
486 diag = KDB_BADWIDTH;
487 kdb_printf("kdb_getword: bad width %ld\n", (long) size);
488 }
489 return diag;
490}
491
492/*
493 * kdb_putword - Write a binary value. Unlike kdb_putarea, this
494 * treats data as numbers.
495 * Inputs:
 496 *	addr	Address of the area to write to.
497 * word The value to set.
498 * size Size of the area.
499 * Returns:
500 * 0 for success, < 0 for error.
501 */
502int kdb_putword(unsigned long addr, unsigned long word, size_t size)
503{
504 int diag;
505 __u8 w1;
506 __u16 w2;
507 __u32 w4;
508 __u64 w8;
509 switch (size) {
510 case 1:
511 w1 = word;
512 diag = kdb_putarea(addr, w1);
513 break;
514 case 2:
515 w2 = word;
516 diag = kdb_putarea(addr, w2);
517 break;
518 case 4:
519 w4 = word;
520 diag = kdb_putarea(addr, w4);
521 break;
522 case 8:
523 if (size <= sizeof(word)) {
524 w8 = word;
525 diag = kdb_putarea(addr, w8);
526 break;
527 }
528 /* drop through */
529 default:
530 diag = KDB_BADWIDTH;
531 kdb_printf("kdb_putword: bad width %ld\n", (long) size);
532 }
533 return diag;
534}
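
Together, kdb_getword() and kdb_putword() give the width-checked read-modify-write that a command like 'mm' builds on. A hypothetical helper, not part of this patch, assuming the kdb declarations are in scope:

/* Illustrative only: OR a flag bit into the word at 'addr' without
 * dereferencing raw pointers. */
static int demo_set_bit(unsigned long addr, unsigned long bit)
{
	unsigned long word;
	int diag;

	diag = kdb_getword(&word, addr, sizeof(unsigned long));
	if (diag)
		return diag;		/* bad address or width */
	word |= bit;
	return kdb_putword(addr, word, sizeof(unsigned long));
}
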
535
536/*
537 * kdb_task_state_string - Convert a string containing any of the
538 * letters DRSTCZEUIMA to a mask for the process state field and
539 * return the value. If no argument is supplied, return the mask
540 * that corresponds to environment variable PS, DRSTCZEU by
541 * default.
542 * Inputs:
543 * s String to convert
544 * Returns:
545 * Mask for process state.
546 * Notes:
547 * The mask folds data from several sources into a single long value, so
 548 *	be careful not to overlap the bits.  TASK_* bits are in the LSB,
549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
550 * is no overlap between TASK_* and EXIT_* but that may not always be
551 * true, so EXIT_* bits are shifted left 16 bits before being stored in
552 * the mask.
553 */
554
555/* unrunnable is < 0 */
556#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1))
557#define RUNNING (1UL << (8*sizeof(unsigned long) - 2))
558#define IDLE (1UL << (8*sizeof(unsigned long) - 3))
559#define DAEMON (1UL << (8*sizeof(unsigned long) - 4))
560
561unsigned long kdb_task_state_string(const char *s)
562{
563 long res = 0;
564 if (!s) {
565 s = kdbgetenv("PS");
566 if (!s)
567 s = "DRSTCZEU"; /* default value for ps */
568 }
569 while (*s) {
570 switch (*s) {
571 case 'D':
572 res |= TASK_UNINTERRUPTIBLE;
573 break;
574 case 'R':
575 res |= RUNNING;
576 break;
577 case 'S':
578 res |= TASK_INTERRUPTIBLE;
579 break;
580 case 'T':
581 res |= TASK_STOPPED;
582 break;
583 case 'C':
584 res |= TASK_TRACED;
585 break;
586 case 'Z':
587 res |= EXIT_ZOMBIE << 16;
588 break;
589 case 'E':
590 res |= EXIT_DEAD << 16;
591 break;
592 case 'U':
593 res |= UNRUNNABLE;
594 break;
595 case 'I':
596 res |= IDLE;
597 break;
598 case 'M':
599 res |= DAEMON;
600 break;
601 case 'A':
602 res = ~0UL;
603 break;
604 default:
605 kdb_printf("%s: unknown flag '%c' ignored\n",
606 __func__, *s);
607 break;
608 }
609 ++s;
610 }
611 return res;
612}
613
614/*
615 * kdb_task_state_char - Return the character that represents the task state.
616 * Inputs:
617 * p struct task for the process
618 * Returns:
619 * One character to represent the task state.
620 */
621char kdb_task_state_char (const struct task_struct *p)
622{
623 int cpu;
624 char state;
625 unsigned long tmp;
626
627 if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
628 return 'E';
629
630 cpu = kdb_process_cpu(p);
631 state = (p->state == 0) ? 'R' :
632 (p->state < 0) ? 'U' :
633 (p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
634 (p->state & TASK_STOPPED) ? 'T' :
635 (p->state & TASK_TRACED) ? 'C' :
636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
637 (p->exit_state & EXIT_DEAD) ? 'E' :
638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
639 if (p->pid == 0) {
640 /* Idle task. Is it really idle, apart from the kdb
641 * interrupt? */
642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
643 if (cpu != kdb_initial_cpu)
644 state = 'I'; /* idle task */
645 }
646 } else if (!p->mm && state == 'S') {
647 state = 'M'; /* sleeping system daemon */
648 }
649 return state;
650}
651
652/*
653 * kdb_task_state - Return true if a process has the desired state
654 * given by the mask.
655 * Inputs:
656 * p struct task for the process
657 * mask mask from kdb_task_state_string to select processes
658 * Returns:
659 * True if the process matches at least one criteria defined by the mask.
660 */
661unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
662{
663 char state[] = { kdb_task_state_char(p), '\0' };
664 return (mask & kdb_task_state_string(state)) != 0;
665}
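
Taken together, kdb_task_state_string() and kdb_task_state() let a ps-style command filter the task list by state letter. A brief sketch of the usual pattern, assuming the kdb_do_each_thread()/kdb_while_each_thread() iteration macros from kdb_private.h; the "DRS" filter is illustrative:

	unsigned long mask = kdb_task_state_string("DRS");
	struct task_struct *g, *p;

	kdb_do_each_thread(g, p) {
		if (!kdb_task_state(p, mask))
			continue;
		kdb_printf("%d %c %s\n", p->pid,
			   kdb_task_state_char(p), p->comm);
	} kdb_while_each_thread(g, p);
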
666
667/*
668 * kdb_print_nameval - Print a name and its value, converting the
669 * value to a symbol lookup if possible.
670 * Inputs:
671 * name field name to print
672 * val value of field
673 */
674void kdb_print_nameval(const char *name, unsigned long val)
675{
676 kdb_symtab_t symtab;
677 kdb_printf(" %-11.11s ", name);
678 if (kdbnearsym(val, &symtab))
679 kdb_symbol_print(val, &symtab,
680 KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE);
681 else
682 kdb_printf("0x%lx\n", val);
683}
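
For example, a structure dumper can report each field either as a symbol or as raw hex with one call per field; the field names and values below are illustrative:

	kdb_print_nameval("threadinfo", (unsigned long)task_thread_info(p));
	kdb_print_nameval("flags", p->flags);
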
684
685/* Last ditch allocator for debugging, so we can still debug even when
686 * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
687 * for space usage, not for speed.  There is one smallish memory pool; the
688 * free chain is kept in ascending address order to allow coalescing, and
689 * allocations are done by brute-force best fit.
690 */
691
692struct debug_alloc_header {
693 u32 next; /* offset of next header from start of pool */
694 u32 size;
695 void *caller;
696};
697
698/* The memory returned by this allocator must be aligned, which means
699 * so must the header size. Do not assume that sizeof(struct
700 * debug_alloc_header) is a multiple of the alignment; explicitly
701 * calculate the overhead of this header, including the alignment.
702 * The rest of this code must not use sizeof() on any header or
703 * pointer to a header.
704 */
705#define dah_align 8
706#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align)
707
708static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */
709static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned;
710static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max;
711
712/* Locking is awkward. The debug code is called from all contexts,
713 * including non maskable interrupts. A normal spinlock is not safe
714 * in NMI context.  Try to get the debug allocator lock; if it cannot
715 * be obtained within a second, give up.  If the lock could not be
716 * obtained previously on this cpu, only try once.
717 *
718 * sparse has no annotation for "this function _sometimes_ acquires a
719 * lock", so fudge the acquire/release notation.
720 */
721static DEFINE_SPINLOCK(dap_lock);
722static int get_dap_lock(void)
723 __acquires(dap_lock)
724{
725 static int dap_locked = -1;
726 int count;
727 if (dap_locked == smp_processor_id())
728 count = 1;
729 else
730 count = 1000;
731 while (1) {
732 if (spin_trylock(&dap_lock)) {
733 dap_locked = -1;
734 return 1;
735 }
736 if (!count--)
737 break;
738 udelay(1000);
739 }
740 dap_locked = smp_processor_id();
741 __acquire(dap_lock);
742 return 0;
743}
744
745void *debug_kmalloc(size_t size, gfp_t flags)
746{
747 unsigned int rem, h_offset;
748 struct debug_alloc_header *best, *bestprev, *prev, *h;
749 void *p = NULL;
750 if (!get_dap_lock()) {
751 __release(dap_lock); /* we never actually got it */
752 return NULL;
753 }
754 h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
755 if (dah_first_call) {
756 h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead;
757 dah_first_call = 0;
758 }
759 size = ALIGN(size, dah_align);
760 prev = best = bestprev = NULL;
761 while (1) {
762 if (h->size >= size && (!best || h->size < best->size)) {
763 best = h;
764 bestprev = prev;
765 if (h->size == size)
766 break;
767 }
768 if (!h->next)
769 break;
770 prev = h;
771 h = (struct debug_alloc_header *)(debug_alloc_pool + h->next);
772 }
773 if (!best)
774 goto out;
775 rem = best->size - size;
776 /* The pool must always contain at least one header */
777 if (best->next == 0 && bestprev == NULL && rem < dah_overhead)
778 goto out;
779 if (rem >= dah_overhead) {
780 best->size = size;
781 h_offset = ((char *)best - debug_alloc_pool) +
782 dah_overhead + best->size;
783 h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset);
784 h->size = rem - dah_overhead;
785 h->next = best->next;
786 } else
787 h_offset = best->next;
788 best->caller = __builtin_return_address(0);
789 dah_used += best->size;
790 dah_used_max = max(dah_used, dah_used_max);
791 if (bestprev)
792 bestprev->next = h_offset;
793 else
794 dah_first = h_offset;
795 p = (char *)best + dah_overhead;
796 memset(p, POISON_INUSE, best->size - 1);
797 *((char *)p + best->size - 1) = POISON_END;
798out:
799 spin_unlock(&dap_lock);
800 return p;
801}
802
803void debug_kfree(void *p)
804{
805 struct debug_alloc_header *h;
806 unsigned int h_offset;
807 if (!p)
808 return;
809 if ((char *)p < debug_alloc_pool ||
810 (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) {
811 kfree(p);
812 return;
813 }
814 if (!get_dap_lock()) {
815 __release(dap_lock); /* we never actually got it */
816 return; /* memory leak, cannot be helped */
817 }
818 h = (struct debug_alloc_header *)((char *)p - dah_overhead);
819 memset(p, POISON_FREE, h->size - 1);
820 *((char *)p + h->size - 1) = POISON_END;
821 h->caller = NULL;
822 dah_used -= h->size;
823 h_offset = (char *)h - debug_alloc_pool;
824 if (h_offset < dah_first) {
825 h->next = dah_first;
826 dah_first = h_offset;
827 } else {
828 struct debug_alloc_header *prev;
829 unsigned int prev_offset;
830 prev = (struct debug_alloc_header *)(debug_alloc_pool +
831 dah_first);
832 while (1) {
833 if (!prev->next || prev->next > h_offset)
834 break;
835 prev = (struct debug_alloc_header *)
836 (debug_alloc_pool + prev->next);
837 }
838 prev_offset = (char *)prev - debug_alloc_pool;
839 if (prev_offset + dah_overhead + prev->size == h_offset) {
840 prev->size += dah_overhead + h->size;
841 memset(h, POISON_FREE, dah_overhead - 1);
842 *((char *)h + dah_overhead - 1) = POISON_END;
843 h = prev;
844 h_offset = prev_offset;
845 } else {
846 h->next = prev->next;
847 prev->next = h_offset;
848 }
849 }
850 if (h_offset + dah_overhead + h->size == h->next) {
851 struct debug_alloc_header *next;
852 next = (struct debug_alloc_header *)
853 (debug_alloc_pool + h->next);
854 h->size += dah_overhead + next->size;
855 h->next = next->next;
856 memset(next, POISON_FREE, dah_overhead - 1);
857 *((char *)next + dah_overhead - 1) = POISON_END;
858 }
859 spin_unlock(&dap_lock);
860}
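
debug_kmalloc()/debug_kfree() are used like kmalloc()/kfree(), with the caller prepared for NULL when the 256K pool is exhausted or dap_lock cannot be taken from this context. A minimal sketch; the structure type is illustrative:

	struct frame_record *fr;	/* illustrative type */

	fr = debug_kmalloc(sizeof(*fr), GFP_ATOMIC);
	if (!fr)
		return;			/* pool exhausted or lock unavailable */
	/* ... use fr while still inside the debugger ... */
	debug_kfree(fr);		/* pointers outside the pool fall back to kfree() */
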
861
862void debug_kusage(void)
863{
864 struct debug_alloc_header *h_free, *h_used;
865#ifdef CONFIG_IA64
866 /* FIXME: using dah for ia64 unwind always results in a memory leak.
867 * Fix that memory leak first, then set debug_kusage_one_time = 1 for
868 * all architectures.
869 */
870 static int debug_kusage_one_time;
871#else
872 static int debug_kusage_one_time = 1;
873#endif
874 if (!get_dap_lock()) {
875 __release(dap_lock); /* we never actually got it */
876 return;
877 }
878 h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
879 if (dah_first == 0 &&
880 (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead ||
881 dah_first_call))
882 goto out;
883 if (!debug_kusage_one_time)
884 goto out;
885 debug_kusage_one_time = 0;
886 kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n",
887 __func__, dah_first);
888 if (dah_first) {
889 h_used = (struct debug_alloc_header *)debug_alloc_pool;
890 kdb_printf("%s: h_used %p size %d\n", __func__, h_used,
891 h_used->size);
892 }
893 do {
894 h_used = (struct debug_alloc_header *)
895 ((char *)h_free + dah_overhead + h_free->size);
896 kdb_printf("%s: h_used %p size %d caller %p\n",
897 __func__, h_used, h_used->size, h_used->caller);
898 h_free = (struct debug_alloc_header *)
899 (debug_alloc_pool + h_free->next);
900 } while (h_free->next);
901 h_used = (struct debug_alloc_header *)
902 ((char *)h_free + dah_overhead + h_free->size);
903 if ((char *)h_used - debug_alloc_pool !=
904 sizeof(debug_alloc_pool_aligned))
905 kdb_printf("%s: h_used %p size %d caller %p\n",
906 __func__, h_used, h_used->size, h_used->caller);
907out:
908 spin_unlock(&dap_lock);
909}
910
911/* Maintain a small stack of kdb_flags to allow recursion without disturbing
912 * the global kdb state.
913 */
914
915static int kdb_flags_stack[4], kdb_flags_index;
916
917void kdb_save_flags(void)
918{
919 BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack));
920 kdb_flags_stack[kdb_flags_index++] = kdb_flags;
921}
922
923void kdb_restore_flags(void)
924{
925 BUG_ON(kdb_flags_index <= 0);
926 kdb_flags = kdb_flags_stack[--kdb_flags_index];
927}
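
The intended usage is a balanced push/pop around any path that may re-enter kdb, so the nested entry can change kdb_flags freely; a sketch, with the specific flag manipulation shown only as an illustration:

	kdb_save_flags();
	KDB_FLAG_SET(CMD_INTERRUPT);	/* illustrative: adjust state for the nested entry */
	/* ... run the recursive kdb command ... */
	kdb_restore_flags();		/* the outer invocation sees its original flags */
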
diff --git a/kernel/early_res.c b/kernel/early_res.c
new file mode 100644
index 000000000000..7bfae887f211
--- /dev/null
+++ b/kernel/early_res.c
@@ -0,0 +1,590 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10#include <linux/slab.h>
11#include <linux/kmemleak.h>
12
13/*
14 * Early reserved memory areas.
15 */
16/*
17 * need to make sure this one is big enough before
18 * find_fw_memmap_area() can be used
19 */
20#define MAX_EARLY_RES_X 32
21
22struct early_res {
23 u64 start, end;
24 char name[15];
25 char overlap_ok;
26};
27static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
28
29static int max_early_res __initdata = MAX_EARLY_RES_X;
30static struct early_res *early_res __initdata = &early_res_x[0];
31static int early_res_count __initdata;
32
33static int __init find_overlapped_early(u64 start, u64 end)
34{
35 int i;
36 struct early_res *r;
37
38 for (i = 0; i < max_early_res && early_res[i].end; i++) {
39 r = &early_res[i];
40 if (end > r->start && start < r->end)
41 break;
42 }
43
44 return i;
45}
46
47/*
48 * Drop the i-th range from the early reservation map,
49 * by copying any higher ranges down one over it, and
50 * clearing what had been the last slot.
51 */
52static void __init drop_range(int i)
53{
54 int j;
55
56 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
57 ;
58
59 memmove(&early_res[i], &early_res[i + 1],
60 (j - 1 - i) * sizeof(struct early_res));
61
62 early_res[j - 1].end = 0;
63 early_res_count--;
64}
65
66static void __init drop_range_partial(int i, u64 start, u64 end)
67{
68 u64 common_start, common_end;
69 u64 old_start, old_end;
70
71 old_start = early_res[i].start;
72 old_end = early_res[i].end;
73 common_start = max(old_start, start);
74 common_end = min(old_end, end);
75
76 /* no overlap ? */
77 if (common_start >= common_end)
78 return;
79
80 if (old_start < common_start) {
81 /* make head segment */
82 early_res[i].end = common_start;
83 if (old_end > common_end) {
84 char name[15];
85
86 /*
87 * Save a local copy of the name, since the
88 * early_res array could get resized inside
89 * reserve_early_without_check() ->
90 * __check_and_double_early_res(), which would
91 * make the current name pointer invalid.
92 */
93 strncpy(name, early_res[i].name,
94 sizeof(early_res[i].name) - 1);
95 /* add another for left over on tail */
96 reserve_early_without_check(common_end, old_end, name);
97 }
98 return;
99 } else {
100 if (old_end > common_end) {
101 /* reuse the entry for tail left */
102 early_res[i].start = common_end;
103 return;
104 }
105 /* all covered */
106 drop_range(i);
107 }
108}
109
110/*
111 * Split any existing ranges that:
112 * 1) are marked 'overlap_ok', and
113 * 2) overlap with the stated range [start, end)
114 * into whatever portion (if any) of the existing range is entirely
115 * below or entirely above the stated range. Drop the portion
116 * of the existing range that overlaps with the stated range,
117 * which will allow the caller of this routine to then add that
118 * stated range without conflicting with any existing range.
119 */
120static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
121{
122 int i;
123 struct early_res *r;
124 u64 lower_start, lower_end;
125 u64 upper_start, upper_end;
126 char name[15];
127
128 for (i = 0; i < max_early_res && early_res[i].end; i++) {
129 r = &early_res[i];
130
131 /* Continue past non-overlapping ranges */
132 if (end <= r->start || start >= r->end)
133 continue;
134
135 /*
136 * Leave non-ok overlaps as is; let caller
137 * panic "Overlapping early reservations"
138 * when it hits this overlap.
139 */
140 if (!r->overlap_ok)
141 return;
142
143 /*
144 * We have an ok overlap. We will drop it from the early
145 * reservation map, and add back in any non-overlapping
146 * portions (lower or upper) as separate, overlap_ok,
147 * non-overlapping ranges.
148 */
149
150 /* 1. Note any non-overlapping (lower or upper) ranges. */
151 strncpy(name, r->name, sizeof(name) - 1);
152
153 lower_start = lower_end = 0;
154 upper_start = upper_end = 0;
155 if (r->start < start) {
156 lower_start = r->start;
157 lower_end = start;
158 }
159 if (r->end > end) {
160 upper_start = end;
161 upper_end = r->end;
162 }
163
164 /* 2. Drop the original ok overlapping range */
165 drop_range(i);
166
167 i--; /* resume for-loop on copied down entry */
168
169 /* 3. Add back in any non-overlapping ranges. */
170 if (lower_end)
171 reserve_early_overlap_ok(lower_start, lower_end, name);
172 if (upper_end)
173 reserve_early_overlap_ok(upper_start, upper_end, name);
174 }
175}
176
177static void __init __reserve_early(u64 start, u64 end, char *name,
178 int overlap_ok)
179{
180 int i;
181 struct early_res *r;
182
183 i = find_overlapped_early(start, end);
184 if (i >= max_early_res)
185 panic("Too many early reservations");
186 r = &early_res[i];
187 if (r->end)
188 panic("Overlapping early reservations "
189 "%llx-%llx %s to %llx-%llx %s\n",
190 start, end - 1, name ? name : "", r->start,
191 r->end - 1, r->name);
192 r->start = start;
193 r->end = end;
194 r->overlap_ok = overlap_ok;
195 if (name)
196 strncpy(r->name, name, sizeof(r->name) - 1);
197 early_res_count++;
198}
199
200/*
201 * A few early reservations come here.
202 *
203 * The 'overlap_ok' in the name of this routine does -not- mean it
204 * is ok for these reservations to overlap an earlier reservation.
205 * Rather it means that it is ok for subsequent reservations to
206 * overlap this one.
207 *
208 * Use this entry point to reserve early ranges when you are doing
209 * so out of "Paranoia", reserving perhaps more memory than you need,
210 * just in case, and don't mind a subsequent overlapping reservation
211 * that is known to be needed.
212 *
213 * The drop_overlaps_that_are_ok() call here isn't really needed.
214 * It would be needed if we had two colliding 'overlap_ok'
215 * reservations, so that the second such would not panic on the
216 * overlap with the first. We don't have any such as of this
217 * writing, but might as well tolerate such if it happens in
218 * the future.
219 */
220void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
221{
222 drop_overlaps_that_are_ok(start, end);
223 __reserve_early(start, end, name, 1);
224}
225
226static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
227{
228 u64 start, end, size, mem;
229 struct early_res *new;
230
231 /* do we have enough slots left ? */
232 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
233 return;
234
235 /* double it */
236 mem = -1ULL;
237 size = sizeof(struct early_res) * max_early_res * 2;
238 if (early_res == early_res_x)
239 start = 0;
240 else
241 start = early_res[0].end;
242 end = ex_start;
243 if (start + size < end)
244 mem = find_fw_memmap_area(start, end, size,
245 sizeof(struct early_res));
246 if (mem == -1ULL) {
247 start = ex_end;
248 end = get_max_mapped();
249 if (start + size < end)
250 mem = find_fw_memmap_area(start, end, size,
251 sizeof(struct early_res));
252 }
253 if (mem == -1ULL)
254 panic("can not find more space for early_res array");
255
256 new = __va(mem);
257 /* save the first one for own */
258 new[0].start = mem;
259 new[0].end = mem + size;
260 new[0].overlap_ok = 0;
261 /* copy old to new */
262 if (early_res == early_res_x) {
263 memcpy(&new[1], &early_res[0],
264 sizeof(struct early_res) * max_early_res);
265 memset(&new[max_early_res+1], 0,
266 sizeof(struct early_res) * (max_early_res - 1));
267 early_res_count++;
268 } else {
269 memcpy(&new[1], &early_res[1],
270 sizeof(struct early_res) * (max_early_res - 1));
271 memset(&new[max_early_res], 0,
272 sizeof(struct early_res) * max_early_res);
273 }
274 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
275 early_res = new;
276 max_early_res *= 2;
277 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
278 max_early_res, mem, mem + size - 1);
279}
280
281/*
282 * Most early reservations come here.
283 *
284 * We first have drop_overlaps_that_are_ok() drop any pre-existing
285 * 'overlap_ok' ranges, so that we can then reserve this memory
286 * range without risk of panic'ing on an overlapping overlap_ok
287 * early reservation.
288 */
289void __init reserve_early(u64 start, u64 end, char *name)
290{
291 if (start >= end)
292 return;
293
294 __check_and_double_early_res(start, end);
295
296 drop_overlaps_that_are_ok(start, end);
297 __reserve_early(start, end, name, 0);
298}
299
300void __init reserve_early_without_check(u64 start, u64 end, char *name)
301{
302 struct early_res *r;
303
304 if (start >= end)
305 return;
306
307 __check_and_double_early_res(start, end);
308
309 r = &early_res[early_res_count];
310
311 r->start = start;
312 r->end = end;
313 r->overlap_ok = 0;
314 if (name)
315 strncpy(r->name, name, sizeof(r->name) - 1);
316 early_res_count++;
317}
318
319void __init free_early(u64 start, u64 end)
320{
321 struct early_res *r;
322 int i;
323
324 kmemleak_free_part(__va(start), end - start);
325
326 i = find_overlapped_early(start, end);
327 r = &early_res[i];
328 if (i >= max_early_res || r->end != end || r->start != start)
329 panic("free_early on not reserved area: %llx-%llx!",
330 start, end - 1);
331
332 drop_range(i);
333}
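
Boot-time callers reserve a firmware or bootstrap area with reserve_early() and hand it back with free_early() once the data has been consumed; both take [start, end) physical addresses, and free_early() must name exactly the range that was reserved. A sketch with illustrative addresses:

	/* keep a hypothetical firmware table out of the early allocators */
	reserve_early(0x9f000, 0xa0000, "FW TABLE");

	/* ... later, after the table has been copied somewhere safe ... */
	free_early(0x9f000, 0xa0000);
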
334
335void __init free_early_partial(u64 start, u64 end)
336{
337 struct early_res *r;
338 int i;
339
340 kmemleak_free_part(__va(start), end - start);
341
342 if (start == end)
343 return;
344
345 if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end))
346 return;
347
348try_next:
349 i = find_overlapped_early(start, end);
350 if (i >= max_early_res)
351 return;
352
353 r = &early_res[i];
354 /* hole ? */
355 if (r->end >= end && r->start <= start) {
356 drop_range_partial(i, start, end);
357 return;
358 }
359
360 drop_range_partial(i, start, end);
361 goto try_next;
362}
363
364#ifdef CONFIG_NO_BOOTMEM
365static void __init subtract_early_res(struct range *range, int az)
366{
367 int i, count;
368 u64 final_start, final_end;
369 int idx = 0;
370
371 count = 0;
372 for (i = 0; i < max_early_res && early_res[i].end; i++)
373 count++;
374
375	/* need to skip the first one? */
376 if (early_res != early_res_x)
377 idx = 1;
378
379#define DEBUG_PRINT_EARLY_RES 1
380
381#if DEBUG_PRINT_EARLY_RES
382 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
383#endif
384 for (i = idx; i < count; i++) {
385 struct early_res *r = &early_res[i];
386#if DEBUG_PRINT_EARLY_RES
387 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
388 r->start, r->end, r->name);
389#endif
390 final_start = PFN_DOWN(r->start);
391 final_end = PFN_UP(r->end);
392 if (final_start >= final_end)
393 continue;
394 subtract_range(range, az, final_start, final_end);
395 }
396
397}
398
399int __init get_free_all_memory_range(struct range **rangep, int nodeid)
400{
401 int i, count;
402 u64 start = 0, end;
403 u64 size;
404 u64 mem;
405 struct range *range;
406 int nr_range;
407
408 count = 0;
409 for (i = 0; i < max_early_res && early_res[i].end; i++)
410 count++;
411
412 count *= 2;
413
414 size = sizeof(struct range) * count;
415 end = get_max_mapped();
416#ifdef MAX_DMA32_PFN
417 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
418 start = MAX_DMA32_PFN << PAGE_SHIFT;
419#endif
420 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
421 if (mem == -1ULL)
422 panic("can not find more space for range free");
423
424 range = __va(mem);
425 /* use early_node_map[] and early_res to get range array at first */
426 memset(range, 0, size);
427 nr_range = 0;
428
429 /* need to go over early_node_map to find out good range for node */
430 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
431#ifdef CONFIG_X86_32
432 subtract_range(range, count, max_low_pfn, -1ULL);
433#endif
434 subtract_early_res(range, count);
435 nr_range = clean_sort_range(range, count);
436
437 /* need to clear it ? */
438 if (nodeid == MAX_NUMNODES) {
439 memset(&early_res[0], 0,
440 sizeof(struct early_res) * max_early_res);
441 early_res = NULL;
442 max_early_res = 0;
443 }
444
445 *rangep = range;
446 return nr_range;
447}
448#else
449void __init early_res_to_bootmem(u64 start, u64 end)
450{
451 int i, count;
452 u64 final_start, final_end;
453 int idx = 0;
454
455 count = 0;
456 for (i = 0; i < max_early_res && early_res[i].end; i++)
457 count++;
458
459	/* need to skip the first one? */
460 if (early_res != early_res_x)
461 idx = 1;
462
463 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
464 count - idx, max_early_res, start, end);
465 for (i = idx; i < count; i++) {
466 struct early_res *r = &early_res[i];
467 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
468 r->start, r->end, r->name);
469 final_start = max(start, r->start);
470 final_end = min(end, r->end);
471 if (final_start >= final_end) {
472 printk(KERN_CONT "\n");
473 continue;
474 }
475 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
476 final_start, final_end);
477 reserve_bootmem_generic(final_start, final_end - final_start,
478 BOOTMEM_DEFAULT);
479 }
480 /* clear them */
481 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
482 early_res = NULL;
483 max_early_res = 0;
484 early_res_count = 0;
485}
486#endif
487
488/* Check for already reserved areas */
489static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
490{
491 int i;
492 u64 addr = *addrp;
493 int changed = 0;
494 struct early_res *r;
495again:
496 i = find_overlapped_early(addr, addr + size);
497 r = &early_res[i];
498 if (i < max_early_res && r->end) {
499 *addrp = addr = round_up(r->end, align);
500 changed = 1;
501 goto again;
502 }
503 return changed;
504}
505
506/* Check for already reserved areas */
507static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
508{
509 int i;
510 u64 addr = *addrp, last;
511 u64 size = *sizep;
512 int changed = 0;
513again:
514 last = addr + size;
515 for (i = 0; i < max_early_res && early_res[i].end; i++) {
516 struct early_res *r = &early_res[i];
517 if (last > r->start && addr < r->start) {
518 size = r->start - addr;
519 changed = 1;
520 goto again;
521 }
522 if (last > r->end && addr < r->end) {
523 addr = round_up(r->end, align);
524 size = last - addr;
525 changed = 1;
526 goto again;
527 }
528 if (last <= r->end && addr >= r->start) {
529 (*sizep)++;
530 return 0;
531 }
532 }
533 if (changed) {
534 *addrp = addr;
535 *sizep = size;
536 }
537 return changed;
538}
539
540/*
541 * Find a free area with specified alignment in a specific range.
542 * Only the area between start and end is an active range from early_node_map,
543 * so it is known to be usable RAM.
544 */
545u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
546 u64 size, u64 align)
547{
548 u64 addr, last;
549
550 addr = round_up(ei_start, align);
551 if (addr < start)
552 addr = round_up(start, align);
553 if (addr >= ei_last)
554 goto out;
555 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
556 ;
557 last = addr + size;
558 if (last > ei_last)
559 goto out;
560 if (last > end)
561 goto out;
562
563 return addr;
564
565out:
566 return -1ULL;
567}
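
find_early_area() only locates a gap; the caller is expected to reserve it immediately so nothing else claims it. A sketch of the usual pairing, with illustrative variable names and a PAGE_SIZE alignment:

	u64 addr = find_early_area(ei_start, ei_last, start, end,
				   size, PAGE_SIZE);
	if (addr == -1ULL)
		return;		/* no suitable gap in this range */
	reserve_early(addr, addr + size, "EARLY DATA");
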
568
569u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
570 u64 *sizep, u64 align)
571{
572 u64 addr, last;
573
574 addr = round_up(ei_start, align);
575 if (addr < start)
576 addr = round_up(start, align);
577 if (addr >= ei_last)
578 goto out;
579 *sizep = ei_last - addr;
580 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
581 ;
582 last = addr + *sizep;
583 if (last > ei_last)
584 goto out;
585
586 return addr;
587
588out:
589 return -1ULL;
590}
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
new file mode 100644
index 000000000000..ff915efef66d
--- /dev/null
+++ b/kernel/elfcore.c
@@ -0,0 +1,28 @@
1#include <linux/elf.h>
2#include <linux/fs.h>
3#include <linux/mm.h>
4
5#include <asm/elf.h>
6
7
8Elf_Half __weak elf_core_extra_phdrs(void)
9{
10 return 0;
11}
12
13int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
14 unsigned long limit)
15{
16 return 1;
17}
18
19int __weak elf_core_write_extra_data(struct file *file, size_t *size,
20 unsigned long limit)
21{
22 return 1;
23}
24
25size_t __weak elf_core_extra_data_size(void)
26{
27 return 0;
28}
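
An architecture opts in by supplying strong definitions with the same signatures, which override these __weak stubs at link time. A hedged sketch of such an override; the single extra program header is illustrative:

	Elf_Half elf_core_extra_phdrs(void)
	{
		return 1;	/* e.g. one arch-specific segment written by the other hooks */
	}
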
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index c35452cadded..dd62f8e714ca 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -27,7 +27,7 @@ static struct exec_domain *exec_domains = &default_exec_domain;
27static DEFINE_RWLOCK(exec_domains_lock); 27static DEFINE_RWLOCK(exec_domains_lock);
28 28
29 29
30static u_long ident_map[32] = { 30static unsigned long ident_map[32] = {
31 0, 1, 2, 3, 4, 5, 6, 7, 31 0, 1, 2, 3, 4, 5, 6, 7,
32 8, 9, 10, 11, 12, 13, 14, 15, 32 8, 9, 10, 11, 12, 13, 14, 15,
33 16, 17, 18, 19, 20, 21, 22, 23, 33 16, 17, 18, 19, 20, 21, 22, 23,
@@ -56,10 +56,10 @@ default_handler(int segment, struct pt_regs *regp)
56} 56}
57 57
58static struct exec_domain * 58static struct exec_domain *
59lookup_exec_domain(u_long personality) 59lookup_exec_domain(unsigned int personality)
60{ 60{
61 struct exec_domain * ep; 61 unsigned int pers = personality(personality);
62 u_long pers = personality(personality); 62 struct exec_domain *ep;
63 63
64 read_lock(&exec_domains_lock); 64 read_lock(&exec_domains_lock);
65 for (ep = exec_domains; ep; ep = ep->next) { 65 for (ep = exec_domains; ep; ep = ep->next) {
@@ -70,7 +70,7 @@ lookup_exec_domain(u_long personality)
70 70
71#ifdef CONFIG_MODULES 71#ifdef CONFIG_MODULES
72 read_unlock(&exec_domains_lock); 72 read_unlock(&exec_domains_lock);
73 request_module("personality-%ld", pers); 73 request_module("personality-%d", pers);
74 read_lock(&exec_domains_lock); 74 read_lock(&exec_domains_lock);
75 75
76 for (ep = exec_domains; ep; ep = ep->next) { 76 for (ep = exec_domains; ep; ep = ep->next) {
@@ -135,7 +135,7 @@ unregister:
135} 135}
136 136
137int 137int
138__set_personality(u_long personality) 138__set_personality(unsigned int personality)
139{ 139{
140 struct exec_domain *ep, *oep; 140 struct exec_domain *ep, *oep;
141 141
@@ -188,9 +188,9 @@ static int __init proc_execdomains_init(void)
188module_init(proc_execdomains_init); 188module_init(proc_execdomains_init);
189#endif 189#endif
190 190
191SYSCALL_DEFINE1(personality, u_long, personality) 191SYSCALL_DEFINE1(personality, unsigned int, personality)
192{ 192{
193 u_long old = current->personality; 193 unsigned int old = current->personality;
194 194
195 if (personality != 0xffffffff) { 195 if (personality != 0xffffffff) {
196 set_personality(personality); 196 set_personality(personality);
@@ -198,7 +198,7 @@ SYSCALL_DEFINE1(personality, u_long, personality)
198 return -EINVAL; 198 return -EINVAL;
199 } 199 }
200 200
201 return (long)old; 201 return old;
202} 202}
203 203
204 204
diff --git a/kernel/exit.c b/kernel/exit.c
index 546774a31a66..ceffc67b564a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,15 +55,14 @@
55#include <asm/unistd.h> 55#include <asm/unistd.h>
56#include <asm/pgtable.h> 56#include <asm/pgtable.h>
57#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
58#include "cred-internals.h"
59 58
60static void exit_mm(struct task_struct * tsk); 59static void exit_mm(struct task_struct * tsk);
61 60
62static void __unhash_process(struct task_struct *p) 61static void __unhash_process(struct task_struct *p, bool group_dead)
63{ 62{
64 nr_threads--; 63 nr_threads--;
65 detach_pid(p, PIDTYPE_PID); 64 detach_pid(p, PIDTYPE_PID);
66 if (thread_group_leader(p)) { 65 if (group_dead) {
67 detach_pid(p, PIDTYPE_PGID); 66 detach_pid(p, PIDTYPE_PGID);
68 detach_pid(p, PIDTYPE_SID); 67 detach_pid(p, PIDTYPE_SID);
69 68
@@ -80,23 +79,26 @@ static void __unhash_process(struct task_struct *p)
80static void __exit_signal(struct task_struct *tsk) 79static void __exit_signal(struct task_struct *tsk)
81{ 80{
82 struct signal_struct *sig = tsk->signal; 81 struct signal_struct *sig = tsk->signal;
82 bool group_dead = thread_group_leader(tsk);
83 struct sighand_struct *sighand; 83 struct sighand_struct *sighand;
84 struct tty_struct *uninitialized_var(tty);
84 85
85 BUG_ON(!sig); 86 sighand = rcu_dereference_check(tsk->sighand,
86 BUG_ON(!atomic_read(&sig->count)); 87 rcu_read_lock_held() ||
87 88 lockdep_tasklist_lock_is_held());
88 sighand = rcu_dereference(tsk->sighand);
89 spin_lock(&sighand->siglock); 89 spin_lock(&sighand->siglock);
90 90
91 posix_cpu_timers_exit(tsk); 91 posix_cpu_timers_exit(tsk);
92 if (atomic_dec_and_test(&sig->count)) 92 if (group_dead) {
93 posix_cpu_timers_exit_group(tsk); 93 posix_cpu_timers_exit_group(tsk);
94 else { 94 tty = sig->tty;
95 sig->tty = NULL;
96 } else {
95 /* 97 /*
96 * If there is any task waiting for the group exit 98 * If there is any task waiting for the group exit
97 * then notify it: 99 * then notify it:
98 */ 100 */
99 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) 101 if (sig->notify_count > 0 && !--sig->notify_count)
100 wake_up_process(sig->group_exit_task); 102 wake_up_process(sig->group_exit_task);
101 103
102 if (tsk == sig->curr_target) 104 if (tsk == sig->curr_target)
@@ -122,32 +124,24 @@ static void __exit_signal(struct task_struct *tsk)
122 sig->oublock += task_io_get_oublock(tsk); 124 sig->oublock += task_io_get_oublock(tsk);
123 task_io_accounting_add(&sig->ioac, &tsk->ioac); 125 task_io_accounting_add(&sig->ioac, &tsk->ioac);
124 sig->sum_sched_runtime += tsk->se.sum_exec_runtime; 126 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
125 sig = NULL; /* Marker for below. */
126 } 127 }
127 128
128 __unhash_process(tsk); 129 sig->nr_threads--;
130 __unhash_process(tsk, group_dead);
129 131
130 /* 132 /*
131 * Do this under ->siglock, we can race with another thread 133 * Do this under ->siglock, we can race with another thread
132 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. 134 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
133 */ 135 */
134 flush_sigqueue(&tsk->pending); 136 flush_sigqueue(&tsk->pending);
135
136 tsk->signal = NULL;
137 tsk->sighand = NULL; 137 tsk->sighand = NULL;
138 spin_unlock(&sighand->siglock); 138 spin_unlock(&sighand->siglock);
139 139
140 __cleanup_sighand(sighand); 140 __cleanup_sighand(sighand);
141 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 141 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
142 if (sig) { 142 if (group_dead) {
143 flush_sigqueue(&sig->shared_pending); 143 flush_sigqueue(&sig->shared_pending);
144 taskstats_tgid_free(sig); 144 tty_kref_put(tty);
145 /*
146 * Make sure ->signal can't go away under rq->lock,
147 * see account_group_exec_runtime().
148 */
149 task_rq_unlock_wait(tsk);
150 __cleanup_signal(sig);
151 } 145 }
152} 146}
153 147
@@ -170,8 +164,10 @@ void release_task(struct task_struct * p)
170repeat: 164repeat:
171 tracehook_prepare_release_task(p); 165 tracehook_prepare_release_task(p);
172 /* don't need to get the RCU readlock here - the process is dead and 166 /* don't need to get the RCU readlock here - the process is dead and
173 * can't be modifying its own credentials */ 167 * can't be modifying its own credentials. But shut RCU-lockdep up */
168 rcu_read_lock();
174 atomic_dec(&__task_cred(p)->user->processes); 169 atomic_dec(&__task_cred(p)->user->processes);
170 rcu_read_unlock();
175 171
176 proc_flush_task(p); 172 proc_flush_task(p);
177 173
@@ -473,9 +469,11 @@ static void close_files(struct files_struct * files)
473 /* 469 /*
474 * It is safe to dereference the fd table without RCU or 470 * It is safe to dereference the fd table without RCU or
475 * ->file_lock because this is the last reference to the 471 * ->file_lock because this is the last reference to the
476 * files structure. 472 * files structure. But use RCU to shut RCU-lockdep up.
477 */ 473 */
474 rcu_read_lock();
478 fdt = files_fdtable(files); 475 fdt = files_fdtable(files);
476 rcu_read_unlock();
479 for (;;) { 477 for (;;) {
480 unsigned long set; 478 unsigned long set;
481 i = j * __NFDBITS; 479 i = j * __NFDBITS;
@@ -521,10 +519,12 @@ void put_files_struct(struct files_struct *files)
521 * at the end of the RCU grace period. Otherwise, 519 * at the end of the RCU grace period. Otherwise,
522 * you can free files immediately. 520 * you can free files immediately.
523 */ 521 */
522 rcu_read_lock();
524 fdt = files_fdtable(files); 523 fdt = files_fdtable(files);
525 if (fdt != &files->fdtab) 524 if (fdt != &files->fdtab)
526 kmem_cache_free(files_cachep, files); 525 kmem_cache_free(files_cachep, files);
527 free_fdtable(fdt); 526 free_fdtable(fdt);
527 rcu_read_unlock();
528 } 528 }
529} 529}
530 530
@@ -849,12 +849,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
849 849
850 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; 850 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
851 851
852 /* mt-exec, de_thread() is waiting for us */ 852 /* mt-exec, de_thread() is waiting for group leader */
853 if (thread_group_leader(tsk) && 853 if (unlikely(tsk->signal->notify_count < 0))
854 tsk->signal->group_exit_task &&
855 tsk->signal->notify_count < 0)
856 wake_up_process(tsk->signal->group_exit_task); 854 wake_up_process(tsk->signal->group_exit_task);
857
858 write_unlock_irq(&tasklist_lock); 855 write_unlock_irq(&tasklist_lock);
859 856
860 tracehook_report_death(tsk, signal, cookie, group_dead); 857 tracehook_report_death(tsk, signal, cookie, group_dead);
@@ -944,7 +941,9 @@ NORET_TYPE void do_exit(long code)
944 preempt_count()); 941 preempt_count());
945 942
946 acct_update_integrals(tsk); 943 acct_update_integrals(tsk);
947 944 /* sync mm's RSS info before statistics gathering */
945 if (tsk->mm)
946 sync_mm_rss(tsk, tsk->mm);
948 group_dead = atomic_dec_and_test(&tsk->signal->live); 947 group_dead = atomic_dec_and_test(&tsk->signal->live);
949 if (group_dead) { 948 if (group_dead) {
950 hrtimer_cancel(&tsk->signal->real_timer); 949 hrtimer_cancel(&tsk->signal->real_timer);
@@ -993,8 +992,10 @@ NORET_TYPE void do_exit(long code)
993 992
994 exit_notify(tsk, group_dead); 993 exit_notify(tsk, group_dead);
995#ifdef CONFIG_NUMA 994#ifdef CONFIG_NUMA
995 task_lock(tsk);
996 mpol_put(tsk->mempolicy); 996 mpol_put(tsk->mempolicy);
997 tsk->mempolicy = NULL; 997 tsk->mempolicy = NULL;
998 task_unlock(tsk);
998#endif 999#endif
999#ifdef CONFIG_FUTEX 1000#ifdef CONFIG_FUTEX
1000 if (unlikely(current->pi_state_cache)) 1001 if (unlikely(current->pi_state_cache))
@@ -1180,7 +1181,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1180 1181
1181 if (unlikely(wo->wo_flags & WNOWAIT)) { 1182 if (unlikely(wo->wo_flags & WNOWAIT)) {
1182 int exit_code = p->exit_code; 1183 int exit_code = p->exit_code;
1183 int why, status; 1184 int why;
1184 1185
1185 get_task_struct(p); 1186 get_task_struct(p);
1186 read_unlock(&tasklist_lock); 1187 read_unlock(&tasklist_lock);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5b2959b3ffc2..b6cce14ba047 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -87,6 +87,14 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
87 87
88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
89 89
90#ifdef CONFIG_PROVE_RCU
91int lockdep_tasklist_lock_is_held(void)
92{
93 return lockdep_is_held(&tasklist_lock);
94}
95EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
96#endif /* #ifdef CONFIG_PROVE_RCU */
97
90int nr_processes(void) 98int nr_processes(void)
91{ 99{
92 int cpu; 100 int cpu;
@@ -157,6 +165,18 @@ void free_task(struct task_struct *tsk)
157} 165}
158EXPORT_SYMBOL(free_task); 166EXPORT_SYMBOL(free_task);
159 167
168static inline void free_signal_struct(struct signal_struct *sig)
169{
170 taskstats_tgid_free(sig);
171 kmem_cache_free(signal_cachep, sig);
172}
173
174static inline void put_signal_struct(struct signal_struct *sig)
175{
176 if (atomic_dec_and_test(&sig->sigcnt))
177 free_signal_struct(sig);
178}
179
160void __put_task_struct(struct task_struct *tsk) 180void __put_task_struct(struct task_struct *tsk)
161{ 181{
162 WARN_ON(!tsk->exit_state); 182 WARN_ON(!tsk->exit_state);
@@ -165,6 +185,7 @@ void __put_task_struct(struct task_struct *tsk)
165 185
166 exit_creds(tsk); 186 exit_creds(tsk);
167 delayacct_tsk_free(tsk); 187 delayacct_tsk_free(tsk);
188 put_signal_struct(tsk->signal);
168 189
169 if (!profile_handoff_task(tsk)) 190 if (!profile_handoff_task(tsk))
170 free_task(tsk); 191 free_task(tsk);
@@ -328,15 +349,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
328 if (!tmp) 349 if (!tmp)
329 goto fail_nomem; 350 goto fail_nomem;
330 *tmp = *mpnt; 351 *tmp = *mpnt;
352 INIT_LIST_HEAD(&tmp->anon_vma_chain);
331 pol = mpol_dup(vma_policy(mpnt)); 353 pol = mpol_dup(vma_policy(mpnt));
332 retval = PTR_ERR(pol); 354 retval = PTR_ERR(pol);
333 if (IS_ERR(pol)) 355 if (IS_ERR(pol))
334 goto fail_nomem_policy; 356 goto fail_nomem_policy;
335 vma_set_policy(tmp, pol); 357 vma_set_policy(tmp, pol);
358 if (anon_vma_fork(tmp, mpnt))
359 goto fail_nomem_anon_vma_fork;
336 tmp->vm_flags &= ~VM_LOCKED; 360 tmp->vm_flags &= ~VM_LOCKED;
337 tmp->vm_mm = mm; 361 tmp->vm_mm = mm;
338 tmp->vm_next = NULL; 362 tmp->vm_next = NULL;
339 anon_vma_link(tmp);
340 file = tmp->vm_file; 363 file = tmp->vm_file;
341 if (file) { 364 if (file) {
342 struct inode *inode = file->f_path.dentry->d_inode; 365 struct inode *inode = file->f_path.dentry->d_inode;
@@ -391,6 +414,8 @@ out:
391 flush_tlb_mm(oldmm); 414 flush_tlb_mm(oldmm);
392 up_write(&oldmm->mmap_sem); 415 up_write(&oldmm->mmap_sem);
393 return retval; 416 return retval;
417fail_nomem_anon_vma_fork:
418 mpol_put(pol);
394fail_nomem_policy: 419fail_nomem_policy:
395 kmem_cache_free(vm_area_cachep, tmp); 420 kmem_cache_free(vm_area_cachep, tmp);
396fail_nomem: 421fail_nomem:
@@ -454,8 +479,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
454 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; 479 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
455 mm->core_state = NULL; 480 mm->core_state = NULL;
456 mm->nr_ptes = 0; 481 mm->nr_ptes = 0;
457 set_mm_counter(mm, file_rss, 0); 482 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
458 set_mm_counter(mm, anon_rss, 0);
459 spin_lock_init(&mm->page_table_lock); 483 spin_lock_init(&mm->page_table_lock);
460 mm->free_area_cache = TASK_UNMAPPED_BASE; 484 mm->free_area_cache = TASK_UNMAPPED_BASE;
461 mm->cached_hole_size = ~0UL; 485 mm->cached_hole_size = ~0UL;
@@ -824,23 +848,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
824 */ 848 */
825static void posix_cpu_timers_init_group(struct signal_struct *sig) 849static void posix_cpu_timers_init_group(struct signal_struct *sig)
826{ 850{
851 unsigned long cpu_limit;
852
827 /* Thread group counters. */ 853 /* Thread group counters. */
828 thread_group_cputime_init(sig); 854 thread_group_cputime_init(sig);
829 855
830 /* Expiration times and increments. */ 856 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
831 sig->it[CPUCLOCK_PROF].expires = cputime_zero; 857 if (cpu_limit != RLIM_INFINITY) {
832 sig->it[CPUCLOCK_PROF].incr = cputime_zero; 858 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
833 sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
834 sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
835
836 /* Cached expiration times. */
837 sig->cputime_expires.prof_exp = cputime_zero;
838 sig->cputime_expires.virt_exp = cputime_zero;
839 sig->cputime_expires.sched_exp = 0;
840
841 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
842 sig->cputime_expires.prof_exp =
843 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
844 sig->cputimer.running = 1; 859 sig->cputimer.running = 1;
845 } 860 }
846 861
@@ -857,54 +872,30 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
857 if (clone_flags & CLONE_THREAD) 872 if (clone_flags & CLONE_THREAD)
858 return 0; 873 return 0;
859 874
860 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 875 sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
861 tsk->signal = sig; 876 tsk->signal = sig;
862 if (!sig) 877 if (!sig)
863 return -ENOMEM; 878 return -ENOMEM;
864 879
865 atomic_set(&sig->count, 1); 880 sig->nr_threads = 1;
866 atomic_set(&sig->live, 1); 881 atomic_set(&sig->live, 1);
882 atomic_set(&sig->sigcnt, 1);
867 init_waitqueue_head(&sig->wait_chldexit); 883 init_waitqueue_head(&sig->wait_chldexit);
868 sig->flags = 0;
869 if (clone_flags & CLONE_NEWPID) 884 if (clone_flags & CLONE_NEWPID)
870 sig->flags |= SIGNAL_UNKILLABLE; 885 sig->flags |= SIGNAL_UNKILLABLE;
871 sig->group_exit_code = 0;
872 sig->group_exit_task = NULL;
873 sig->group_stop_count = 0;
874 sig->curr_target = tsk; 886 sig->curr_target = tsk;
875 init_sigpending(&sig->shared_pending); 887 init_sigpending(&sig->shared_pending);
876 INIT_LIST_HEAD(&sig->posix_timers); 888 INIT_LIST_HEAD(&sig->posix_timers);
877 889
878 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 890 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
879 sig->it_real_incr.tv64 = 0;
880 sig->real_timer.function = it_real_fn; 891 sig->real_timer.function = it_real_fn;
881 892
882 sig->leader = 0; /* session leadership doesn't inherit */
883 sig->tty_old_pgrp = NULL;
884 sig->tty = NULL;
885
886 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
887 sig->gtime = cputime_zero;
888 sig->cgtime = cputime_zero;
889#ifndef CONFIG_VIRT_CPU_ACCOUNTING
890 sig->prev_utime = sig->prev_stime = cputime_zero;
891#endif
892 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
893 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
894 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
895 sig->maxrss = sig->cmaxrss = 0;
896 task_io_accounting_init(&sig->ioac);
897 sig->sum_sched_runtime = 0;
898 taskstats_tgid_init(sig);
899
900 task_lock(current->group_leader); 893 task_lock(current->group_leader);
901 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 894 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
902 task_unlock(current->group_leader); 895 task_unlock(current->group_leader);
903 896
904 posix_cpu_timers_init_group(sig); 897 posix_cpu_timers_init_group(sig);
905 898
906 acct_init_pacct(&sig->pacct);
907
908 tty_audit_fork(sig); 899 tty_audit_fork(sig);
909 900
910 sig->oom_adj = current->signal->oom_adj; 901 sig->oom_adj = current->signal->oom_adj;
@@ -912,13 +903,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
912 return 0; 903 return 0;
913} 904}
914 905
915void __cleanup_signal(struct signal_struct *sig)
916{
917 thread_group_cputime_free(sig);
918 tty_kref_put(sig->tty);
919 kmem_cache_free(signal_cachep, sig);
920}
921
922static void copy_flags(unsigned long clone_flags, struct task_struct *p) 906static void copy_flags(unsigned long clone_flags, struct task_struct *p)
923{ 907{
924 unsigned long new_flags = p->flags; 908 unsigned long new_flags = p->flags;
@@ -1033,7 +1017,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1033#endif 1017#endif
1034 retval = -EAGAIN; 1018 retval = -EAGAIN;
1035 if (atomic_read(&p->real_cred->user->processes) >= 1019 if (atomic_read(&p->real_cred->user->processes) >=
1036 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 1020 task_rlimit(p, RLIMIT_NPROC)) {
1037 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1021 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1038 p->real_cred->user != INIT_USER) 1022 p->real_cred->user != INIT_USER)
1039 goto bad_fork_free; 1023 goto bad_fork_free;
@@ -1075,6 +1059,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1075 p->prev_utime = cputime_zero; 1059 p->prev_utime = cputime_zero;
1076 p->prev_stime = cputime_zero; 1060 p->prev_stime = cputime_zero;
1077#endif 1061#endif
1062#if defined(SPLIT_RSS_COUNTING)
1063 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
1064#endif
1078 1065
1079 p->default_timer_slack_ns = current->timer_slack_ns; 1066 p->default_timer_slack_ns = current->timer_slack_ns;
1080 1067
@@ -1132,10 +1119,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1132 p->memcg_batch.memcg = NULL; 1119 p->memcg_batch.memcg = NULL;
1133#endif 1120#endif
1134 1121
1135 p->bts = NULL;
1136
1137 p->stack_start = stack_start;
1138
1139 /* Perform scheduler related setup. Assign this task to a CPU. */ 1122 /* Perform scheduler related setup. Assign this task to a CPU. */
1140 sched_fork(p, clone_flags); 1123 sched_fork(p, clone_flags);
1141 1124
@@ -1241,21 +1224,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1241 /* Need tasklist lock for parent etc handling! */ 1224 /* Need tasklist lock for parent etc handling! */
1242 write_lock_irq(&tasklist_lock); 1225 write_lock_irq(&tasklist_lock);
1243 1226
1244 /*
1245 * The task hasn't been attached yet, so its cpus_allowed mask will
1246 * not be changed, nor will its assigned CPU.
1247 *
1248 * The cpus_allowed mask of the parent may have changed after it was
1249 * copied first time - so re-copy it here, then check the child's CPU
1250 * to ensure it is on a valid CPU (and if not, just force it back to
1251 * parent's CPU). This avoids alot of nasty races.
1252 */
1253 p->cpus_allowed = current->cpus_allowed;
1254 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1255 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1256 !cpu_online(task_cpu(p))))
1257 set_task_cpu(p, smp_processor_id());
1258
1259 /* CLONE_PARENT re-uses the old parent */ 1227 /* CLONE_PARENT re-uses the old parent */
1260 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { 1228 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1261 p->real_parent = current->real_parent; 1229 p->real_parent = current->real_parent;
@@ -1284,8 +1252,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1284 } 1252 }
1285 1253
1286 if (clone_flags & CLONE_THREAD) { 1254 if (clone_flags & CLONE_THREAD) {
1287 atomic_inc(&current->signal->count); 1255 current->signal->nr_threads++;
1288 atomic_inc(&current->signal->live); 1256 atomic_inc(&current->signal->live);
1257 atomic_inc(&current->signal->sigcnt);
1289 p->group_leader = current->group_leader; 1258 p->group_leader = current->group_leader;
1290 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1259 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1291 } 1260 }
@@ -1298,7 +1267,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1298 p->nsproxy->pid_ns->child_reaper = p; 1267 p->nsproxy->pid_ns->child_reaper = p;
1299 1268
1300 p->signal->leader_pid = pid; 1269 p->signal->leader_pid = pid;
1301 tty_kref_put(p->signal->tty);
1302 p->signal->tty = tty_kref_get(current->signal->tty); 1270 p->signal->tty = tty_kref_get(current->signal->tty);
1303 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1271 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1304 attach_pid(p, PIDTYPE_SID, task_session(current)); 1272 attach_pid(p, PIDTYPE_SID, task_session(current));
@@ -1331,7 +1299,7 @@ bad_fork_cleanup_mm:
1331 mmput(p->mm); 1299 mmput(p->mm);
1332bad_fork_cleanup_signal: 1300bad_fork_cleanup_signal:
1333 if (!(clone_flags & CLONE_THREAD)) 1301 if (!(clone_flags & CLONE_THREAD))
1334 __cleanup_signal(p->signal); 1302 free_signal_struct(p->signal);
1335bad_fork_cleanup_sighand: 1303bad_fork_cleanup_sighand:
1336 __cleanup_sighand(p->sighand); 1304 __cleanup_sighand(p->sighand);
1337bad_fork_cleanup_fs: 1305bad_fork_cleanup_fs:
@@ -1366,6 +1334,16 @@ noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_re
1366 return regs; 1334 return regs;
1367} 1335}
1368 1336
1337static inline void init_idle_pids(struct pid_link *links)
1338{
1339 enum pid_type type;
1340
1341 for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
1342 INIT_HLIST_NODE(&links[type].node); /* not really needed */
1343 links[type].pid = &init_struct_pid;
1344 }
1345}
1346
1369struct task_struct * __cpuinit fork_idle(int cpu) 1347struct task_struct * __cpuinit fork_idle(int cpu)
1370{ 1348{
1371 struct task_struct *task; 1349 struct task_struct *task;
@@ -1373,8 +1351,10 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1373 1351
1374 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, 1352 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1375 &init_struct_pid, 0); 1353 &init_struct_pid, 0);
1376 if (!IS_ERR(task)) 1354 if (!IS_ERR(task)) {
1355 init_idle_pids(task->pids);
1377 init_idle(task, cpu); 1356 init_idle(task, cpu);
1357 }
1378 1358
1379 return task; 1359 return task;
1380} 1360}
@@ -1546,14 +1526,6 @@ static void check_unshare_flags(unsigned long *flags_ptr)
1546 *flags_ptr |= CLONE_SIGHAND; 1526 *flags_ptr |= CLONE_SIGHAND;
1547 1527
1548 /* 1528 /*
1549 * If unsharing signal handlers and the task was created
1550 * using CLONE_THREAD, then must unshare the thread
1551 */
1552 if ((*flags_ptr & CLONE_SIGHAND) &&
1553 (atomic_read(&current->signal->count) > 1))
1554 *flags_ptr |= CLONE_THREAD;
1555
1556 /*
1557 * If unsharing namespace, must also unshare filesystem information. 1529 * If unsharing namespace, must also unshare filesystem information.
1558 */ 1530 */
1559 if (*flags_ptr & CLONE_NEWNS) 1531 if (*flags_ptr & CLONE_NEWNS)
diff --git a/kernel/futex.c b/kernel/futex.c
index d9b3a2228f9d..6a3a5fa1526d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -429,20 +429,11 @@ static void free_pi_state(struct futex_pi_state *pi_state)
429static struct task_struct * futex_find_get_task(pid_t pid) 429static struct task_struct * futex_find_get_task(pid_t pid)
430{ 430{
431 struct task_struct *p; 431 struct task_struct *p;
432 const struct cred *cred = current_cred(), *pcred;
433 432
434 rcu_read_lock(); 433 rcu_read_lock();
435 p = find_task_by_vpid(pid); 434 p = find_task_by_vpid(pid);
436 if (!p) { 435 if (p)
437 p = ERR_PTR(-ESRCH); 436 get_task_struct(p);
438 } else {
439 pcred = __task_cred(p);
440 if (cred->euid != pcred->euid &&
441 cred->euid != pcred->uid)
442 p = ERR_PTR(-ESRCH);
443 else
444 get_task_struct(p);
445 }
446 437
447 rcu_read_unlock(); 438 rcu_read_unlock();
448 439
@@ -530,8 +521,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
530 return -EINVAL; 521 return -EINVAL;
531 522
532 WARN_ON(!atomic_read(&pi_state->refcount)); 523 WARN_ON(!atomic_read(&pi_state->refcount));
533 WARN_ON(pid && pi_state->owner && 524
534 pi_state->owner->pid != pid); 525 /*
526 * When pi_state->owner is NULL then the owner died
527 * and another waiter is on the fly. pi_state->owner
528 * is fixed up by the task which acquires
529 * pi_state->rt_mutex.
530 *
531 * We do not check for pid == 0 which can happen when
532 * the owner died and robust_list_exit() cleared the
533 * TID.
534 */
535 if (pid && pi_state->owner) {
536 /*
537 * Bail out if user space manipulated the
538 * futex value.
539 */
540 if (pid != task_pid_vnr(pi_state->owner))
541 return -EINVAL;
542 }
535 543
536 atomic_inc(&pi_state->refcount); 544 atomic_inc(&pi_state->refcount);
537 *ps = pi_state; 545 *ps = pi_state;
@@ -547,8 +555,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
547 if (!pid) 555 if (!pid)
548 return -ESRCH; 556 return -ESRCH;
549 p = futex_find_get_task(pid); 557 p = futex_find_get_task(pid);
550 if (IS_ERR(p)) 558 if (!p)
551 return PTR_ERR(p); 559 return -ESRCH;
552 560
553 /* 561 /*
554 * We need to look at the task state flags to figure out, 562 * We need to look at the task state flags to figure out,
@@ -758,6 +766,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
758 if (!pi_state) 766 if (!pi_state)
759 return -EINVAL; 767 return -EINVAL;
760 768
769 /*
770 * If current does not own the pi_state then the futex is
771 * inconsistent and user space fiddled with the futex value.
772 */
773 if (pi_state->owner != current)
774 return -EINVAL;
775
761 raw_spin_lock(&pi_state->pi_mutex.wait_lock); 776 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
762 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 777 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
763 778
@@ -1971,7 +1986,7 @@ retry_private:
1971 /* Unqueue and drop the lock */ 1986 /* Unqueue and drop the lock */
1972 unqueue_me_pi(&q); 1987 unqueue_me_pi(&q);
1973 1988
1974 goto out; 1989 goto out_put_key;
1975 1990
1976out_unlock_put_key: 1991out_unlock_put_key:
1977 queue_unlock(&q, hb); 1992 queue_unlock(&q, hb);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 235716556bf1..d49afb2395e5 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -146,7 +146,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
146 struct task_struct *p; 146 struct task_struct *p;
147 147
148 ret = -ESRCH; 148 ret = -ESRCH;
149 read_lock(&tasklist_lock); 149 rcu_read_lock();
150 p = find_task_by_vpid(pid); 150 p = find_task_by_vpid(pid);
151 if (!p) 151 if (!p)
152 goto err_unlock; 152 goto err_unlock;
@@ -157,7 +157,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
157 !capable(CAP_SYS_PTRACE)) 157 !capable(CAP_SYS_PTRACE))
158 goto err_unlock; 158 goto err_unlock;
159 head = p->compat_robust_list; 159 head = p->compat_robust_list;
160 read_unlock(&tasklist_lock); 160 rcu_read_unlock();
161 } 161 }
162 162
163 if (put_user(sizeof(*head), len_ptr)) 163 if (put_user(sizeof(*head), len_ptr))
@@ -165,7 +165,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
165 return put_user(ptr_to_compat(head), head_ptr); 165 return put_user(ptr_to_compat(head), head_ptr);
166 166
167err_unlock: 167err_unlock:
168 read_unlock(&tasklist_lock); 168 rcu_read_unlock();
169 169
170 return ret; 170 return ret;
171} 171}
diff --git a/kernel/groups.c b/kernel/groups.c
index 2b45b2ee3964..53b1916c9492 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -164,12 +164,6 @@ int groups_search(const struct group_info *group_info, gid_t grp)
164 */ 164 */
165int set_groups(struct cred *new, struct group_info *group_info) 165int set_groups(struct cred *new, struct group_info *group_info)
166{ 166{
167 int retval;
168
169 retval = security_task_setgroups(group_info);
170 if (retval)
171 return retval;
172
173 put_group_info(new->group_info); 167 put_group_info(new->group_info);
174 groups_sort(group_info); 168 groups_sort(group_info);
175 get_group_info(group_info); 169 get_group_info(group_info);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0086628b6e97..5c69e996bd0f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -89,7 +89,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
89 89
90 do { 90 do {
91 seq = read_seqbegin(&xtime_lock); 91 seq = read_seqbegin(&xtime_lock);
92 xts = current_kernel_time(); 92 xts = __current_kernel_time();
93 tom = wall_to_monotonic; 93 tom = wall_to_monotonic;
94 } while (read_seqretry(&xtime_lock, seq)); 94 } while (read_seqretry(&xtime_lock, seq));
95 95
@@ -1749,35 +1749,15 @@ void __init hrtimers_init(void)
1749} 1749}
1750 1750
1751/** 1751/**
1752 * schedule_hrtimeout_range - sleep until timeout 1752 * schedule_hrtimeout_range_clock - sleep until timeout
1753 * @expires: timeout value (ktime_t) 1753 * @expires: timeout value (ktime_t)
1754 * @delta: slack in expires timeout (ktime_t) 1754 * @delta: slack in expires timeout (ktime_t)
1755 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL 1755 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1756 * 1756 * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
1757 * Make the current task sleep until the given expiry time has
1758 * elapsed. The routine will return immediately unless
1759 * the current task state has been set (see set_current_state()).
1760 *
1761 * The @delta argument gives the kernel the freedom to schedule the
1762 * actual wakeup to a time that is both power and performance friendly.
1763 * The kernel give the normal best effort behavior for "@expires+@delta",
1764 * but may decide to fire the timer earlier, but no earlier than @expires.
1765 *
1766 * You can set the task state as follows -
1767 *
1768 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1769 * pass before the routine returns.
1770 *
1771 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1772 * delivered to the current task.
1773 *
1774 * The current task state is guaranteed to be TASK_RUNNING when this
1775 * routine returns.
1776 *
1777 * Returns 0 when the timer has expired otherwise -EINTR
1778 */ 1757 */
1779int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, 1758int __sched
1780 const enum hrtimer_mode mode) 1759schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1760 const enum hrtimer_mode mode, int clock)
1781{ 1761{
1782 struct hrtimer_sleeper t; 1762 struct hrtimer_sleeper t;
1783 1763
@@ -1799,7 +1779,7 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1799 return -EINTR; 1779 return -EINTR;
1800 } 1780 }
1801 1781
1802 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); 1782 hrtimer_init_on_stack(&t.timer, clock, mode);
1803 hrtimer_set_expires_range_ns(&t.timer, *expires, delta); 1783 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
1804 1784
1805 hrtimer_init_sleeper(&t, current); 1785 hrtimer_init_sleeper(&t, current);
@@ -1818,6 +1798,41 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1818 1798
1819 return !t.task ? 0 : -EINTR; 1799 return !t.task ? 0 : -EINTR;
1820} 1800}
1801
1802/**
1803 * schedule_hrtimeout_range - sleep until timeout
1804 * @expires: timeout value (ktime_t)
1805 * @delta: slack in expires timeout (ktime_t)
1806 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1807 *
1808 * Make the current task sleep until the given expiry time has
1809 * elapsed. The routine will return immediately unless
1810 * the current task state has been set (see set_current_state()).
1811 *
1812 * The @delta argument gives the kernel the freedom to schedule the
1813 * actual wakeup to a time that is both power and performance friendly.
1814 * The kernel give the normal best effort behavior for "@expires+@delta",
1815 * but may decide to fire the timer earlier, but no earlier than @expires.
1816 *
1817 * You can set the task state as follows -
1818 *
1819 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1820 * pass before the routine returns.
1821 *
1822 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1823 * delivered to the current task.
1824 *
1825 * The current task state is guaranteed to be TASK_RUNNING when this
1826 * routine returns.
1827 *
1828 * Returns 0 when the timer has expired otherwise -EINTR
1829 */
1830int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1831 const enum hrtimer_mode mode)
1832{
1833 return schedule_hrtimeout_range_clock(expires, delta, mode,
1834 CLOCK_MONOTONIC);
1835}
1821EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); 1836EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
1822 1837
1823/** 1838/**
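
With the clock factored out into a parameter, schedule_hrtimeout_range() becomes a one-line CLOCK_MONOTONIC wrapper, and a CLOCK_REALTIME sleeper can be built the same way. A sketch, assuming the new helper's prototype is visible through linux/hrtimer.h; the wrapper name is hypothetical.

#include <linux/hrtimer.h>
#include <linux/sched.h>
#include <linux/time.h>

/* Sleep until *expires on CLOCK_REALTIME, allowing the usual slack in delta. */
static int __sched sleep_until_realtime(ktime_t *expires, unsigned long delta)
{
	set_current_state(TASK_INTERRUPTIBLE);
	return schedule_hrtimeout_range_clock(expires, delta,
					      HRTIMER_MODE_ABS, CLOCK_REALTIME);
}
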
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 50dbd5999588..7a56b22e0602 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,23 +40,29 @@
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/slab.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/smp.h> 45#include <linux/smp.h>
45 46
46#include <linux/hw_breakpoint.h> 47#include <linux/hw_breakpoint.h>
47 48
49
48/* 50/*
49 * Constraints data 51 * Constraints data
50 */ 52 */
51 53
52/* Number of pinned cpu breakpoints in a cpu */ 54/* Number of pinned cpu breakpoints in a cpu */
53static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); 55static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
54 56
55/* Number of pinned task breakpoints in a cpu */ 57/* Number of pinned task breakpoints in a cpu */
56static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); 58static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
57 59
58/* Number of non-pinned cpu/task breakpoints in a cpu */ 60/* Number of non-pinned cpu/task breakpoints in a cpu */
59static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); 61static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
62
63static int nr_slots[TYPE_MAX];
64
65static int constraints_initialized;
60 66
61/* Gather the number of total pinned and un-pinned bp in a cpuset */ 67/* Gather the number of total pinned and un-pinned bp in a cpuset */
62struct bp_busy_slots { 68struct bp_busy_slots {
@@ -67,16 +73,29 @@ struct bp_busy_slots {
67/* Serialize accesses to the above constraints */ 73/* Serialize accesses to the above constraints */
68static DEFINE_MUTEX(nr_bp_mutex); 74static DEFINE_MUTEX(nr_bp_mutex);
69 75
76__weak int hw_breakpoint_weight(struct perf_event *bp)
77{
78 return 1;
79}
80
81static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
82{
83 if (bp->attr.bp_type & HW_BREAKPOINT_RW)
84 return TYPE_DATA;
85
86 return TYPE_INST;
87}
88
70/* 89/*
71 * Report the maximum number of pinned breakpoints a task 90 * Report the maximum number of pinned breakpoints a task
72 * have in this cpu 91 * have in this cpu
73 */ 92 */
74static unsigned int max_task_bp_pinned(int cpu) 93static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
75{ 94{
76 int i; 95 int i;
77 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); 96 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
78 97
79 for (i = HBP_NUM -1; i >= 0; i--) { 98 for (i = nr_slots[type] - 1; i >= 0; i--) {
80 if (tsk_pinned[i] > 0) 99 if (tsk_pinned[i] > 0)
81 return i + 1; 100 return i + 1;
82 } 101 }
@@ -84,7 +103,7 @@ static unsigned int max_task_bp_pinned(int cpu)
84 return 0; 103 return 0;
85} 104}
86 105
87static int task_bp_pinned(struct task_struct *tsk) 106static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type)
88{ 107{
89 struct perf_event_context *ctx = tsk->perf_event_ctxp; 108 struct perf_event_context *ctx = tsk->perf_event_ctxp;
90 struct list_head *list; 109 struct list_head *list;
@@ -105,7 +124,8 @@ static int task_bp_pinned(struct task_struct *tsk)
105 */ 124 */
106 list_for_each_entry(bp, list, event_entry) { 125 list_for_each_entry(bp, list, event_entry) {
107 if (bp->attr.type == PERF_TYPE_BREAKPOINT) 126 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
108 count++; 127 if (find_slot_idx(bp) == type)
128 count += hw_breakpoint_weight(bp);
109 } 129 }
110 130
111 raw_spin_unlock_irqrestore(&ctx->lock, flags); 131 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -118,18 +138,19 @@ static int task_bp_pinned(struct task_struct *tsk)
118 * a given cpu (cpu > -1) or in all of them (cpu = -1). 138 * a given cpu (cpu > -1) or in all of them (cpu = -1).
119 */ 139 */
120static void 140static void
121fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) 141fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
142 enum bp_type_idx type)
122{ 143{
123 int cpu = bp->cpu; 144 int cpu = bp->cpu;
124 struct task_struct *tsk = bp->ctx->task; 145 struct task_struct *tsk = bp->ctx->task;
125 146
126 if (cpu >= 0) { 147 if (cpu >= 0) {
127 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); 148 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
128 if (!tsk) 149 if (!tsk)
129 slots->pinned += max_task_bp_pinned(cpu); 150 slots->pinned += max_task_bp_pinned(cpu, type);
130 else 151 else
131 slots->pinned += task_bp_pinned(tsk); 152 slots->pinned += task_bp_pinned(tsk, type);
132 slots->flexible = per_cpu(nr_bp_flexible, cpu); 153 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
133 154
134 return; 155 return;
135 } 156 }
@@ -137,16 +158,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
137 for_each_online_cpu(cpu) { 158 for_each_online_cpu(cpu) {
138 unsigned int nr; 159 unsigned int nr;
139 160
140 nr = per_cpu(nr_cpu_bp_pinned, cpu); 161 nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
141 if (!tsk) 162 if (!tsk)
142 nr += max_task_bp_pinned(cpu); 163 nr += max_task_bp_pinned(cpu, type);
143 else 164 else
144 nr += task_bp_pinned(tsk); 165 nr += task_bp_pinned(tsk, type);
145 166
146 if (nr > slots->pinned) 167 if (nr > slots->pinned)
147 slots->pinned = nr; 168 slots->pinned = nr;
148 169
149 nr = per_cpu(nr_bp_flexible, cpu); 170 nr = per_cpu(nr_bp_flexible[type], cpu);
150 171
151 if (nr > slots->flexible) 172 if (nr > slots->flexible)
152 slots->flexible = nr; 173 slots->flexible = nr;
@@ -154,31 +175,49 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
154} 175}
155 176
156/* 177/*
178 * For now, continue to consider flexible as pinned, until we can
179 * ensure no flexible event can ever be scheduled before a pinned event
180 * on the same cpu.
181 */
182static void
183fetch_this_slot(struct bp_busy_slots *slots, int weight)
184{
185 slots->pinned += weight;
186}
187
188/*
157 * Add a pinned breakpoint for the given task in our constraint table 189 * Add a pinned breakpoint for the given task in our constraint table
158 */ 190 */
159static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) 191static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
192 enum bp_type_idx type, int weight)
160{ 193{
161 unsigned int *tsk_pinned; 194 unsigned int *tsk_pinned;
162 int count = 0; 195 int old_count = 0;
196 int old_idx = 0;
197 int idx = 0;
163 198
164 count = task_bp_pinned(tsk); 199 old_count = task_bp_pinned(tsk, type);
200 old_idx = old_count - 1;
201 idx = old_idx + weight;
165 202
166 tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); 203 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
167 if (enable) { 204 if (enable) {
168 tsk_pinned[count]++; 205 tsk_pinned[idx]++;
169 if (count > 0) 206 if (old_count > 0)
170 tsk_pinned[count-1]--; 207 tsk_pinned[old_idx]--;
171 } else { 208 } else {
172 tsk_pinned[count]--; 209 tsk_pinned[idx]--;
173 if (count > 0) 210 if (old_count > 0)
174 tsk_pinned[count-1]++; 211 tsk_pinned[old_idx]++;
175 } 212 }
176} 213}
177 214
178/* 215/*
179 * Add/remove the given breakpoint in our constraint table 216 * Add/remove the given breakpoint in our constraint table
180 */ 217 */
181static void toggle_bp_slot(struct perf_event *bp, bool enable) 218static void
219toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
220 int weight)
182{ 221{
183 int cpu = bp->cpu; 222 int cpu = bp->cpu;
184 struct task_struct *tsk = bp->ctx->task; 223 struct task_struct *tsk = bp->ctx->task;
@@ -186,20 +225,20 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
186 /* Pinned counter task profiling */ 225 /* Pinned counter task profiling */
187 if (tsk) { 226 if (tsk) {
188 if (cpu >= 0) { 227 if (cpu >= 0) {
189 toggle_bp_task_slot(tsk, cpu, enable); 228 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
190 return; 229 return;
191 } 230 }
192 231
193 for_each_online_cpu(cpu) 232 for_each_online_cpu(cpu)
194 toggle_bp_task_slot(tsk, cpu, enable); 233 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
195 return; 234 return;
196 } 235 }
197 236
198 /* Pinned counter cpu profiling */ 237 /* Pinned counter cpu profiling */
199 if (enable) 238 if (enable)
200 per_cpu(nr_cpu_bp_pinned, bp->cpu)++; 239 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
201 else 240 else
202 per_cpu(nr_cpu_bp_pinned, bp->cpu)--; 241 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
203} 242}
204 243
205/* 244/*
@@ -243,38 +282,112 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
243 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) 282 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
244 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM 283 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
245 */ 284 */
246int reserve_bp_slot(struct perf_event *bp) 285static int __reserve_bp_slot(struct perf_event *bp)
247{ 286{
248 struct bp_busy_slots slots = {0}; 287 struct bp_busy_slots slots = {0};
249 int ret = 0; 288 enum bp_type_idx type;
289 int weight;
250 290
251 mutex_lock(&nr_bp_mutex); 291 /* We couldn't initialize breakpoint constraints on boot */
292 if (!constraints_initialized)
293 return -ENOMEM;
252 294
253 fetch_bp_busy_slots(&slots, bp); 295 /* Basic checks */
296 if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
297 bp->attr.bp_type == HW_BREAKPOINT_INVALID)
298 return -EINVAL;
299
300 type = find_slot_idx(bp);
301 weight = hw_breakpoint_weight(bp);
302
303 fetch_bp_busy_slots(&slots, bp, type);
304 fetch_this_slot(&slots, weight);
254 305
255 /* Flexible counters need to keep at least one slot */ 306 /* Flexible counters need to keep at least one slot */
256 if (slots.pinned + (!!slots.flexible) == HBP_NUM) { 307 if (slots.pinned + (!!slots.flexible) > nr_slots[type])
257 ret = -ENOSPC; 308 return -ENOSPC;
258 goto end;
259 }
260 309
261 toggle_bp_slot(bp, true); 310 toggle_bp_slot(bp, true, type, weight);
311
312 return 0;
313}
314
315int reserve_bp_slot(struct perf_event *bp)
316{
317 int ret;
318
319 mutex_lock(&nr_bp_mutex);
320
321 ret = __reserve_bp_slot(bp);
262 322
263end:
264 mutex_unlock(&nr_bp_mutex); 323 mutex_unlock(&nr_bp_mutex);
265 324
266 return ret; 325 return ret;
267} 326}
268 327
328static void __release_bp_slot(struct perf_event *bp)
329{
330 enum bp_type_idx type;
331 int weight;
332
333 type = find_slot_idx(bp);
334 weight = hw_breakpoint_weight(bp);
335 toggle_bp_slot(bp, false, type, weight);
336}
337
269void release_bp_slot(struct perf_event *bp) 338void release_bp_slot(struct perf_event *bp)
270{ 339{
271 mutex_lock(&nr_bp_mutex); 340 mutex_lock(&nr_bp_mutex);
272 341
273 toggle_bp_slot(bp, false); 342 __release_bp_slot(bp);
274 343
275 mutex_unlock(&nr_bp_mutex); 344 mutex_unlock(&nr_bp_mutex);
276} 345}
277 346
347/*
348 * Allow the kernel debugger to reserve breakpoint slots without
349 * taking a lock using the dbg_* variant of for the reserve and
350 * release breakpoint slots.
351 */
352int dbg_reserve_bp_slot(struct perf_event *bp)
353{
354 if (mutex_is_locked(&nr_bp_mutex))
355 return -1;
356
357 return __reserve_bp_slot(bp);
358}
359
360int dbg_release_bp_slot(struct perf_event *bp)
361{
362 if (mutex_is_locked(&nr_bp_mutex))
363 return -1;
364
365 __release_bp_slot(bp);
366
367 return 0;
368}
369
370static int validate_hw_breakpoint(struct perf_event *bp)
371{
372 int ret;
373
374 ret = arch_validate_hwbkpt_settings(bp);
375 if (ret)
376 return ret;
377
378 if (arch_check_bp_in_kernelspace(bp)) {
379 if (bp->attr.exclude_kernel)
380 return -EINVAL;
381 /*
382 * Don't let unprivileged users set a breakpoint in the trap
383 * path to avoid trap recursion attacks.
384 */
385 if (!capable(CAP_SYS_ADMIN))
386 return -EPERM;
387 }
388
389 return 0;
390}
278 391
279int register_perf_hw_breakpoint(struct perf_event *bp) 392int register_perf_hw_breakpoint(struct perf_event *bp)
280{ 393{
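
The dbg_* entry points above exist so that kgdb/kdb, which may stop the machine at an arbitrary point, can account for a breakpoint slot without sleeping: if nr_bp_mutex happens to be held they simply refuse. A sketch of a debugger-context caller, assuming the prototypes are exported through linux/hw_breakpoint.h; the function names and the -EBUSY retry policy are illustrative.

#include <linux/errno.h>
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>

/* bp is an already-initialized breakpoint event owned by the debugger. */
static int debugger_claim_slot(struct perf_event *bp)
{
	if (dbg_reserve_bp_slot(bp))
		return -EBUSY;	/* nr_bp_mutex is held elsewhere; caller may retry */

	/* ... arch code would now program the debug registers ... */
	return 0;
}

static void debugger_drop_slot(struct perf_event *bp)
{
	/* ... arch code clears the debug registers first ... */
	dbg_release_bp_slot(bp);
}
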
@@ -284,17 +397,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
284 if (ret) 397 if (ret)
285 return ret; 398 return ret;
286 399
287 /* 400 ret = validate_hw_breakpoint(bp);
288 * Ptrace breakpoints can be temporary perf events only 401
289 * meant to reserve a slot. In this case, it is created disabled and 402 /* if arch_validate_hwbkpt_settings() fails then release bp slot */
290 * we don't want to check the params right now (as we put a null addr) 403 if (ret)
291 * But perf tools create events as disabled and we want to check 404 release_bp_slot(bp);
292 * the params for them.
293 * This is a quick hack that will be removed soon, once we remove
294 * the tmp breakpoints from ptrace
295 */
296 if (!bp->attr.disabled || !bp->overflow_handler)
297 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
298 405
299 return ret; 406 return ret;
300} 407}
@@ -324,8 +431,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
324int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) 431int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
325{ 432{
326 u64 old_addr = bp->attr.bp_addr; 433 u64 old_addr = bp->attr.bp_addr;
434 u64 old_len = bp->attr.bp_len;
327 int old_type = bp->attr.bp_type; 435 int old_type = bp->attr.bp_type;
328 int old_len = bp->attr.bp_len;
329 int err = 0; 436 int err = 0;
330 437
331 perf_event_disable(bp); 438 perf_event_disable(bp);
@@ -337,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
337 if (attr->disabled) 444 if (attr->disabled)
338 goto end; 445 goto end;
339 446
340 err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); 447 err = validate_hw_breakpoint(bp);
341 if (!err) 448 if (!err)
342 perf_event_enable(bp); 449 perf_event_enable(bp);
343 450
@@ -377,17 +484,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
377 * 484 *
378 * @return a set of per_cpu pointers to perf events 485 * @return a set of per_cpu pointers to perf events
379 */ 486 */
380struct perf_event ** 487struct perf_event * __percpu *
381register_wide_hw_breakpoint(struct perf_event_attr *attr, 488register_wide_hw_breakpoint(struct perf_event_attr *attr,
382 perf_overflow_handler_t triggered) 489 perf_overflow_handler_t triggered)
383{ 490{
384 struct perf_event **cpu_events, **pevent, *bp; 491 struct perf_event * __percpu *cpu_events, **pevent, *bp;
385 long err; 492 long err;
386 int cpu; 493 int cpu;
387 494
388 cpu_events = alloc_percpu(typeof(*cpu_events)); 495 cpu_events = alloc_percpu(typeof(*cpu_events));
389 if (!cpu_events) 496 if (!cpu_events)
390 return ERR_PTR(-ENOMEM); 497 return (void __percpu __force *)ERR_PTR(-ENOMEM);
391 498
392 get_online_cpus(); 499 get_online_cpus();
393 for_each_online_cpu(cpu) { 500 for_each_online_cpu(cpu) {
@@ -415,7 +522,7 @@ fail:
415 put_online_cpus(); 522 put_online_cpus();
416 523
417 free_percpu(cpu_events); 524 free_percpu(cpu_events);
418 return ERR_PTR(err); 525 return (void __percpu __force *)ERR_PTR(err);
419} 526}
420EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 527EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
421 528
@@ -423,7 +530,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
423 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel 530 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
424 * @cpu_events: the per cpu set of events to unregister 531 * @cpu_events: the per cpu set of events to unregister
425 */ 532 */
426void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) 533void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
427{ 534{
428 int cpu; 535 int cpu;
429 struct perf_event **pevent; 536 struct perf_event **pevent;
@@ -444,7 +551,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
444 551
445static int __init init_hw_breakpoint(void) 552static int __init init_hw_breakpoint(void)
446{ 553{
554 unsigned int **task_bp_pinned;
555 int cpu, err_cpu;
556 int i;
557
558 for (i = 0; i < TYPE_MAX; i++)
559 nr_slots[i] = hw_breakpoint_slots(i);
560
561 for_each_possible_cpu(cpu) {
562 for (i = 0; i < TYPE_MAX; i++) {
563 task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu);
564 *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i],
565 GFP_KERNEL);
566 if (!*task_bp_pinned)
567 goto err_alloc;
568 }
569 }
570
571 constraints_initialized = 1;
572
447 return register_die_notifier(&hw_breakpoint_exceptions_nb); 573 return register_die_notifier(&hw_breakpoint_exceptions_nb);
574
575 err_alloc:
576 for_each_possible_cpu(err_cpu) {
577 if (err_cpu == cpu)
578 break;
579 for (i = 0; i < TYPE_MAX; i++)
580 kfree(per_cpu(nr_task_bp_pinned[i], cpu));
581 }
582
583 return -ENOMEM;
448} 584}
449core_initcall(init_hw_breakpoint); 585core_initcall(init_hw_breakpoint);
450 586
@@ -453,5 +589,4 @@ struct pmu perf_ops_bp = {
453 .enable = arch_install_hw_breakpoint, 589 .enable = arch_install_hw_breakpoint,
454 .disable = arch_uninstall_hw_breakpoint, 590 .disable = arch_uninstall_hw_breakpoint,
455 .read = hw_breakpoint_pmu_read, 591 .read = hw_breakpoint_pmu_read,
456 .unthrottle = hw_breakpoint_pmu_unthrottle
457}; 592};
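
Because register_wide_hw_breakpoint() now returns a __percpu pointer, callers have to force-cast before the IS_ERR()/PTR_ERR() checks, mirroring what the error paths above do. A hedged caller sketch; the watched address, length and overflow handler are placeholders.

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <linux/percpu.h>
#include <linux/err.h>

static struct perf_event * __percpu *wide_bp;

static int install_wide_write_bp(unsigned long addr,
				 perf_overflow_handler_t handler)
{
	struct perf_event_attr attr;

	hw_breakpoint_init(&attr);
	attr.bp_addr = addr;			/* address to watch (placeholder) */
	attr.bp_len  = HW_BREAKPOINT_LEN_4;
	attr.bp_type = HW_BREAKPOINT_W;

	wide_bp = register_wide_hw_breakpoint(&attr, handler);
	if (IS_ERR((void __force *)wide_bp))
		return PTR_ERR((void __force *)wide_bp);

	return 0;
}

static void remove_wide_write_bp(void)
{
	unregister_wide_hw_breakpoint(wide_bp);
}
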
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ecc3fa28f666..b7091d5ca2f8 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,11 +18,7 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22 * dynamic_irq_init - initialize a dynamically allocated irq
23 * @irq: irq number to initialize
24 */
25void dynamic_irq_init(unsigned int irq)
26{ 22{
27 struct irq_desc *desc; 23 struct irq_desc *desc;
28 unsigned long flags; 24 unsigned long flags;
@@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq)
41 desc->depth = 1; 37 desc->depth = 1;
42 desc->msi_desc = NULL; 38 desc->msi_desc = NULL;
43 desc->handler_data = NULL; 39 desc->handler_data = NULL;
44 desc->chip_data = NULL; 40 if (!keep_chip_data)
41 desc->chip_data = NULL;
45 desc->action = NULL; 42 desc->action = NULL;
46 desc->irq_count = 0; 43 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 44 desc->irqs_unhandled = 0;
@@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq)
55} 52}
56 53
57/** 54/**
58 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 55 * dynamic_irq_init - initialize a dynamically allocated irq
59 * @irq: irq number to initialize 56 * @irq: irq number to initialize
60 */ 57 */
61void dynamic_irq_cleanup(unsigned int irq) 58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
62{ 75{
63 struct irq_desc *desc = irq_to_desc(irq); 76 struct irq_desc *desc = irq_to_desc(irq);
64 unsigned long flags; 77 unsigned long flags;
@@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq)
77 } 90 }
78 desc->msi_desc = NULL; 91 desc->msi_desc = NULL;
79 desc->handler_data = NULL; 92 desc->handler_data = NULL;
80 desc->chip_data = NULL; 93 if (!keep_chip_data)
94 desc->chip_data = NULL;
81 desc->handle_irq = handle_bad_irq; 95 desc->handle_irq = handle_bad_irq;
82 desc->chip = &no_irq_chip; 96 desc->chip = &no_irq_chip;
83 desc->name = NULL; 97 desc->name = NULL;
@@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq)
85 raw_spin_unlock_irqrestore(&desc->lock, flags); 99 raw_spin_unlock_irqrestore(&desc->lock, flags);
86} 100}
87 101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to initialize
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to initialize
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
88 122
89/** 123/**
90 * set_irq_chip - set the irq chip for an irq 124 * set_irq_chip - set the irq chip for an irq
@@ -325,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
325 if (desc->chip->ack) 359 if (desc->chip->ack)
326 desc->chip->ack(irq); 360 desc->chip->ack(irq);
327 } 361 }
362 desc->status |= IRQ_MASKED;
363}
364
365static inline void mask_irq(struct irq_desc *desc, int irq)
366{
367 if (desc->chip->mask) {
368 desc->chip->mask(irq);
369 desc->status |= IRQ_MASKED;
370 }
371}
372
373static inline void unmask_irq(struct irq_desc *desc, int irq)
374{
375 if (desc->chip->unmask) {
376 desc->chip->unmask(irq);
377 desc->status &= ~IRQ_MASKED;
378 }
328} 379}
329 380
330/* 381/*
@@ -450,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
450 raw_spin_lock(&desc->lock); 501 raw_spin_lock(&desc->lock);
451 desc->status &= ~IRQ_INPROGRESS; 502 desc->status &= ~IRQ_INPROGRESS;
452 503
453 if (unlikely(desc->status & IRQ_ONESHOT)) 504 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
454 desc->status |= IRQ_MASKED; 505 unmask_irq(desc, irq);
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
456 desc->chip->unmask(irq);
457out_unlock: 506out_unlock:
458 raw_spin_unlock(&desc->lock); 507 raw_spin_unlock(&desc->lock);
459} 508}
@@ -490,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
490 action = desc->action; 539 action = desc->action;
491 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 540 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
492 desc->status |= IRQ_PENDING; 541 desc->status |= IRQ_PENDING;
493 if (desc->chip->mask) 542 mask_irq(desc, irq);
494 desc->chip->mask(irq);
495 goto out; 543 goto out;
496 } 544 }
497 545
@@ -520,7 +568,7 @@ out:
520 * signal. The occurence is latched into the irq controller hardware 568 * signal. The occurence is latched into the irq controller hardware
521 * and must be acked in order to be reenabled. After the ack another 569 * and must be acked in order to be reenabled. After the ack another
522 * interrupt can happen on the same source even before the first one 570 * interrupt can happen on the same source even before the first one
523 * is handled by the assosiacted event handler. If this happens it 571 * is handled by the associated event handler. If this happens it
524 * might be necessary to disable (mask) the interrupt depending on the 572 * might be necessary to disable (mask) the interrupt depending on the
525 * controller hardware. This requires to reenable the interrupt inside 573 * controller hardware. This requires to reenable the interrupt inside
526 * of the loop which handles the interrupts which have arrived while 574 * of the loop which handles the interrupts which have arrived while
@@ -559,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
559 irqreturn_t action_ret; 607 irqreturn_t action_ret;
560 608
561 if (unlikely(!action)) { 609 if (unlikely(!action)) {
562 desc->chip->mask(irq); 610 mask_irq(desc, irq);
563 goto out_unlock; 611 goto out_unlock;
564 } 612 }
565 613
@@ -571,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
571 if (unlikely((desc->status & 619 if (unlikely((desc->status &
572 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 620 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
573 (IRQ_PENDING | IRQ_MASKED))) { 621 (IRQ_PENDING | IRQ_MASKED))) {
574 desc->chip->unmask(irq); 622 unmask_irq(desc, irq);
575 desc->status &= ~IRQ_MASKED;
576 } 623 }
577 624
578 desc->status &= ~IRQ_PENDING; 625 desc->status &= ~IRQ_PENDING;
@@ -682,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
682 __set_irq_handler(irq, handle, 0, name); 729 __set_irq_handler(irq, handle, 0, name);
683} 730}
684 731
685void __init set_irq_noprobe(unsigned int irq) 732void set_irq_noprobe(unsigned int irq)
686{ 733{
687 struct irq_desc *desc = irq_to_desc(irq); 734 struct irq_desc *desc = irq_to_desc(irq);
688 unsigned long flags; 735 unsigned long flags;
@@ -697,7 +744,7 @@ void __init set_irq_noprobe(unsigned int irq)
697 raw_spin_unlock_irqrestore(&desc->lock, flags); 744 raw_spin_unlock_irqrestore(&desc->lock, flags);
698} 745}
699 746
700void __init set_irq_probe(unsigned int irq) 747void set_irq_probe(unsigned int irq)
701{ 748{
702 struct irq_desc *desc = irq_to_desc(irq); 749 struct irq_desc *desc = irq_to_desc(irq);
703 unsigned long flags; 750 unsigned long flags;
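
The *_keep_chip_data variants above exist for code (x86 MSI/ioapic descriptor handling, for instance) that re-initializes an irq descriptor but must not lose the chip_data pointer the architecture parked there. A purely illustrative sketch of that call pattern; recycle_irq() is hypothetical.

#include <linux/irq.h>

/* Re-init a descriptor without losing irq_to_desc(irq)->chip_data. */
static void recycle_irq(unsigned int irq)
{
	dynamic_irq_cleanup_keep_chip_data(irq);	/* tear down, keep chip_data */

	/* ... reprogram or re-route the interrupt here ... */

	dynamic_irq_init_keep_chip_data(irq);		/* fresh state, chip_data intact */
}
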
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d06df9c41cba..1ef4ffcdfa55 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
42 * automatically freed on driver detach. 42 * automatically freed on driver detach.
43 * 43 *
44 * If an IRQ allocated with this function needs to be freed 44 * If an IRQ allocated with this function needs to be freed
45 * separately, dev_free_irq() must be used. 45 * separately, devm_free_irq() must be used.
46 */ 46 */
47int devm_request_threaded_irq(struct device *dev, unsigned int irq, 47int devm_request_threaded_irq(struct device *dev, unsigned int irq,
48 irq_handler_t handler, irq_handler_t thread_fn, 48 irq_handler_t handler, irq_handler_t thread_fn,
@@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
81 * Except for the extra @dev argument, this function takes the 81 * Except for the extra @dev argument, this function takes the
82 * same arguments and performs the same function as free_irq(). 82 * same arguments and performs the same function as free_irq().
83 * This function instead of free_irq() should be used to manually 83 * This function instead of free_irq() should be used to manually
84 * free IRQs allocated with dev_request_irq(). 84 * free IRQs allocated with devm_request_irq().
85 */ 85 */
86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) 86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
87{ 87{
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 814940e7f485..27e5c6911223 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -19,7 +19,7 @@
19#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 20#include <linux/rculist.h>
21#include <linux/hash.h> 21#include <linux/hash.h>
22#include <linux/bootmem.h> 22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 23#include <trace/events/irq.h>
24 24
25#include "internals.h" 25#include "internals.h"
@@ -87,12 +87,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{ 87{
88 void *ptr; 88 void *ptr;
89 89
90 if (slab_is_available()) 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), 91 GFP_ATOMIC, node);
92 GFP_ATOMIC, node);
93 else
94 ptr = alloc_bootmem_node(NODE_DATA(node),
95 nr * sizeof(*desc->kstat_irqs));
96 92
97 /* 93 /*
98 * don't overwite if can not get new one 94 * don't overwite if can not get new one
@@ -132,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
132 */ 128 */
133DEFINE_RAW_SPINLOCK(sparse_irq_lock); 129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
134 130
135struct irq_desc **irq_desc_ptrs __read_mostly; 131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
136 151
137static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { 152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
138 [0 ... NR_IRQS_LEGACY-1] = { 153 [0 ... NR_IRQS_LEGACY-1] = {
@@ -164,9 +179,6 @@ int __init early_irq_init(void)
164 legacy_count = ARRAY_SIZE(irq_desc_legacy); 179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
165 node = first_online_node; 180 node = first_online_node;
166 181
167 /* allocate irq_desc_ptrs array based on nr_irqs */
168 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
169
170 /* allocate based on nr_cpu_ids */ 182 /* allocate based on nr_cpu_ids */
171 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * 183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
172 sizeof(int), GFP_NOWAIT, node); 184 sizeof(int), GFP_NOWAIT, node);
@@ -180,23 +192,12 @@ int __init early_irq_init(void)
180 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
181 alloc_desc_masks(&desc[i], node, true); 193 alloc_desc_masks(&desc[i], node, true);
182 init_desc_masks(&desc[i]); 194 init_desc_masks(&desc[i]);
183 irq_desc_ptrs[i] = desc + i; 195 set_irq_desc(i, &desc[i]);
184 } 196 }
185 197
186 for (i = legacy_count; i < nr_irqs; i++)
187 irq_desc_ptrs[i] = NULL;
188
189 return arch_early_irq_init(); 198 return arch_early_irq_init();
190} 199}
191 200
192struct irq_desc *irq_to_desc(unsigned int irq)
193{
194 if (irq_desc_ptrs && irq < nr_irqs)
195 return irq_desc_ptrs[irq];
196
197 return NULL;
198}
199
200struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) 201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
201{ 202{
202 struct irq_desc *desc; 203 struct irq_desc *desc;
@@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
208 return NULL; 209 return NULL;
209 } 210 }
210 211
211 desc = irq_desc_ptrs[irq]; 212 desc = irq_to_desc(irq);
212 if (desc) 213 if (desc)
213 return desc; 214 return desc;
214 215
215 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
216 217
217 /* We have to check it to avoid races with another CPU */ 218 /* We have to check it to avoid races with another CPU */
218 desc = irq_desc_ptrs[irq]; 219 desc = irq_to_desc(irq);
219 if (desc) 220 if (desc)
220 goto out_unlock; 221 goto out_unlock;
221 222
222 if (slab_is_available()) 223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224 else
225 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
226 224
227 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); 225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
228 if (!desc) { 226 if (!desc) {
@@ -231,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
231 } 229 }
232 init_one_irq_desc(irq, desc, node); 230 init_one_irq_desc(irq, desc, node);
233 231
234 irq_desc_ptrs[irq] = desc; 232 set_irq_desc(irq, desc);
235 233
236out_unlock: 234out_unlock:
237 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
@@ -372,9 +370,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
372 irqreturn_t ret, retval = IRQ_NONE; 370 irqreturn_t ret, retval = IRQ_NONE;
373 unsigned int status = 0; 371 unsigned int status = 0;
374 372
375 if (!(action->flags & IRQF_DISABLED))
376 local_irq_enable_in_hardirq();
377
378 do { 373 do {
379 trace_irq_handler_entry(irq, action); 374 trace_irq_handler_entry(irq, action);
380 ret = action->handler(irq, action->dev_id); 375 ret = action->handler(irq, action->dev_id);
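
The fixed irq_desc_ptrs[] array becomes a radix tree, so sparse irq numbers no longer cost a pointer slot each. Reduced to its three calls, the generic pattern looks like the sketch below; my_tree and the helper names are placeholders, the radix-tree API is the real one.

#include <linux/radix-tree.h>

static RADIX_TREE(my_tree, GFP_ATOMIC);		/* empty tree, atomic allocations */

static int store_item(unsigned int index, void *item)
{
	return radix_tree_insert(&my_tree, index, item);
}

static void *find_item(unsigned int index)
{
	return radix_tree_lookup(&my_tree, index);	/* NULL if nothing stored */
}

static void swap_item(unsigned int index, void *new_item)
{
	void **slot = radix_tree_lookup_slot(&my_tree, index);

	if (slot)
		radix_tree_replace_slot(slot, new_item);
}
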
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b2821f070a3d..c63f3bc88f0b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24void replace_irq_desc(unsigned int irq, struct irq_desc *desc);
25extern struct irq_desc **irq_desc_ptrs;
26#else
27/* irq_desc_ptrs is a fixed size array */
28extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
29#endif 25#endif
30 26
31#ifdef CONFIG_PROC_FS 27#ifdef CONFIG_PROC_FS
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index eb6078ca60c7..e1497481fe8a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -138,6 +138,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
138 return 0; 138 return 0;
139} 139}
140 140
141int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
142{
143 struct irq_desc *desc = irq_to_desc(irq);
144 unsigned long flags;
145
146 if (!desc)
147 return -EINVAL;
148
149 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->affinity_hint = m;
151 raw_spin_unlock_irqrestore(&desc->lock, flags);
152
153 return 0;
154}
155EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
156
141#ifndef CONFIG_AUTO_IRQ_AFFINITY 157#ifndef CONFIG_AUTO_IRQ_AFFINITY
142/* 158/*
143 * Generic version of the affinity autoselector. 159 * Generic version of the affinity autoselector.
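
irq_set_affinity_hint() lets a driver publish the CPU mask it would like the interrupt steered to; user space can read it back through the /proc file added later in this same series. A hedged driver-side sketch with illustrative names; note that the __free_irq() hunk further down WARNs if the hint is still set when the irq is freed.

#include <linux/interrupt.h>
#include <linux/cpumask.h>

static int my_dev_setup_irq(unsigned int irq, irq_handler_t handler,
			    const struct cpumask *preferred, void *dev)
{
	int ret;

	ret = request_irq(irq, handler, 0, "my-dev", dev);
	if (ret)
		return ret;

	irq_set_affinity_hint(irq, preferred);	/* advisory only; nothing is migrated */
	return 0;
}

static void my_dev_teardown_irq(unsigned int irq, void *dev)
{
	irq_set_affinity_hint(irq, NULL);	/* must be cleared before free_irq() */
	free_irq(irq, dev);
}
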
@@ -382,6 +398,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
382{ 398{
383 struct irq_desc *desc = irq_to_desc(irq); 399 struct irq_desc *desc = irq_to_desc(irq);
384 struct irqaction *action; 400 struct irqaction *action;
401 unsigned long flags;
385 402
386 if (!desc) 403 if (!desc)
387 return 0; 404 return 0;
@@ -389,11 +406,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
389 if (desc->status & IRQ_NOREQUEST) 406 if (desc->status & IRQ_NOREQUEST)
390 return 0; 407 return 0;
391 408
409 raw_spin_lock_irqsave(&desc->lock, flags);
392 action = desc->action; 410 action = desc->action;
393 if (action) 411 if (action)
394 if (irqflags & action->flags & IRQF_SHARED) 412 if (irqflags & action->flags & IRQF_SHARED)
395 action = NULL; 413 action = NULL;
396 414
415 raw_spin_unlock_irqrestore(&desc->lock, flags);
416
397 return !action; 417 return !action;
398} 418}
399 419
@@ -436,6 +456,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
436 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ 456 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
437 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); 457 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
438 desc->status |= flags; 458 desc->status |= flags;
459
460 if (chip != desc->chip)
461 irq_chip_set_defaults(desc->chip);
439 } 462 }
440 463
441 return ret; 464 return ret;
@@ -483,8 +506,26 @@ static int irq_wait_for_interrupt(struct irqaction *action)
483 */ 506 */
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 507static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{ 508{
509again:
486 chip_bus_lock(irq, desc); 510 chip_bus_lock(irq, desc);
487 raw_spin_lock_irq(&desc->lock); 511 raw_spin_lock_irq(&desc->lock);
512
513 /*
514 * Implausible though it may be we need to protect us against
515 * the following scenario:
516 *
517 * The thread is faster done than the hard interrupt handler
518 * on the other CPU. If we unmask the irq line then the
519 * interrupt can come in again and masks the line, leaves due
520 * to IRQ_INPROGRESS and the irq line is masked forever.
521 */
522 if (unlikely(desc->status & IRQ_INPROGRESS)) {
523 raw_spin_unlock_irq(&desc->lock);
524 chip_bus_sync_unlock(irq, desc);
525 cpu_relax();
526 goto again;
527 }
528
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 529 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED; 530 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq); 531 desc->chip->unmask(irq);
@@ -884,6 +925,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
884 desc->chip->disable(irq); 925 desc->chip->disable(irq);
885 } 926 }
886 927
928#ifdef CONFIG_SMP
929 /* make sure affinity_hint is cleaned up */
930 if (WARN_ON_ONCE(desc->affinity_hint))
931 desc->affinity_hint = NULL;
932#endif
933
887 raw_spin_unlock_irqrestore(&desc->lock, flags); 934 raw_spin_unlock_irqrestore(&desc->lock, flags);
888 935
889 unregister_handler_proc(irq, action); 936 unregister_handler_proc(irq, action);
@@ -995,7 +1042,6 @@ EXPORT_SYMBOL(free_irq);
995 * Flags: 1042 * Flags:
996 * 1043 *
997 * IRQF_SHARED Interrupt is shared 1044 * IRQF_SHARED Interrupt is shared
998 * IRQF_DISABLED Disable local interrupts while processing
999 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy 1045 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
1000 * IRQF_TRIGGER_* Specify active edge(s) or level 1046 * IRQF_TRIGGER_* Specify active edge(s) or level
1001 * 1047 *
@@ -1009,25 +1055,6 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1009 int retval; 1055 int retval;
1010 1056
1011 /* 1057 /*
1012 * handle_IRQ_event() always ignores IRQF_DISABLED except for
1013 * the _first_ irqaction (sigh). That can cause oopsing, but
1014 * the behavior is classified as "will not fix" so we need to
1015 * start nudging drivers away from using that idiom.
1016 */
1017 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) ==
1018 (IRQF_SHARED|IRQF_DISABLED)) {
1019 pr_warning(
1020 "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n",
1021 irq, devname);
1022 }
1023
1024#ifdef CONFIG_LOCKDEP
1025 /*
1026 * Lockdep wants atomic interrupt handlers:
1027 */
1028 irqflags |= IRQF_DISABLED;
1029#endif
1030 /*
1031 * Sanity-check: shared interrupts must pass in a real dev-ID, 1058 * Sanity-check: shared interrupts must pass in a real dev-ID,
1032 * otherwise we'll have trouble later trying to figure out 1059 * otherwise we'll have trouble later trying to figure out
1033 * which interrupt is which (messes up the interrupt freeing 1060 * which interrupt is which (messes up the interrupt freeing
@@ -1088,3 +1115,40 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1088 return retval; 1115 return retval;
1089} 1116}
1090EXPORT_SYMBOL(request_threaded_irq); 1117EXPORT_SYMBOL(request_threaded_irq);
1118
1119/**
1120 * request_any_context_irq - allocate an interrupt line
1121 * @irq: Interrupt line to allocate
1122 * @handler: Function to be called when the IRQ occurs.
1123 * Threaded handler for threaded interrupts.
1124 * @flags: Interrupt type flags
1125 * @name: An ascii name for the claiming device
1126 * @dev_id: A cookie passed back to the handler function
1127 *
1128 * This call allocates interrupt resources and enables the
1129 * interrupt line and IRQ handling. It selects either a
1130 * hardirq or threaded handling method depending on the
1131 * context.
1132 *
1133 * On failure, it returns a negative value. On success,
1134 * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
1135 */
1136int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1137 unsigned long flags, const char *name, void *dev_id)
1138{
1139 struct irq_desc *desc = irq_to_desc(irq);
1140 int ret;
1141
1142 if (!desc)
1143 return -EINVAL;
1144
1145 if (desc->status & IRQ_NESTED_THREAD) {
1146 ret = request_threaded_irq(irq, NULL, handler,
1147 flags, name, dev_id);
1148 return !ret ? IRQC_IS_NESTED : ret;
1149 }
1150
1151 ret = request_irq(irq, handler, flags, name, dev_id);
1152 return !ret ? IRQC_IS_HARDIRQ : ret;
1153}
1154EXPORT_SYMBOL_GPL(request_any_context_irq);
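
request_any_context_irq() picks threaded handling automatically when the parent irq chip is a nested (slow-bus) one, so a driver whose device may hang off either kind of controller needs only one call site. A sketch of such a caller; the device name, handler body and trigger flags are illustrative, IRQC_IS_HARDIRQ/IRQC_IS_NESTED are the documented return values.

#include <linux/kernel.h>
#include <linux/interrupt.h>

static irqreturn_t my_button_handler(int irq, void *dev_id)
{
	/* runs in hardirq or in a nested thread, depending on the parent chip */
	return IRQ_HANDLED;
}

static int my_button_request(unsigned int irq, void *dev)
{
	int ret;

	ret = request_any_context_irq(irq, my_button_handler,
				      IRQF_TRIGGER_FALLING, "my-button", dev);
	if (ret < 0)
		return ret;		/* request failed */

	if (ret == IRQC_IS_NESTED)
		pr_info("my-button: irq %u is handled in a nested thread\n", irq);

	return 0;
}
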
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 26bac9d8f860..65d3845665ac 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -6,6 +6,7 @@
6 */ 6 */
7 7
8#include <linux/irq.h> 8#include <linux/irq.h>
9#include <linux/slab.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/random.h> 11#include <linux/random.h>
11#include <linux/interrupt.h> 12#include <linux/interrupt.h>
@@ -70,7 +71,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
70 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 71 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 72
72 /* We have to check it to avoid races with another CPU */ 73 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 74 desc = irq_to_desc(irq);
74 75
75 if (desc && old_desc != desc) 76 if (desc && old_desc != desc)
76 goto out_unlock; 77 goto out_unlock;
@@ -90,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
90 goto out_unlock; 91 goto out_unlock;
91 } 92 }
92 93
93 irq_desc_ptrs[irq] = desc; 94 replace_irq_desc(irq, desc);
94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 95 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 96
96 /* free the old one */ 97 /* free the old one */
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6f50eccc79c0..09a2ee540bd2 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/gfp.h>
10#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
11#include <linux/seq_file.h> 12#include <linux/seq_file.h>
12#include <linux/interrupt.h> 13#include <linux/interrupt.h>
@@ -31,6 +32,27 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
31 return 0; 32 return 0;
32} 33}
33 34
35static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
36{
37 struct irq_desc *desc = irq_to_desc((long)m->private);
38 unsigned long flags;
39 cpumask_var_t mask;
40
41 if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
42 return -ENOMEM;
43
44 raw_spin_lock_irqsave(&desc->lock, flags);
45 if (desc->affinity_hint)
46 cpumask_copy(mask, desc->affinity_hint);
47 raw_spin_unlock_irqrestore(&desc->lock, flags);
48
49 seq_cpumask(m, mask);
50 seq_putc(m, '\n');
51 free_cpumask_var(mask);
52
53 return 0;
54}
55
34#ifndef is_affinity_mask_valid 56#ifndef is_affinity_mask_valid
35#define is_affinity_mask_valid(val) 1 57#define is_affinity_mask_valid(val) 1
36#endif 58#endif
@@ -83,6 +105,11 @@ static int irq_affinity_proc_open(struct inode *inode, struct file *file)
83 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 105 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
84} 106}
85 107
108static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
109{
110 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
111}
112
86static const struct file_operations irq_affinity_proc_fops = { 113static const struct file_operations irq_affinity_proc_fops = {
87 .open = irq_affinity_proc_open, 114 .open = irq_affinity_proc_open,
88 .read = seq_read, 115 .read = seq_read,
@@ -91,6 +118,13 @@ static const struct file_operations irq_affinity_proc_fops = {
91 .write = irq_affinity_proc_write, 118 .write = irq_affinity_proc_write,
92}; 119};
93 120
121static const struct file_operations irq_affinity_hint_proc_fops = {
122 .open = irq_affinity_hint_proc_open,
123 .read = seq_read,
124 .llseek = seq_lseek,
125 .release = single_release,
126};
127
94static int default_affinity_show(struct seq_file *m, void *v) 128static int default_affinity_show(struct seq_file *m, void *v)
95{ 129{
96 seq_cpumask(m, irq_default_affinity); 130 seq_cpumask(m, irq_default_affinity);
@@ -146,6 +180,26 @@ static const struct file_operations default_affinity_proc_fops = {
146 .release = single_release, 180 .release = single_release,
147 .write = default_affinity_write, 181 .write = default_affinity_write,
148}; 182};
183
184static int irq_node_proc_show(struct seq_file *m, void *v)
185{
186 struct irq_desc *desc = irq_to_desc((long) m->private);
187
188 seq_printf(m, "%d\n", desc->node);
189 return 0;
190}
191
192static int irq_node_proc_open(struct inode *inode, struct file *file)
193{
194 return single_open(file, irq_node_proc_show, PDE(inode)->data);
195}
196
197static const struct file_operations irq_node_proc_fops = {
198 .open = irq_node_proc_open,
199 .read = seq_read,
200 .llseek = seq_lseek,
201 .release = single_release,
202};
149#endif 203#endif
150 204
151static int irq_spurious_proc_show(struct seq_file *m, void *v) 205static int irq_spurious_proc_show(struct seq_file *m, void *v)
@@ -230,6 +284,13 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
230 /* create /proc/irq/<irq>/smp_affinity */ 284 /* create /proc/irq/<irq>/smp_affinity */
231 proc_create_data("smp_affinity", 0600, desc->dir, 285 proc_create_data("smp_affinity", 0600, desc->dir,
232 &irq_affinity_proc_fops, (void *)(long)irq); 286 &irq_affinity_proc_fops, (void *)(long)irq);
287
288 /* create /proc/irq/<irq>/affinity_hint */
289 proc_create_data("affinity_hint", 0400, desc->dir,
290 &irq_affinity_hint_proc_fops, (void *)(long)irq);
291
292 proc_create_data("node", 0444, desc->dir,
293 &irq_node_proc_fops, (void *)(long)irq);
233#endif 294#endif
234 295
235 proc_create_data("spurious", 0444, desc->dir, 296 proc_create_data("spurious", 0444, desc->dir,
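
From user space the two new files simply read back as cpumask and node-number text. A small, purely illustrative userspace C snippet that dumps them:

#include <stdio.h>

static void dump_irq_info(int irq)
{
	const char *names[] = { "affinity_hint", "node" };
	char path[64], line[256];

	for (int i = 0; i < 2; i++) {
		FILE *f;

		snprintf(path, sizeof(path), "/proc/irq/%d/%s", irq, names[i]);
		f = fopen(path, "r");
		if (!f)
			continue;	/* file absent on older kernels or !SMP builds */
		if (fgets(line, sizeof(line), f))
			printf("irq %d %s: %s", irq, names[i], line);
		fclose(f);
	}
}
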
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8e5288a8a355..6f6d091b5757 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -16,11 +16,13 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/kdb.h>
19#include <linux/err.h> 20#include <linux/err.h>
20#include <linux/proc_fs.h> 21#include <linux/proc_fs.h>
21#include <linux/sched.h> /* for cond_resched */ 22#include <linux/sched.h> /* for cond_resched */
22#include <linux/mm.h> 23#include <linux/mm.h>
23#include <linux/ctype.h> 24#include <linux/ctype.h>
25#include <linux/slab.h>
24 26
25#include <asm/sections.h> 27#include <asm/sections.h>
26 28
@@ -515,6 +517,26 @@ static int kallsyms_open(struct inode *inode, struct file *file)
515 return ret; 517 return ret;
516} 518}
517 519
520#ifdef CONFIG_KGDB_KDB
521const char *kdb_walk_kallsyms(loff_t *pos)
522{
523 static struct kallsym_iter kdb_walk_kallsyms_iter;
524 if (*pos == 0) {
525 memset(&kdb_walk_kallsyms_iter, 0,
526 sizeof(kdb_walk_kallsyms_iter));
527 reset_iter(&kdb_walk_kallsyms_iter, 0);
528 }
529 while (1) {
530 if (!update_iter(&kdb_walk_kallsyms_iter, *pos))
531 return NULL;
532 ++*pos;
533 /* Some debugging symbols have no name. Ignore them. */
534 if (kdb_walk_kallsyms_iter.name[0])
535 return kdb_walk_kallsyms_iter.name;
536 }
537}
538#endif /* CONFIG_KGDB_KDB */
539
518static const struct file_operations kallsyms_operations = { 540static const struct file_operations kallsyms_operations = {
519 .open = kallsyms_open, 541 .open = kallsyms_open,
520 .read = seq_read, 542 .read = seq_read,
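
kdb_walk_kallsyms() is a cursor-style iterator: start with *pos == 0 and keep calling until it returns NULL, getting one non-empty symbol name per call. A sketch of a consumer; the function lives behind kdb's private header, so the extern declaration below stands in for its real prototype.

#include <linux/types.h>

/* Normally provided by kdb's private header; repeated here only for the sketch. */
extern const char *kdb_walk_kallsyms(loff_t *pos);

static unsigned long count_kernel_symbols(void)
{
	unsigned long count = 0;
	loff_t pos = 0;

	while (kdb_walk_kallsyms(&pos))
		count++;

	return count;
}
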
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ef077fb73155..131b1703936f 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -41,7 +41,7 @@
41#include <asm/sections.h> 41#include <asm/sections.h>
42 42
43/* Per cpu memory for storing cpu states in case of system crash. */ 43/* Per cpu memory for storing cpu states in case of system crash. */
44note_buf_t* crash_notes; 44note_buf_t __percpu *crash_notes;
45 45
46/* vmcoreinfo stuff */ 46/* vmcoreinfo stuff */
47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
@@ -1089,9 +1089,10 @@ void crash_kexec(struct pt_regs *regs)
1089 1089
1090size_t crash_get_memory_size(void) 1090size_t crash_get_memory_size(void)
1091{ 1091{
1092 size_t size; 1092 size_t size = 0;
1093 mutex_lock(&kexec_mutex); 1093 mutex_lock(&kexec_mutex);
1094 size = crashk_res.end - crashk_res.start + 1; 1094 if (crashk_res.end != crashk_res.start)
1095 size = crashk_res.end - crashk_res.start + 1;
1095 mutex_unlock(&kexec_mutex); 1096 mutex_unlock(&kexec_mutex);
1096 return size; 1097 return size;
1097} 1098}
@@ -1134,11 +1135,9 @@ int crash_shrink_memory(unsigned long new_size)
1134 1135
1135 free_reserved_phys_range(end, crashk_res.end); 1136 free_reserved_phys_range(end, crashk_res.end);
1136 1137
1137 if (start == end) { 1138 if ((start == end) && (crashk_res.parent != NULL))
1138 crashk_res.end = end;
1139 release_resource(&crashk_res); 1139 release_resource(&crashk_res);
1140 } else 1140 crashk_res.end = end - 1;
1141 crashk_res.end = end - 1;
1142 1141
1143unlock: 1142unlock:
1144 mutex_unlock(&kexec_mutex); 1143 mutex_unlock(&kexec_mutex);
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 32c5c15d750d..35edbe22e9a9 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
80 80
81 buffer = kmalloc(size, gfp_mask); 81 buffer = kmalloc(size, gfp_mask);
82 if (!buffer) { 82 if (!buffer) {
83 _kfifo_init(fifo, 0, 0); 83 _kfifo_init(fifo, NULL, 0);
84 return -ENOMEM; 84 return -ENOMEM;
85 } 85 }
86 86
@@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc);
97void kfifo_free(struct kfifo *fifo) 97void kfifo_free(struct kfifo *fifo)
98{ 98{
99 kfree(fifo->buffer); 99 kfree(fifo->buffer);
100 _kfifo_init(fifo, NULL, 0);
100} 101}
101EXPORT_SYMBOL(kfifo_free); 102EXPORT_SYMBOL(kfifo_free);
102 103
@@ -349,6 +350,7 @@ EXPORT_SYMBOL(__kfifo_from_user_n);
349 * @fifo: the fifo to be used. 350 * @fifo: the fifo to be used.
350 * @from: pointer to the data to be added. 351 * @from: pointer to the data to be added.
351 * @len: the length of the data to be added. 352 * @len: the length of the data to be added.
353 * @total: the actual returned data length.
352 * 354 *
353 * This function copies at most @len bytes from the @from into the 355 * This function copies at most @len bytes from the @from into the
354 * FIFO depending and returns -EFAULT/0. 356 * FIFO depending and returns -EFAULT/0.
@@ -399,7 +401,7 @@ EXPORT_SYMBOL(__kfifo_to_user_n);
399 * @fifo: the fifo to be used. 401 * @fifo: the fifo to be used.
400 * @to: where the data must be copied. 402 * @to: where the data must be copied.
401 * @len: the size of the destination buffer. 403 * @len: the size of the destination buffer.
402 @ @lenout: pointer to output variable with copied data 404 * @lenout: pointer to output variable with copied data
403 * 405 *
404 * This function copies at most @len bytes from the FIFO into the 406 * This function copies at most @len bytes from the FIFO into the
405 * @to buffer and 0 or -EFAULT. 407 * @to buffer and 0 or -EFAULT.
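
The kfifo_free() fix above leaves a freed fifo in the same zeroed state as one whose allocation failed, so a stale buffer pointer cannot be reused by accident. A short lifecycle sketch with this era's struct-based API; the 128-byte size and the payload are arbitrary.

#include <linux/kfifo.h>
#include <linux/slab.h>

static int kfifo_demo(void)
{
	struct kfifo fifo;
	unsigned char out[4];
	int ret;

	ret = kfifo_alloc(&fifo, 128, GFP_KERNEL);	/* size is rounded up to a power of two */
	if (ret)
		return ret;	/* on failure the fifo is left as (NULL, 0) */

	kfifo_in(&fifo, "abcd", 4);			/* returns bytes actually copied in */
	kfifo_out(&fifo, out, sizeof(out));		/* returns bytes actually copied out */

	kfifo_free(&fifo);	/* now also re-inits the fifo to (NULL, 0) */
	return 0;
}
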
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
deleted file mode 100644
index 2eb517e23514..000000000000
--- a/kernel/kgdb.c
+++ /dev/null
@@ -1,1760 +0,0 @@
1/*
2 * KGDB stub.
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2008 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h>
32#include <linux/interrupt.h>
33#include <linux/spinlock.h>
34#include <linux/console.h>
35#include <linux/threads.h>
36#include <linux/uaccess.h>
37#include <linux/kernel.h>
38#include <linux/module.h>
39#include <linux/ptrace.h>
40#include <linux/reboot.h>
41#include <linux/string.h>
42#include <linux/delay.h>
43#include <linux/sched.h>
44#include <linux/sysrq.h>
45#include <linux/init.h>
46#include <linux/kgdb.h>
47#include <linux/pid.h>
48#include <linux/smp.h>
49#include <linux/mm.h>
50
51#include <asm/cacheflush.h>
52#include <asm/byteorder.h>
53#include <asm/atomic.h>
54#include <asm/system.h>
55#include <asm/unaligned.h>
56
57static int kgdb_break_asap;
58
59#define KGDB_MAX_THREAD_QUERY 17
60struct kgdb_state {
61 int ex_vector;
62 int signo;
63 int err_code;
64 int cpu;
65 int pass_exception;
66 unsigned long thr_query;
67 unsigned long threadid;
68 long kgdb_usethreadid;
69 struct pt_regs *linux_regs;
70};
71
72static struct debuggerinfo_struct {
73 void *debuggerinfo;
74 struct task_struct *task;
75} kgdb_info[NR_CPUS];
76
77/**
78 * kgdb_connected - Is a host GDB connected to us?
79 */
80int kgdb_connected;
81EXPORT_SYMBOL_GPL(kgdb_connected);
82
83/* All the KGDB handlers are installed */
84static int kgdb_io_module_registered;
85
86/* Guard for recursive entry */
87static int exception_level;
88
89static struct kgdb_io *kgdb_io_ops;
90static DEFINE_SPINLOCK(kgdb_registration_lock);
91
92/* kgdb console driver is loaded */
93static int kgdb_con_registered;
94/* determine if kgdb console output should be used */
95static int kgdb_use_con;
96
97static int __init opt_kgdb_con(char *str)
98{
99 kgdb_use_con = 1;
100 return 0;
101}
102
103early_param("kgdbcon", opt_kgdb_con);
104
105module_param(kgdb_use_con, int, 0644);
106
107/*
108 * Holds information about breakpoints in a kernel. These breakpoints are
109 * added and removed by gdb.
110 */
111static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
112 [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
113};
114
115/*
116 * The CPU# of the active CPU, or -1 if none:
117 */
118atomic_t kgdb_active = ATOMIC_INIT(-1);
119
120/*
121 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
122 * bootup code (which might not have percpu set up yet):
123 */
124static atomic_t passive_cpu_wait[NR_CPUS];
125static atomic_t cpu_in_kgdb[NR_CPUS];
126atomic_t kgdb_setting_breakpoint;
127
128struct task_struct *kgdb_usethread;
129struct task_struct *kgdb_contthread;
130
131int kgdb_single_step;
132pid_t kgdb_sstep_pid;
133
134/* Our I/O buffers. */
135static char remcom_in_buffer[BUFMAX];
136static char remcom_out_buffer[BUFMAX];
137
138/* Storage for the registers, in GDB format. */
139static unsigned long gdb_regs[(NUMREGBYTES +
140 sizeof(unsigned long) - 1) /
141 sizeof(unsigned long)];
142
143/* to keep track of the CPU which is doing the single stepping*/
144atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
145
146/*
147 * If you are debugging a problem where roundup (the collection of
148 * all other CPUs) itself causes trouble [this should be extremely rare],
149 * then use the nokgdbroundup option to avoid roundup. In that case
150 * the other CPUs might interfere with your debugging context, so
151 * use this with care:
152 */
153static int kgdb_do_roundup = 1;
154
155static int __init opt_nokgdbroundup(char *str)
156{
157 kgdb_do_roundup = 0;
158
159 return 0;
160}
161
162early_param("nokgdbroundup", opt_nokgdbroundup);
163
164/*
165 * Finally, some KGDB code :-)
166 */
167
168/*
169 * Weak aliases for breakpoint management,
170 * can be overridden by architectures when needed:
171 */
172int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
173{
174 int err;
175
176 err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
177 if (err)
178 return err;
179
180 return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
181 BREAK_INSTR_SIZE);
182}
183
184int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
185{
186 return probe_kernel_write((char *)addr,
187 (char *)bundle, BREAK_INSTR_SIZE);
188}
189
190int __weak kgdb_validate_break_address(unsigned long addr)
191{
192 char tmp_variable[BREAK_INSTR_SIZE];
193 int err;
194 /* Validate setting the breakpoint and then removing it. If the
195 * remove fails, the kernel needs to emit a bad message because we
196 * are in deep trouble, not being able to put things back the way we
197 * found them.
198 */
199 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
200 if (err)
201 return err;
202 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
203 if (err)
204 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
205 "memory destroyed at: %lx", addr);
206 return err;
207}
208
209unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
210{
211 return instruction_pointer(regs);
212}
213
214int __weak kgdb_arch_init(void)
215{
216 return 0;
217}
218
219int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
220{
221 return 0;
222}
223
224void __weak
225kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
226{
227 return;
228}
229
230/**
231 * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb.
232 * @regs: Current &struct pt_regs.
233 *
234 * This function will be called if the particular architecture must
235 * disable hardware debugging while it is processing gdb packets or
236 * handling an exception.
237 */
238void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
239{
240}
241
242/*
243 * GDB remote protocol parser:
244 */
245
246static int hex(char ch)
247{
248 if ((ch >= 'a') && (ch <= 'f'))
249 return ch - 'a' + 10;
250 if ((ch >= '0') && (ch <= '9'))
251 return ch - '0';
252 if ((ch >= 'A') && (ch <= 'F'))
253 return ch - 'A' + 10;
254 return -1;
255}
256
257/* scan for the sequence $<data>#<checksum> */
258static void get_packet(char *buffer)
259{
260 unsigned char checksum;
261 unsigned char xmitcsum;
262 int count;
263 char ch;
264
265 do {
266 /*
267 * Spin and wait around for the start character, ignore all
268 * other characters:
269 */
270 while ((ch = (kgdb_io_ops->read_char())) != '$')
271 /* nothing */;
272
273 kgdb_connected = 1;
274 checksum = 0;
275 xmitcsum = -1;
276
277 count = 0;
278
279 /*
280 * now, read until a # or end of buffer is found:
281 */
282 while (count < (BUFMAX - 1)) {
283 ch = kgdb_io_ops->read_char();
284 if (ch == '#')
285 break;
286 checksum = checksum + ch;
287 buffer[count] = ch;
288 count = count + 1;
289 }
290 buffer[count] = 0;
291
292 if (ch == '#') {
293 xmitcsum = hex(kgdb_io_ops->read_char()) << 4;
294 xmitcsum += hex(kgdb_io_ops->read_char());
295
296 if (checksum != xmitcsum)
297 /* failed checksum */
298 kgdb_io_ops->write_char('-');
299 else
300 /* successful transfer */
301 kgdb_io_ops->write_char('+');
302 if (kgdb_io_ops->flush)
303 kgdb_io_ops->flush();
304 }
305 } while (checksum != xmitcsum);
306}
307
308/*
309 * Send the packet in buffer.
310 * Check for gdb connection if asked for.
311 */
312static void put_packet(char *buffer)
313{
314 unsigned char checksum;
315 int count;
316 char ch;
317
318 /*
319 * $<packet info>#<checksum>.
320 */
321 while (1) {
322 kgdb_io_ops->write_char('$');
323 checksum = 0;
324 count = 0;
325
326 while ((ch = buffer[count])) {
327 kgdb_io_ops->write_char(ch);
328 checksum += ch;
329 count++;
330 }
331
332 kgdb_io_ops->write_char('#');
333 kgdb_io_ops->write_char(hex_asc_hi(checksum));
334 kgdb_io_ops->write_char(hex_asc_lo(checksum));
335 if (kgdb_io_ops->flush)
336 kgdb_io_ops->flush();
337
338 /* Now see what we get in reply. */
339 ch = kgdb_io_ops->read_char();
340
341 if (ch == 3)
342 ch = kgdb_io_ops->read_char();
343
344 /* If we get an ACK, we are done. */
345 if (ch == '+')
346 return;
347
348 /*
349 * If we get the start of another packet, this means
350 * that GDB is attempting to reconnect. We will NAK
351 * the packet being sent, and stop trying to send this
352 * packet.
353 */
354 if (ch == '$') {
355 kgdb_io_ops->write_char('-');
356 if (kgdb_io_ops->flush)
357 kgdb_io_ops->flush();
358 return;
359 }
360 }
361}
362
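
get_packet()/put_packet() above implement the framing layer of the GDB remote serial protocol: every payload travels as $<data>#<checksum>, the checksum is the modulo-256 sum of the payload bytes written as two lowercase hex digits, and the receiver answers '+' (ACK) or '-' (NAK, resend). A minimal user-space sketch of the sender side (illustrative only; the example payloads are arbitrary):

#include <stdio.h>
#include <string.h>

/* Frame a GDB remote-protocol payload as $<data>#<checksum>. */
static void put_gdb_packet(const char *payload, char *out, size_t outlen)
{
	unsigned char csum = 0;
	size_t i;

	for (i = 0; payload[i]; i++)
		csum += (unsigned char)payload[i];

	snprintf(out, outlen, "$%s#%02x", payload, csum);
}

int main(void)
{
	char frame[128];

	/* "g" asks the stub for all CPU registers. */
	put_gdb_packet("g", frame, sizeof(frame));
	printf("%s\n", frame);		/* prints: $g#67 */

	/* set a software breakpoint at an (example) address */
	put_gdb_packet("Z0,c01a2b3c,1", frame, sizeof(frame));
	printf("%s\n", frame);
	return 0;
}

The error replies ("Exx") and console output ("O...") produced further down in this file ride inside exactly the same envelope.
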
363/*
364 * Convert the memory pointed to by mem into hex, placing result in buf.
365 * Returns 0 on success, or the error returned by probe_kernel_read().
366 */
367int kgdb_mem2hex(char *mem, char *buf, int count)
368{
369 char *tmp;
370 int err;
371
372 /*
373 * We use the upper half of buf as an intermediate buffer for the
374 * raw memory copy. Hex conversion will work against this one.
375 */
376 tmp = buf + count;
377
378 err = probe_kernel_read(tmp, mem, count);
379 if (!err) {
380 while (count > 0) {
381 buf = pack_hex_byte(buf, *tmp);
382 tmp++;
383 count--;
384 }
385
386 *buf = 0;
387 }
388
389 return err;
390}
391
392/*
393 * Copy the binary array pointed to by buf into mem, un-escaping bytes
394 * that were escaped with 0x7d ($, # and 0x7d itself are sent as 0x7d
395 * followed by the byte XORed with 0x20). Returns 0 or an error.
396 */
397static int kgdb_ebin2mem(char *buf, char *mem, int count)
398{
399 int err = 0;
400 char c;
401
402 while (count-- > 0) {
403 c = *buf++;
404 if (c == 0x7d)
405 c = *buf++ ^ 0x20;
406
407 err = probe_kernel_write(mem, &c, 1);
408 if (err)
409 break;
410
411 mem++;
412 }
413
414 return err;
415}
416
417/*
418 * Convert the hex array pointed to by buf into binary to be placed in mem.
419 * Returns 0 on success, or the error returned by
420 * probe_kernel_write().
421 */
422int kgdb_hex2mem(char *buf, char *mem, int count)
423{
424 char *tmp_raw;
425 char *tmp_hex;
426
427 /*
428 * We use the upper half of buf as an intermediate buffer for the
429 * raw memory that is converted from hex.
430 */
431 tmp_raw = buf + count * 2;
432
433 tmp_hex = tmp_raw - 1;
434 while (tmp_hex >= buf) {
435 tmp_raw--;
436 *tmp_raw = hex(*tmp_hex--);
437 *tmp_raw |= hex(*tmp_hex--) << 4;
438 }
439
440 return probe_kernel_write(mem, tmp_raw, count);
441}
442
443/*
444 * While we find nice hex chars, build a long_val.
445 * Return number of chars processed.
446 */
447int kgdb_hex2long(char **ptr, unsigned long *long_val)
448{
449 int hex_val;
450 int num = 0;
451 int negate = 0;
452
453 *long_val = 0;
454
455 if (**ptr == '-') {
456 negate = 1;
457 (*ptr)++;
458 }
459 while (**ptr) {
460 hex_val = hex(**ptr);
461 if (hex_val < 0)
462 break;
463
464 *long_val = (*long_val << 4) | hex_val;
465 num++;
466 (*ptr)++;
467 }
468
469 if (negate)
470 *long_val = -*long_val;
471
472 return num;
473}
474
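
kgdb_mem2hex(), kgdb_ebin2mem(), kgdb_hex2mem() and kgdb_hex2long() are the stub's only data encodings: memory crosses the wire as plain hex text, and addresses/lengths are parsed with the same hex() digit helper. A stand-alone user-space round trip of the hex encoding (no probe_kernel_read/write protection here, so this is a sketch, not the kernel routine):

#include <stdio.h>

static const char hex_asc[] = "0123456789abcdef";

static int hex(char ch)
{
	if (ch >= 'a' && ch <= 'f')
		return ch - 'a' + 10;
	if (ch >= '0' && ch <= '9')
		return ch - '0';
	if (ch >= 'A' && ch <= 'F')
		return ch - 'A' + 10;
	return -1;
}

/* Encode count raw bytes as 2*count hex characters. */
static void mem2hex(const unsigned char *mem, char *buf, int count)
{
	while (count-- > 0) {
		*buf++ = hex_asc[*mem >> 4];
		*buf++ = hex_asc[*mem & 0x0f];
		mem++;
	}
	*buf = '\0';
}

/* Decode 2*count hex characters back into count raw bytes. */
static void hex2mem(const char *buf, unsigned char *mem, int count)
{
	while (count-- > 0) {
		int hi = hex(*buf++);
		int lo = hex(*buf++);

		*mem++ = (unsigned char)((hi << 4) | lo);
	}
}

int main(void)
{
	unsigned char raw[4] = { 0xde, 0xad, 0xbe, 0xef };
	unsigned char back[4];
	char encoded[2 * sizeof(raw) + 1];

	mem2hex(raw, encoded, sizeof(raw));
	hex2mem(encoded, back, sizeof(back));
	printf("%s -> %02x%02x%02x%02x\n",
	       encoded, back[0], back[1], back[2], back[3]);
	return 0;
}
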
475/* Write memory due to an 'M' or 'X' packet. */
476static int write_mem_msg(int binary)
477{
478 char *ptr = &remcom_in_buffer[1];
479 unsigned long addr;
480 unsigned long length;
481 int err;
482
483 if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
484 kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
485 if (binary)
486 err = kgdb_ebin2mem(ptr, (char *)addr, length);
487 else
488 err = kgdb_hex2mem(ptr, (char *)addr, length);
489 if (err)
490 return err;
491 if (CACHE_FLUSH_IS_SAFE)
492 flush_icache_range(addr, addr + length);
493 return 0;
494 }
495
496 return -EINVAL;
497}
498
499static void error_packet(char *pkt, int error)
500{
501 error = -error;
502 pkt[0] = 'E';
503 pkt[1] = hex_asc[(error / 10)];
504 pkt[2] = hex_asc[(error % 10)];
505 pkt[3] = '\0';
506}
507
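
error_packet() turns a negative errno into the three-character reply "E" plus two decimal digits; because the values stay below 100, indexing hex_asc with error/10 and error%10 yields ordinary decimal digits, so -EINVAL (-22) goes out as "E22". The same encoding in a tiny user-space sketch (using '0' + digit, which is equivalent for digits 0-9):

#include <stdio.h>

/* Mirror of the stub's error reply: "E" followed by two decimal digits. */
static void error_packet(char *pkt, int error)
{
	error = -error;			/* errno values arrive negated */
	pkt[0] = 'E';
	pkt[1] = (char)('0' + error / 10);
	pkt[2] = (char)('0' + error % 10);
	pkt[3] = '\0';
}

int main(void)
{
	char pkt[4];

	error_packet(pkt, -22);		/* -EINVAL */
	printf("%s\n", pkt);		/* prints: E22 */
	return 0;
}
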
508/*
509 * Thread ID accessors. We represent a flat TID space to GDB, where
510 * the per CPU idle threads (which under Linux all have PID 0) are
511 * remapped to negative TIDs.
512 */
513
514#define BUF_THREAD_ID_SIZE 16
515
516static char *pack_threadid(char *pkt, unsigned char *id)
517{
518 char *limit;
519
520 limit = pkt + BUF_THREAD_ID_SIZE;
521 while (pkt < limit)
522 pkt = pack_hex_byte(pkt, *id++);
523
524 return pkt;
525}
526
527static void int_to_threadref(unsigned char *id, int value)
528{
529 unsigned char *scan;
530 int i = 4;
531
532 scan = (unsigned char *)id;
533 while (i--)
534 *scan++ = 0;
535 put_unaligned_be32(value, scan);
536}
537
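
int_to_threadref() builds the 8-byte thread reference GDB expects: four zero bytes followed by the id stored big-endian in the last four, and pack_threadid() hex-encodes those 8 bytes into BUF_THREAD_ID_SIZE (16) characters. Per the remapping described above, ordinary tasks use their PID and the per-CPU idle/shadow threads use -cpu - 2. A user-space sketch of the layout (plain shifts stand in for put_unaligned_be32()):

#include <stdio.h>

#define BUF_THREAD_ID_SIZE 16		/* 8 bytes -> 16 hex characters */

/* 8-byte thread reference: 4 zero bytes + 32-bit big-endian id. */
static void int_to_threadref(unsigned char *id, int value)
{
	unsigned int v = (unsigned int)value;

	id[0] = id[1] = id[2] = id[3] = 0;
	id[4] = (unsigned char)(v >> 24);
	id[5] = (unsigned char)(v >> 16);
	id[6] = (unsigned char)(v >> 8);
	id[7] = (unsigned char)v;
}

static void pack_threadid(char *pkt, const unsigned char *id)
{
	static const char hex_asc[] = "0123456789abcdef";
	int i;

	for (i = 0; i < BUF_THREAD_ID_SIZE / 2; i++) {
		*pkt++ = hex_asc[id[i] >> 4];
		*pkt++ = hex_asc[id[i] & 0x0f];
	}
	*pkt = '\0';
}

int main(void)
{
	unsigned char thref[8];
	char out[BUF_THREAD_ID_SIZE + 1];

	int_to_threadref(thref, 1234);	/* an ordinary PID */
	pack_threadid(out, thref);
	printf("pid 1234     -> %s\n", out);	/* 00000000000004d2 */

	int_to_threadref(thref, -2);	/* shadow thread for CPU 0 (-cpu - 2) */
	pack_threadid(out, thref);
	printf("CPU 0 shadow -> %s\n", out);	/* 00000000fffffffe */
	return 0;
}
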
538static struct task_struct *getthread(struct pt_regs *regs, int tid)
539{
540 /*
541 * Non-positive TIDs are remapped to the cpu shadow information
542 */
543 if (tid == 0 || tid == -1)
544 tid = -atomic_read(&kgdb_active) - 2;
545 if (tid < -1 && tid > -NR_CPUS - 2) {
546 if (kgdb_info[-tid - 2].task)
547 return kgdb_info[-tid - 2].task;
548 else
549 return idle_task(-tid - 2);
550 }
551 if (tid <= 0) {
552 printk(KERN_ERR "KGDB: Internal thread select error\n");
553 dump_stack();
554 return NULL;
555 }
556
557 /*
558 * find_task_by_pid_ns() does not take the tasklist lock anymore
559 * but is nicely RCU locked - hence is a pretty resilient
560 * thing to use:
561 */
562 return find_task_by_pid_ns(tid, &init_pid_ns);
563}
564
565/*
566 * CPU debug state control:
567 */
568
569#ifdef CONFIG_SMP
570static void kgdb_wait(struct pt_regs *regs)
571{
572 unsigned long flags;
573 int cpu;
574
575 local_irq_save(flags);
576 cpu = raw_smp_processor_id();
577 kgdb_info[cpu].debuggerinfo = regs;
578 kgdb_info[cpu].task = current;
579 /*
580 * Make sure the above info reaches the primary CPU before
581 * our cpu_in_kgdb[] flag setting does:
582 */
583 smp_wmb();
584 atomic_set(&cpu_in_kgdb[cpu], 1);
585
586 /* Wait till primary CPU is done with debugging */
587 while (atomic_read(&passive_cpu_wait[cpu]))
588 cpu_relax();
589
590 kgdb_info[cpu].debuggerinfo = NULL;
591 kgdb_info[cpu].task = NULL;
592
593 /* fix up hardware debug registers on local cpu */
594 if (arch_kgdb_ops.correct_hw_break)
595 arch_kgdb_ops.correct_hw_break();
596
597 /* Signal the primary CPU that we are done: */
598 atomic_set(&cpu_in_kgdb[cpu], 0);
599 touch_softlockup_watchdog();
600 clocksource_touch_watchdog();
601 local_irq_restore(flags);
602}
603#endif
604
605/*
606 * Some architectures need cache flushes when we set/clear a
607 * breakpoint:
608 */
609static void kgdb_flush_swbreak_addr(unsigned long addr)
610{
611 if (!CACHE_FLUSH_IS_SAFE)
612 return;
613
614 if (current->mm && current->mm->mmap_cache) {
615 flush_cache_range(current->mm->mmap_cache,
616 addr, addr + BREAK_INSTR_SIZE);
617 }
618 /* Force flush instruction cache if it was outside the mm */
619 flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
620}
621
622/*
623 * SW breakpoint management:
624 */
625static int kgdb_activate_sw_breakpoints(void)
626{
627 unsigned long addr;
628 int error;
629 int ret = 0;
630 int i;
631
632 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
633 if (kgdb_break[i].state != BP_SET)
634 continue;
635
636 addr = kgdb_break[i].bpt_addr;
637 error = kgdb_arch_set_breakpoint(addr,
638 kgdb_break[i].saved_instr);
639 if (error) {
640 ret = error;
641 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
642 continue;
643 }
644
645 kgdb_flush_swbreak_addr(addr);
646 kgdb_break[i].state = BP_ACTIVE;
647 }
648 return ret;
649}
650
651static int kgdb_set_sw_break(unsigned long addr)
652{
653 int err = kgdb_validate_break_address(addr);
654 int breakno = -1;
655 int i;
656
657 if (err)
658 return err;
659
660 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
661 if ((kgdb_break[i].state == BP_SET) &&
662 (kgdb_break[i].bpt_addr == addr))
663 return -EEXIST;
664 }
665 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
666 if (kgdb_break[i].state == BP_REMOVED &&
667 kgdb_break[i].bpt_addr == addr) {
668 breakno = i;
669 break;
670 }
671 }
672
673 if (breakno == -1) {
674 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
675 if (kgdb_break[i].state == BP_UNDEFINED) {
676 breakno = i;
677 break;
678 }
679 }
680 }
681
682 if (breakno == -1)
683 return -E2BIG;
684
685 kgdb_break[breakno].state = BP_SET;
686 kgdb_break[breakno].type = BP_BREAKPOINT;
687 kgdb_break[breakno].bpt_addr = addr;
688
689 return 0;
690}
691
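
kgdb_set_sw_break() treats kgdb_break[] as a small slot allocator: an address already in BP_SET state is rejected with -EEXIST, a BP_REMOVED slot for the same address is reused, otherwise the first BP_UNDEFINED slot is taken, and -E2BIG is returned when the table is full; activation later patches the instruction and moves the slot to BP_ACTIVE. A user-space sketch of just the slot-selection logic (the enum values below are local to the sketch, not the kernel's definitions, and no instruction patching is done):

#include <stdio.h>

#define MAX_BREAKPOINTS 4

enum bp_state { BP_UNDEFINED = 0, BP_REMOVED, BP_SET, BP_ACTIVE };

struct bkpt {
	enum bp_state state;
	unsigned long addr;
};

static struct bkpt bp[MAX_BREAKPOINTS];

/* Pick a slot for addr: reuse a BP_REMOVED slot for the same address,
 * otherwise take the first BP_UNDEFINED one. */
static int set_sw_break(unsigned long addr)
{
	int slot = -1;
	int i;

	for (i = 0; i < MAX_BREAKPOINTS; i++) {
		if (bp[i].state == BP_SET && bp[i].addr == addr)
			return -1;		/* -EEXIST in the kernel */
	}
	for (i = 0; i < MAX_BREAKPOINTS; i++) {
		if (bp[i].state == BP_REMOVED && bp[i].addr == addr) {
			slot = i;
			break;
		}
	}
	if (slot == -1) {
		for (i = 0; i < MAX_BREAKPOINTS; i++) {
			if (bp[i].state == BP_UNDEFINED) {
				slot = i;
				break;
			}
		}
	}
	if (slot == -1)
		return -2;			/* -E2BIG: table is full */

	bp[slot].state = BP_SET;
	bp[slot].addr = addr;
	return slot;
}

int main(void)
{
	printf("addr 0x1000 -> %d\n", set_sw_break(0x1000));	/* slot 0 */
	printf("addr 0x1000 -> %d\n", set_sw_break(0x1000));	/* duplicate */
	printf("addr 0x2000 -> %d\n", set_sw_break(0x2000));	/* slot 1 */
	return 0;
}
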
692static int kgdb_deactivate_sw_breakpoints(void)
693{
694 unsigned long addr;
695 int error;
696 int ret = 0;
697 int i;
698
699 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
700 if (kgdb_break[i].state != BP_ACTIVE)
701 continue;
702 addr = kgdb_break[i].bpt_addr;
703 error = kgdb_arch_remove_breakpoint(addr,
704 kgdb_break[i].saved_instr);
705 if (error) {
706 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
707 ret = error;
708 }
709
710 kgdb_flush_swbreak_addr(addr);
711 kgdb_break[i].state = BP_SET;
712 }
713 return ret;
714}
715
716static int kgdb_remove_sw_break(unsigned long addr)
717{
718 int i;
719
720 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
721 if ((kgdb_break[i].state == BP_SET) &&
722 (kgdb_break[i].bpt_addr == addr)) {
723 kgdb_break[i].state = BP_REMOVED;
724 return 0;
725 }
726 }
727 return -ENOENT;
728}
729
730int kgdb_isremovedbreak(unsigned long addr)
731{
732 int i;
733
734 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
735 if ((kgdb_break[i].state == BP_REMOVED) &&
736 (kgdb_break[i].bpt_addr == addr))
737 return 1;
738 }
739 return 0;
740}
741
742static int remove_all_break(void)
743{
744 unsigned long addr;
745 int error;
746 int i;
747
748 /* Clear memory breakpoints. */
749 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
750 if (kgdb_break[i].state != BP_ACTIVE)
751 goto setundefined;
752 addr = kgdb_break[i].bpt_addr;
753 error = kgdb_arch_remove_breakpoint(addr,
754 kgdb_break[i].saved_instr);
755 if (error)
756 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
757 addr);
758setundefined:
759 kgdb_break[i].state = BP_UNDEFINED;
760 }
761
762 /* Clear hardware breakpoints. */
763 if (arch_kgdb_ops.remove_all_hw_break)
764 arch_kgdb_ops.remove_all_hw_break();
765
766 return 0;
767}
768
769/*
770 * Remap normal tasks to their real PID,
771 * CPU shadow threads are mapped to -CPU - 2
772 */
773static inline int shadow_pid(int realpid)
774{
775 if (realpid)
776 return realpid;
777
778 return -raw_smp_processor_id() - 2;
779}
780
781static char gdbmsgbuf[BUFMAX + 1];
782
783static void kgdb_msg_write(const char *s, int len)
784{
785 char *bufptr;
786 int wcount;
787 int i;
788
789 /* 'O'utput */
790 gdbmsgbuf[0] = 'O';
791
792 /* Fill and send buffers... */
793 while (len > 0) {
794 bufptr = gdbmsgbuf + 1;
795
796 /* Calculate how many this time */
797 if ((len << 1) > (BUFMAX - 2))
798 wcount = (BUFMAX - 2) >> 1;
799 else
800 wcount = len;
801
802 /* Pack in hex chars */
803 for (i = 0; i < wcount; i++)
804 bufptr = pack_hex_byte(bufptr, s[i]);
805 *bufptr = '\0';
806
807 /* Move up */
808 s += wcount;
809 len -= wcount;
810
811 /* Write packet */
812 put_packet(gdbmsgbuf);
813 }
814}
815
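
kgdb_msg_write() is how console output reaches gdb while the target is stopped: the text is split into chunks that fit BUFMAX, hex-encoded, and sent as 'O' packets, which gdb prints on its own terminal. A user-space sketch of what one small chunk looks like before the $...#csum envelope is added:

#include <stdio.h>

/* Hex-encode a console message into a GDB 'O' (output) packet body. */
static void make_o_packet(const char *msg, char *out)
{
	static const char hex_asc[] = "0123456789abcdef";

	*out++ = 'O';
	for (; *msg; msg++) {
		*out++ = hex_asc[(unsigned char)*msg >> 4];
		*out++ = hex_asc[(unsigned char)*msg & 0x0f];
	}
	*out = '\0';
}

int main(void)
{
	char pkt[256];

	make_o_packet("hi\n", pkt);
	printf("%s\n", pkt);		/* prints: O68690a */
	return 0;
}
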
816/*
817 * Return true if there is a valid kgdb I/O module. Also if no
818 * debugger is attached a message can be printed to the console about
819 * waiting for the debugger to attach.
820 *
821 * The print_wait argument is only to be true when called from inside
822 * the core kgdb_handle_exception, because it will wait for the
823 * debugger to attach.
824 */
825static int kgdb_io_ready(int print_wait)
826{
827 if (!kgdb_io_ops)
828 return 0;
829 if (kgdb_connected)
830 return 1;
831 if (atomic_read(&kgdb_setting_breakpoint))
832 return 1;
833 if (print_wait)
834 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
835 return 1;
836}
837
838/*
839 * All the functions that start with gdb_cmd are the various
840 * operations to implement the handlers for the gdbserial protocol
841 * where KGDB is communicating with an external debugger
842 */
843
844/* Handle the '?' status packets */
845static void gdb_cmd_status(struct kgdb_state *ks)
846{
847 /*
848 * We know that this packet is only sent
849 * during initial connect. So to be safe,
850 * we clear out our breakpoints now in case
851 * GDB is reconnecting.
852 */
853 remove_all_break();
854
855 remcom_out_buffer[0] = 'S';
856 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
857}
858
859/* Handle the 'g' get registers request */
860static void gdb_cmd_getregs(struct kgdb_state *ks)
861{
862 struct task_struct *thread;
863 void *local_debuggerinfo;
864 int i;
865
866 thread = kgdb_usethread;
867 if (!thread) {
868 thread = kgdb_info[ks->cpu].task;
869 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
870 } else {
871 local_debuggerinfo = NULL;
872 for_each_online_cpu(i) {
873 /*
874 * Try to find the task on some other
875 * node, or possibly this one; if we do
876 * not find a matching task then we
877 * approximate the results.
878 */
879 if (thread == kgdb_info[i].task)
880 local_debuggerinfo = kgdb_info[i].debuggerinfo;
881 }
882 }
883
884 /*
885 * All threads that don't have debuggerinfo should be
886 * in schedule() sleeping, since all other CPUs
887 * are in kgdb_wait, and thus have debuggerinfo.
888 */
889 if (local_debuggerinfo) {
890 pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
891 } else {
892 /*
893 * Pull stuff saved during switch_to; nothing
894 * else is accessible (or even particularly
895 * relevant).
896 *
897 * This should be enough for a stack trace.
898 */
899 sleeping_thread_to_gdb_regs(gdb_regs, thread);
900 }
901 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
902}
903
904/* Handle the 'G' set registers request */
905static void gdb_cmd_setregs(struct kgdb_state *ks)
906{
907 kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
908
909 if (kgdb_usethread && kgdb_usethread != current) {
910 error_packet(remcom_out_buffer, -EINVAL);
911 } else {
912 gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
913 strcpy(remcom_out_buffer, "OK");
914 }
915}
916
917/* Handle the 'm' memory read bytes */
918static void gdb_cmd_memread(struct kgdb_state *ks)
919{
920 char *ptr = &remcom_in_buffer[1];
921 unsigned long length;
922 unsigned long addr;
923 int err;
924
925 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
926 kgdb_hex2long(&ptr, &length) > 0) {
927 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
928 if (err)
929 error_packet(remcom_out_buffer, err);
930 } else {
931 error_packet(remcom_out_buffer, -EINVAL);
932 }
933}
934
935/* Handle the 'M' memory write bytes */
936static void gdb_cmd_memwrite(struct kgdb_state *ks)
937{
938 int err = write_mem_msg(0);
939
940 if (err)
941 error_packet(remcom_out_buffer, err);
942 else
943 strcpy(remcom_out_buffer, "OK");
944}
945
946/* Handle the 'X' memory binary write bytes */
947static void gdb_cmd_binwrite(struct kgdb_state *ks)
948{
949 int err = write_mem_msg(1);
950
951 if (err)
952 error_packet(remcom_out_buffer, err);
953 else
954 strcpy(remcom_out_buffer, "OK");
955}
956
957/* Handle the 'D' or 'k', detach or kill packets */
958static void gdb_cmd_detachkill(struct kgdb_state *ks)
959{
960 int error;
961
962 /* The detach case */
963 if (remcom_in_buffer[0] == 'D') {
964 error = remove_all_break();
965 if (error < 0) {
966 error_packet(remcom_out_buffer, error);
967 } else {
968 strcpy(remcom_out_buffer, "OK");
969 kgdb_connected = 0;
970 }
971 put_packet(remcom_out_buffer);
972 } else {
973 /*
974 * Assume the kill case, with no exit code checking,
975 * trying to force detach the debugger:
976 */
977 remove_all_break();
978 kgdb_connected = 0;
979 }
980}
981
982/* Handle the 'R' reboot packets */
983static int gdb_cmd_reboot(struct kgdb_state *ks)
984{
985 /* For now, only honor R0 */
986 if (strcmp(remcom_in_buffer, "R0") == 0) {
987 printk(KERN_CRIT "Executing emergency reboot\n");
988 strcpy(remcom_out_buffer, "OK");
989 put_packet(remcom_out_buffer);
990
991 /*
992 * Execution should not return from
993 * machine_emergency_restart()
994 */
995 machine_emergency_restart();
996 kgdb_connected = 0;
997
998 return 1;
999 }
1000 return 0;
1001}
1002
1003/* Handle the 'q' query packets */
1004static void gdb_cmd_query(struct kgdb_state *ks)
1005{
1006 struct task_struct *g;
1007 struct task_struct *p;
1008 unsigned char thref[8];
1009 char *ptr;
1010 int i;
1011 int cpu;
1012 int finished = 0;
1013
1014 switch (remcom_in_buffer[1]) {
1015 case 's':
1016 case 'f':
1017 if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) {
1018 error_packet(remcom_out_buffer, -EINVAL);
1019 break;
1020 }
1021
1022 i = 0;
1023 remcom_out_buffer[0] = 'm';
1024 ptr = remcom_out_buffer + 1;
1025 if (remcom_in_buffer[1] == 'f') {
1026 /* Each cpu is a shadow thread */
1027 for_each_online_cpu(cpu) {
1028 ks->thr_query = 0;
1029 int_to_threadref(thref, -cpu - 2);
1030 pack_threadid(ptr, thref);
1031 ptr += BUF_THREAD_ID_SIZE;
1032 *(ptr++) = ',';
1033 i++;
1034 }
1035 }
1036
1037 do_each_thread(g, p) {
1038 if (i >= ks->thr_query && !finished) {
1039 int_to_threadref(thref, p->pid);
1040 pack_threadid(ptr, thref);
1041 ptr += BUF_THREAD_ID_SIZE;
1042 *(ptr++) = ',';
1043 ks->thr_query++;
1044 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
1045 finished = 1;
1046 }
1047 i++;
1048 } while_each_thread(g, p);
1049
1050 *(--ptr) = '\0';
1051 break;
1052
1053 case 'C':
1054 /* Current thread id */
1055 strcpy(remcom_out_buffer, "QC");
1056 ks->threadid = shadow_pid(current->pid);
1057 int_to_threadref(thref, ks->threadid);
1058 pack_threadid(remcom_out_buffer + 2, thref);
1059 break;
1060 case 'T':
1061 if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) {
1062 error_packet(remcom_out_buffer, -EINVAL);
1063 break;
1064 }
1065 ks->threadid = 0;
1066 ptr = remcom_in_buffer + 17;
1067 kgdb_hex2long(&ptr, &ks->threadid);
1068 if (!getthread(ks->linux_regs, ks->threadid)) {
1069 error_packet(remcom_out_buffer, -EINVAL);
1070 break;
1071 }
1072 if ((int)ks->threadid > 0) {
1073 kgdb_mem2hex(getthread(ks->linux_regs,
1074 ks->threadid)->comm,
1075 remcom_out_buffer, 16);
1076 } else {
1077 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
1078
1079 sprintf(tmpstr, "shadowCPU%d",
1080 (int)(-ks->threadid - 2));
1081 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
1082 }
1083 break;
1084 }
1085}
1086
1087/* Handle the 'H' task query packets */
1088static void gdb_cmd_task(struct kgdb_state *ks)
1089{
1090 struct task_struct *thread;
1091 char *ptr;
1092
1093 switch (remcom_in_buffer[1]) {
1094 case 'g':
1095 ptr = &remcom_in_buffer[2];
1096 kgdb_hex2long(&ptr, &ks->threadid);
1097 thread = getthread(ks->linux_regs, ks->threadid);
1098 if (!thread && ks->threadid > 0) {
1099 error_packet(remcom_out_buffer, -EINVAL);
1100 break;
1101 }
1102 kgdb_usethread = thread;
1103 ks->kgdb_usethreadid = ks->threadid;
1104 strcpy(remcom_out_buffer, "OK");
1105 break;
1106 case 'c':
1107 ptr = &remcom_in_buffer[2];
1108 kgdb_hex2long(&ptr, &ks->threadid);
1109 if (!ks->threadid) {
1110 kgdb_contthread = NULL;
1111 } else {
1112 thread = getthread(ks->linux_regs, ks->threadid);
1113 if (!thread && ks->threadid > 0) {
1114 error_packet(remcom_out_buffer, -EINVAL);
1115 break;
1116 }
1117 kgdb_contthread = thread;
1118 }
1119 strcpy(remcom_out_buffer, "OK");
1120 break;
1121 }
1122}
1123
1124/* Handle the 'T' thread query packets */
1125static void gdb_cmd_thread(struct kgdb_state *ks)
1126{
1127 char *ptr = &remcom_in_buffer[1];
1128 struct task_struct *thread;
1129
1130 kgdb_hex2long(&ptr, &ks->threadid);
1131 thread = getthread(ks->linux_regs, ks->threadid);
1132 if (thread)
1133 strcpy(remcom_out_buffer, "OK");
1134 else
1135 error_packet(remcom_out_buffer, -EINVAL);
1136}
1137
1138/* Handle the 'z' or 'Z' breakpoint remove or set packets */
1139static void gdb_cmd_break(struct kgdb_state *ks)
1140{
1141 /*
1142 * Since GDB-5.3, it's been drafted that '0' is a software
1143 * breakpoint, '1' is a hardware breakpoint, so let's do that.
1144 */
1145 char *bpt_type = &remcom_in_buffer[1];
1146 char *ptr = &remcom_in_buffer[2];
1147 unsigned long addr;
1148 unsigned long length;
1149 int error = 0;
1150
1151 if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
1152 /* Unsupported */
1153 if (*bpt_type > '4')
1154 return;
1155 } else {
1156 if (*bpt_type != '0' && *bpt_type != '1')
1157 /* Unsupported. */
1158 return;
1159 }
1160
1161 /*
1162 * Test if this is a hardware breakpoint, and
1163 * if we support it:
1164 */
1165 if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
1166 /* Unsupported. */
1167 return;
1168
1169 if (*(ptr++) != ',') {
1170 error_packet(remcom_out_buffer, -EINVAL);
1171 return;
1172 }
1173 if (!kgdb_hex2long(&ptr, &addr)) {
1174 error_packet(remcom_out_buffer, -EINVAL);
1175 return;
1176 }
1177 if (*(ptr++) != ',' ||
1178 !kgdb_hex2long(&ptr, &length)) {
1179 error_packet(remcom_out_buffer, -EINVAL);
1180 return;
1181 }
1182
1183 if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
1184 error = kgdb_set_sw_break(addr);
1185 else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
1186 error = kgdb_remove_sw_break(addr);
1187 else if (remcom_in_buffer[0] == 'Z')
1188 error = arch_kgdb_ops.set_hw_breakpoint(addr,
1189 (int)length, *bpt_type - '0');
1190 else if (remcom_in_buffer[0] == 'z')
1191 error = arch_kgdb_ops.remove_hw_breakpoint(addr,
1192 (int) length, *bpt_type - '0');
1193
1194 if (error == 0)
1195 strcpy(remcom_out_buffer, "OK");
1196 else
1197 error_packet(remcom_out_buffer, error);
1198}
1199
1200/* Handle the 'C' signal / exception passing packets */
1201static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1202{
1203 /* C09 == pass exception
1204 * C15 == detach kgdb, pass exception
1205 */
1206 if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
1207
1208 ks->pass_exception = 1;
1209 remcom_in_buffer[0] = 'c';
1210
1211 } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
1212
1213 ks->pass_exception = 1;
1214 remcom_in_buffer[0] = 'D';
1215 remove_all_break();
1216 kgdb_connected = 0;
1217 return 1;
1218
1219 } else {
1220 kgdb_msg_write("KGDB only knows signal 9 (pass)"
1221 " and 15 (pass and disconnect)\n"
1222 "Executing a continue without signal passing\n", 0);
1223 remcom_in_buffer[0] = 'c';
1224 }
1225
1226 /* Indicate fall through */
1227 return -1;
1228}
1229
1230/*
1231 * This function performs all gdbserial command processing
1232 */
1233static int gdb_serial_stub(struct kgdb_state *ks)
1234{
1235 int error = 0;
1236 int tmp;
1237
1238 /* Clear the out buffer. */
1239 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
1240
1241 if (kgdb_connected) {
1242 unsigned char thref[8];
1243 char *ptr;
1244
1245 /* Reply to host that an exception has occurred */
1246 ptr = remcom_out_buffer;
1247 *ptr++ = 'T';
1248 ptr = pack_hex_byte(ptr, ks->signo);
1249 ptr += strlen(strcpy(ptr, "thread:"));
1250 int_to_threadref(thref, shadow_pid(current->pid));
1251 ptr = pack_threadid(ptr, thref);
1252 *ptr++ = ';';
1253 put_packet(remcom_out_buffer);
1254 }
1255
1256 kgdb_usethread = kgdb_info[ks->cpu].task;
1257 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
1258 ks->pass_exception = 0;
1259
1260 while (1) {
1261 error = 0;
1262
1263 /* Clear the out buffer. */
1264 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
1265
1266 get_packet(remcom_in_buffer);
1267
1268 switch (remcom_in_buffer[0]) {
1269 case '?': /* gdbserial status */
1270 gdb_cmd_status(ks);
1271 break;
1272 case 'g': /* return the value of the CPU registers */
1273 gdb_cmd_getregs(ks);
1274 break;
1275 case 'G': /* set the value of the CPU registers - return OK */
1276 gdb_cmd_setregs(ks);
1277 break;
1278 case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
1279 gdb_cmd_memread(ks);
1280 break;
1281 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
1282 gdb_cmd_memwrite(ks);
1283 break;
1284 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
1285 gdb_cmd_binwrite(ks);
1286 break;
1287 /* kill or detach. KGDB should treat this like a
1288 * continue.
1289 */
1290 case 'D': /* Debugger detach */
1291 case 'k': /* Debugger detach via kill */
1292 gdb_cmd_detachkill(ks);
1293 goto default_handle;
1294 case 'R': /* Reboot */
1295 if (gdb_cmd_reboot(ks))
1296 goto default_handle;
1297 break;
1298 case 'q': /* query command */
1299 gdb_cmd_query(ks);
1300 break;
1301 case 'H': /* task related */
1302 gdb_cmd_task(ks);
1303 break;
1304 case 'T': /* Query thread status */
1305 gdb_cmd_thread(ks);
1306 break;
1307 case 'z': /* Break point remove */
1308 case 'Z': /* Break point set */
1309 gdb_cmd_break(ks);
1310 break;
1311 case 'C': /* Exception passing */
1312 tmp = gdb_cmd_exception_pass(ks);
1313 if (tmp > 0)
1314 goto default_handle;
1315 if (tmp == 0)
1316 break;
1317 /* Fall through on tmp < 0 */
1318 case 'c': /* Continue packet */
1319 case 's': /* Single step packet */
1320 if (kgdb_contthread && kgdb_contthread != current) {
1321 /* Can't switch threads in kgdb */
1322 error_packet(remcom_out_buffer, -EINVAL);
1323 break;
1324 }
1325 kgdb_activate_sw_breakpoints();
1326 /* Fall through to default processing */
1327 default:
1328default_handle:
1329 error = kgdb_arch_handle_exception(ks->ex_vector,
1330 ks->signo,
1331 ks->err_code,
1332 remcom_in_buffer,
1333 remcom_out_buffer,
1334 ks->linux_regs);
1335 /*
1336 * Leave cmd processing on error, detach,
1337 * kill, continue, or single step.
1338 */
1339 if (error >= 0 || remcom_in_buffer[0] == 'D' ||
1340 remcom_in_buffer[0] == 'k') {
1341 error = 0;
1342 goto kgdb_exit;
1343 }
1344
1345 }
1346
1347 /* reply to the request */
1348 put_packet(remcom_out_buffer);
1349 }
1350
1351kgdb_exit:
1352 if (ks->pass_exception)
1353 error = 1;
1354 return error;
1355}
1356
1357static int kgdb_reenter_check(struct kgdb_state *ks)
1358{
1359 unsigned long addr;
1360
1361 if (atomic_read(&kgdb_active) != raw_smp_processor_id())
1362 return 0;
1363
1364 /* Panic on recursive debugger calls: */
1365 exception_level++;
1366 addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
1367 kgdb_deactivate_sw_breakpoints();
1368
1369 /*
1370 * If the breakpoint was removed OK at the place the exception
1371 * occurred, try to recover and print a warning to the end
1372 * user because the user planted a breakpoint in a place that
1373 * KGDB needs in order to function.
1374 */
1375 if (kgdb_remove_sw_break(addr) == 0) {
1376 exception_level = 0;
1377 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
1378 kgdb_activate_sw_breakpoints();
1379 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
1380 addr);
1381 WARN_ON_ONCE(1);
1382
1383 return 1;
1384 }
1385 remove_all_break();
1386 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
1387
1388 if (exception_level > 1) {
1389 dump_stack();
1390 panic("Recursive entry to debugger");
1391 }
1392
1393 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
1394 dump_stack();
1395 panic("Recursive entry to debugger");
1396
1397 return 1;
1398}
1399
1400/*
1401 * kgdb_handle_exception() - main entry point from a kernel exception
1402 *
1403 * Locking hierarchy:
1404 * interface locks, if any (begin_session)
1405 * kgdb lock (kgdb_active)
1406 */
1407int
1408kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1409{
1410 struct kgdb_state kgdb_var;
1411 struct kgdb_state *ks = &kgdb_var;
1412 unsigned long flags;
1413 int sstep_tries = 100;
1414 int error = 0;
1415 int i, cpu;
1416
1417 ks->cpu = raw_smp_processor_id();
1418 ks->ex_vector = evector;
1419 ks->signo = signo;
1420 ks->ex_vector = evector;
1421 ks->err_code = ecode;
1422 ks->kgdb_usethreadid = 0;
1423 ks->linux_regs = regs;
1424
1425 if (kgdb_reenter_check(ks))
1426 return 0; /* Ouch, double exception ! */
1427
1428acquirelock:
1429 /*
1430 * Interrupts will be restored by the 'trap return' code, except when
1431 * single stepping.
1432 */
1433 local_irq_save(flags);
1434
1435 cpu = raw_smp_processor_id();
1436
1437 /*
1438 * Acquire the kgdb_active lock:
1439 */
1440 while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1)
1441 cpu_relax();
1442
1443 /*
1444 * For single stepping, try to only enter on the processor
1445 * that was single stepping. To guard against a deadlock, the
1446 * kernel will only try for the value of sstep_tries before
1447 * giving up and continuing on.
1448 */
1449 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1450 (kgdb_info[cpu].task &&
1451 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1452 atomic_set(&kgdb_active, -1);
1453 touch_softlockup_watchdog();
1454 clocksource_touch_watchdog();
1455 local_irq_restore(flags);
1456
1457 goto acquirelock;
1458 }
1459
1460 if (!kgdb_io_ready(1)) {
1461 error = 1;
1462 goto kgdb_restore; /* No I/O connection, so resume the system */
1463 }
1464
1465 /*
1466 * Don't enter if we have hit a removed breakpoint.
1467 */
1468 if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
1469 goto kgdb_restore;
1470
1471 /* Call the I/O driver's pre_exception routine */
1472 if (kgdb_io_ops->pre_exception)
1473 kgdb_io_ops->pre_exception();
1474
1475 kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs;
1476 kgdb_info[ks->cpu].task = current;
1477
1478 kgdb_disable_hw_debug(ks->linux_regs);
1479
1480 /*
1481 * Get the passive CPU lock which will hold all the non-primary
1482 * CPU in a spin state while the debugger is active
1483 */
1484 if (!kgdb_single_step) {
1485 for (i = 0; i < NR_CPUS; i++)
1486 atomic_set(&passive_cpu_wait[i], 1);
1487 }
1488
1489 /*
1490 * spin_lock code is good enough as a barrier so we don't
1491 * need one here:
1492 */
1493 atomic_set(&cpu_in_kgdb[ks->cpu], 1);
1494
1495#ifdef CONFIG_SMP
1496 /* Signal the other CPUs to enter kgdb_wait() */
1497 if ((!kgdb_single_step) && kgdb_do_roundup)
1498 kgdb_roundup_cpus(flags);
1499#endif
1500
1501 /*
1502 * Wait for the other CPUs to be notified and be waiting for us:
1503 */
1504 for_each_online_cpu(i) {
1505 while (!atomic_read(&cpu_in_kgdb[i]))
1506 cpu_relax();
1507 }
1508
1509 /*
1510 * At this point the primary processor is completely
1511 * in the debugger and all secondary CPUs are quiescent
1512 */
1513 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
1514 kgdb_deactivate_sw_breakpoints();
1515 kgdb_single_step = 0;
1516 kgdb_contthread = current;
1517 exception_level = 0;
1518
1519 /* Talk to debugger with gdbserial protocol */
1520 error = gdb_serial_stub(ks);
1521
1522 /* Call the I/O driver's post_exception routine */
1523 if (kgdb_io_ops->post_exception)
1524 kgdb_io_ops->post_exception();
1525
1526 kgdb_info[ks->cpu].debuggerinfo = NULL;
1527 kgdb_info[ks->cpu].task = NULL;
1528 atomic_set(&cpu_in_kgdb[ks->cpu], 0);
1529
1530 if (!kgdb_single_step) {
1531 for (i = NR_CPUS-1; i >= 0; i--)
1532 atomic_set(&passive_cpu_wait[i], 0);
1533 /*
1534 * Wait till all the CPUs have quit
1535 * from the debugger.
1536 */
1537 for_each_online_cpu(i) {
1538 while (atomic_read(&cpu_in_kgdb[i]))
1539 cpu_relax();
1540 }
1541 }
1542
1543kgdb_restore:
1544 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
1545 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
1546 if (kgdb_info[sstep_cpu].task)
1547 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
1548 else
1549 kgdb_sstep_pid = 0;
1550 }
1551 /* Free kgdb_active */
1552 atomic_set(&kgdb_active, -1);
1553 touch_softlockup_watchdog();
1554 clocksource_touch_watchdog();
1555 local_irq_restore(flags);
1556
1557 return error;
1558}
1559
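
kgdb_handle_exception() is the round-up choreography: the first CPU to take the exception wins the kgdb_active cmpxchg, sets passive_cpu_wait[] so the other CPUs park themselves in kgdb_wait(), asks the architecture to round them up, and then waits for every online CPU to check in through cpu_in_kgdb[] before the gdb session starts; releasing the CPUs reverses the handshake. A user-space analogue of that two-flag handshake, with threads standing in for CPUs (a sketch under that analogy, not kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

static atomic_int passive_wait[NCPUS];	/* primary tells the others to park */
static atomic_int in_debugger[NCPUS];	/* each "CPU" reports it has parked */

static void *passive_cpu(void *arg)
{
	int cpu = (int)(long)arg;

	/* kgdb_wait(): announce ourselves, then spin until released. */
	atomic_store(&in_debugger[cpu], 1);
	while (atomic_load(&passive_wait[cpu]))
		;	/* cpu_relax() */
	atomic_store(&in_debugger[cpu], 0);
	return NULL;
}

int main(void)
{
	pthread_t tid[NCPUS];
	int i;

	/* Primary "CPU" 0: park the others before "debugging". */
	for (i = 1; i < NCPUS; i++)
		atomic_store(&passive_wait[i], 1);
	for (i = 1; i < NCPUS; i++)
		pthread_create(&tid[i], NULL, passive_cpu, (void *)(long)i);

	/* Wait until every passive CPU has checked in. */
	for (i = 1; i < NCPUS; i++)
		while (!atomic_load(&in_debugger[i]))
			;
	printf("all passive CPUs parked; the gdb session would run here\n");

	/* Release them and wait until they have all left. */
	for (i = 1; i < NCPUS; i++)
		atomic_store(&passive_wait[i], 0);
	for (i = 1; i < NCPUS; i++)
		while (atomic_load(&in_debugger[i]))
			;
	for (i = 1; i < NCPUS; i++)
		pthread_join(tid[i], NULL);
	printf("all passive CPUs released\n");
	return 0;
}
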
1560int kgdb_nmicallback(int cpu, void *regs)
1561{
1562#ifdef CONFIG_SMP
1563 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
1564 atomic_read(&kgdb_active) != cpu &&
1565 atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) {
1566 kgdb_wait((struct pt_regs *)regs);
1567 return 0;
1568 }
1569#endif
1570 return 1;
1571}
1572
1573static void kgdb_console_write(struct console *co, const char *s,
1574 unsigned count)
1575{
1576 unsigned long flags;
1577
1578 /* If we're debugging, or KGDB has not connected, don't try
1579 * and print. */
1580 if (!kgdb_connected || atomic_read(&kgdb_active) != -1)
1581 return;
1582
1583 local_irq_save(flags);
1584 kgdb_msg_write(s, count);
1585 local_irq_restore(flags);
1586}
1587
1588static struct console kgdbcons = {
1589 .name = "kgdb",
1590 .write = kgdb_console_write,
1591 .flags = CON_PRINTBUFFER | CON_ENABLED,
1592 .index = -1,
1593};
1594
1595#ifdef CONFIG_MAGIC_SYSRQ
1596static void sysrq_handle_gdb(int key, struct tty_struct *tty)
1597{
1598 if (!kgdb_io_ops) {
1599 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
1600 return;
1601 }
1602 if (!kgdb_connected)
1603 printk(KERN_CRIT "Entering KGDB\n");
1604
1605 kgdb_breakpoint();
1606}
1607
1608static struct sysrq_key_op sysrq_gdb_op = {
1609 .handler = sysrq_handle_gdb,
1610 .help_msg = "debug(G)",
1611 .action_msg = "DEBUG",
1612};
1613#endif
1614
1615static void kgdb_register_callbacks(void)
1616{
1617 if (!kgdb_io_module_registered) {
1618 kgdb_io_module_registered = 1;
1619 kgdb_arch_init();
1620#ifdef CONFIG_MAGIC_SYSRQ
1621 register_sysrq_key('g', &sysrq_gdb_op);
1622#endif
1623 if (kgdb_use_con && !kgdb_con_registered) {
1624 register_console(&kgdbcons);
1625 kgdb_con_registered = 1;
1626 }
1627 }
1628}
1629
1630static void kgdb_unregister_callbacks(void)
1631{
1632 /*
1633 * When this routine is called KGDB should unregister from the
1634 * panic handler and clean up, making sure it is not handling any
1635 * break exceptions at the time.
1636 */
1637 if (kgdb_io_module_registered) {
1638 kgdb_io_module_registered = 0;
1639 kgdb_arch_exit();
1640#ifdef CONFIG_MAGIC_SYSRQ
1641 unregister_sysrq_key('g', &sysrq_gdb_op);
1642#endif
1643 if (kgdb_con_registered) {
1644 unregister_console(&kgdbcons);
1645 kgdb_con_registered = 0;
1646 }
1647 }
1648}
1649
1650static void kgdb_initial_breakpoint(void)
1651{
1652 kgdb_break_asap = 0;
1653
1654 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
1655 kgdb_breakpoint();
1656}
1657
1658/**
1659 * kgdb_register_io_module - register KGDB IO module
1660 * @new_kgdb_io_ops: the io ops vector
1661 *
1662 * Register it with the KGDB core.
1663 */
1664int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops)
1665{
1666 int err;
1667
1668 spin_lock(&kgdb_registration_lock);
1669
1670 if (kgdb_io_ops) {
1671 spin_unlock(&kgdb_registration_lock);
1672
1673 printk(KERN_ERR "kgdb: Another I/O driver is already "
1674 "registered with KGDB.\n");
1675 return -EBUSY;
1676 }
1677
1678 if (new_kgdb_io_ops->init) {
1679 err = new_kgdb_io_ops->init();
1680 if (err) {
1681 spin_unlock(&kgdb_registration_lock);
1682 return err;
1683 }
1684 }
1685
1686 kgdb_io_ops = new_kgdb_io_ops;
1687
1688 spin_unlock(&kgdb_registration_lock);
1689
1690 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
1691 new_kgdb_io_ops->name);
1692
1693 /* Arm KGDB now. */
1694 kgdb_register_callbacks();
1695
1696 if (kgdb_break_asap)
1697 kgdb_initial_breakpoint();
1698
1699 return 0;
1700}
1701EXPORT_SYMBOL_GPL(kgdb_register_io_module);
1702
1703/**
1704 * kgdb_unregister_io_module - unregister KGDB IO module
1705 * @old_kgdb_io_ops: the io ops vector
1706 *
1707 * Unregister it with the KGDB core.
1708 */
1709void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops)
1710{
1711 BUG_ON(kgdb_connected);
1712
1713 /*
1714 * KGDB is no longer able to communicate out, so
1715 * unregister our callbacks and reset state.
1716 */
1717 kgdb_unregister_callbacks();
1718
1719 spin_lock(&kgdb_registration_lock);
1720
1721 WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops);
1722 kgdb_io_ops = NULL;
1723
1724 spin_unlock(&kgdb_registration_lock);
1725
1726 printk(KERN_INFO
1727 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
1728 old_kgdb_io_ops->name);
1729}
1730EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
1731
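
kgdb_register_io_module()/kgdb_unregister_io_module() are the only entry points an I/O driver (a polled serial port, for example) needs; the core then drives the connection through the read_char/write_char/flush callbacks used throughout this file. The sketch below shows the general shape of such a driver; the uart_poll_getc()/uart_poll_putc() helpers are hypothetical, and the struct kgdb_io field types are assumed from how this file calls them rather than quoted from the header:

#include <linux/kgdb.h>
#include <linux/module.h>

/* Hypothetical polled-UART primitives provided elsewhere by the driver. */
extern int uart_poll_getc(void);	/* blocks, returns one character */
extern void uart_poll_putc(u8 c);

static int my_kgdb_read_char(void)
{
	return uart_poll_getc();
}

static void my_kgdb_write_char(u8 c)
{
	uart_poll_putc(c);
}

static struct kgdb_io my_kgdb_io_ops = {
	.name		= "my_uart_kgdb",
	.read_char	= my_kgdb_read_char,
	.write_char	= my_kgdb_write_char,
};

static int __init my_kgdb_init(void)
{
	/* Arms KGDB: from here on, kgdbwait or sysrq-g can stop the kernel. */
	return kgdb_register_io_module(&my_kgdb_io_ops);
}

static void __exit my_kgdb_exit(void)
{
	kgdb_unregister_io_module(&my_kgdb_io_ops);
}

module_init(my_kgdb_init);
module_exit(my_kgdb_exit);
MODULE_LICENSE("GPL");

With "kgdbwait" on the command line, opt_kgdb_wait() below marks kgdb_break_asap, so kgdb_initial_breakpoint() fires as soon as a driver like this registers.
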
1732/**
1733 * kgdb_breakpoint - generate breakpoint exception
1734 *
1735 * This function will generate a breakpoint exception. It is used at the
1736 * beginning of a program to sync up with a debugger and can be used
1737 * otherwise as a quick means to stop program execution and "break" into
1738 * the debugger.
1739 */
1740void kgdb_breakpoint(void)
1741{
1742 atomic_set(&kgdb_setting_breakpoint, 1);
1743 wmb(); /* Sync point before breakpoint */
1744 arch_kgdb_breakpoint();
1745 wmb(); /* Sync point after breakpoint */
1746 atomic_set(&kgdb_setting_breakpoint, 0);
1747}
1748EXPORT_SYMBOL_GPL(kgdb_breakpoint);
1749
1750static int __init opt_kgdb_wait(char *str)
1751{
1752 kgdb_break_asap = 1;
1753
1754 if (kgdb_io_module_registered)
1755 kgdb_initial_breakpoint();
1756
1757 return 0;
1758}
1759
1760early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bf0e231d9702..6e9b19667a8d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -116,27 +116,16 @@ int __request_module(bool wait, const char *fmt, ...)
116 116
117 trace_module_request(module_name, wait, _RET_IP_); 117 trace_module_request(module_name, wait, _RET_IP_);
118 118
119 ret = call_usermodehelper(modprobe_path, argv, envp, 119 ret = call_usermodehelper_fns(modprobe_path, argv, envp,
120 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); 120 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
121 NULL, NULL, NULL);
122
121 atomic_dec(&kmod_concurrent); 123 atomic_dec(&kmod_concurrent);
122 return ret; 124 return ret;
123} 125}
124EXPORT_SYMBOL(__request_module); 126EXPORT_SYMBOL(__request_module);
125#endif /* CONFIG_MODULES */ 127#endif /* CONFIG_MODULES */
126 128
127struct subprocess_info {
128 struct work_struct work;
129 struct completion *complete;
130 struct cred *cred;
131 char *path;
132 char **argv;
133 char **envp;
134 enum umh_wait wait;
135 int retval;
136 struct file *stdin;
137 void (*cleanup)(char **argv, char **envp);
138};
139
140/* 129/*
141 * This is the task which runs the usermode application 130 * This is the task which runs the usermode application
142 */ 131 */
@@ -145,36 +134,10 @@ static int ____call_usermodehelper(void *data)
145 struct subprocess_info *sub_info = data; 134 struct subprocess_info *sub_info = data;
146 int retval; 135 int retval;
147 136
148 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
149
150 /* Unblock all signals */
151 spin_lock_irq(&current->sighand->siglock); 137 spin_lock_irq(&current->sighand->siglock);
152 flush_signal_handlers(current, 1); 138 flush_signal_handlers(current, 1);
153 sigemptyset(&current->blocked);
154 recalc_sigpending();
155 spin_unlock_irq(&current->sighand->siglock); 139 spin_unlock_irq(&current->sighand->siglock);
156 140
157 /* Install the credentials */
158 commit_creds(sub_info->cred);
159 sub_info->cred = NULL;
160
161 /* Install input pipe when needed */
162 if (sub_info->stdin) {
163 struct files_struct *f = current->files;
164 struct fdtable *fdt;
165 /* no races because files should be private here */
166 sys_close(0);
167 fd_install(0, sub_info->stdin);
168 spin_lock(&f->file_lock);
169 fdt = files_fdtable(f);
170 FD_SET(0, fdt->open_fds);
171 FD_CLR(0, fdt->close_on_exec);
172 spin_unlock(&f->file_lock);
173
174 /* and disallow core files too */
175 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
176 }
177
178 /* We can run anywhere, unlike our parent keventd(). */ 141 /* We can run anywhere, unlike our parent keventd(). */
179 set_cpus_allowed_ptr(current, cpu_all_mask); 142 set_cpus_allowed_ptr(current, cpu_all_mask);
180 143
@@ -184,9 +147,16 @@ static int ____call_usermodehelper(void *data)
184 */ 147 */
185 set_user_nice(current, 0); 148 set_user_nice(current, 0);
186 149
150 if (sub_info->init) {
151 retval = sub_info->init(sub_info);
152 if (retval)
153 goto fail;
154 }
155
187 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); 156 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
188 157
189 /* Exec failed? */ 158 /* Exec failed? */
159fail:
190 sub_info->retval = retval; 160 sub_info->retval = retval;
191 do_exit(0); 161 do_exit(0);
192} 162}
@@ -194,9 +164,7 @@ static int ____call_usermodehelper(void *data)
194void call_usermodehelper_freeinfo(struct subprocess_info *info) 164void call_usermodehelper_freeinfo(struct subprocess_info *info)
195{ 165{
196 if (info->cleanup) 166 if (info->cleanup)
197 (*info->cleanup)(info->argv, info->envp); 167 (*info->cleanup)(info);
198 if (info->cred)
199 put_cred(info->cred);
200 kfree(info); 168 kfree(info);
201} 169}
202EXPORT_SYMBOL(call_usermodehelper_freeinfo); 170EXPORT_SYMBOL(call_usermodehelper_freeinfo);
@@ -207,16 +175,16 @@ static int wait_for_helper(void *data)
207 struct subprocess_info *sub_info = data; 175 struct subprocess_info *sub_info = data;
208 pid_t pid; 176 pid_t pid;
209 177
210 /* Install a handler: if SIGCLD isn't handled sys_wait4 won't 178 /* If SIGCLD is ignored sys_wait4 won't populate the status. */
211 * populate the status, but will return -ECHILD. */ 179 spin_lock_irq(&current->sighand->siglock);
212 allow_signal(SIGCHLD); 180 current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL;
181 spin_unlock_irq(&current->sighand->siglock);
213 182
214 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 183 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
215 if (pid < 0) { 184 if (pid < 0) {
216 sub_info->retval = pid; 185 sub_info->retval = pid;
217 } else { 186 } else {
218 int ret; 187 int ret = -ECHILD;
219
220 /* 188 /*
221 * Normally it is bogus to call wait4() from in-kernel because 189 * Normally it is bogus to call wait4() from in-kernel because
222 * wait4() wants to write the exit code to a userspace address. 190 * wait4() wants to write the exit code to a userspace address.
@@ -237,10 +205,7 @@ static int wait_for_helper(void *data)
237 sub_info->retval = ret; 205 sub_info->retval = ret;
238 } 206 }
239 207
240 if (sub_info->wait == UMH_NO_WAIT) 208 complete(sub_info->complete);
241 call_usermodehelper_freeinfo(sub_info);
242 else
243 complete(sub_info->complete);
244 return 0; 209 return 0;
245} 210}
246 211
@@ -249,15 +214,13 @@ static void __call_usermodehelper(struct work_struct *work)
249{ 214{
250 struct subprocess_info *sub_info = 215 struct subprocess_info *sub_info =
251 container_of(work, struct subprocess_info, work); 216 container_of(work, struct subprocess_info, work);
252 pid_t pid;
253 enum umh_wait wait = sub_info->wait; 217 enum umh_wait wait = sub_info->wait;
254 218 pid_t pid;
255 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
256 219
257 /* CLONE_VFORK: wait until the usermode helper has execve'd 220 /* CLONE_VFORK: wait until the usermode helper has execve'd
258 * successfully We need the data structures to stay around 221 * successfully We need the data structures to stay around
259 * until that is done. */ 222 * until that is done. */
260 if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) 223 if (wait == UMH_WAIT_PROC)
261 pid = kernel_thread(wait_for_helper, sub_info, 224 pid = kernel_thread(wait_for_helper, sub_info,
262 CLONE_FS | CLONE_FILES | SIGCHLD); 225 CLONE_FS | CLONE_FILES | SIGCHLD);
263 else 226 else
@@ -266,15 +229,16 @@ static void __call_usermodehelper(struct work_struct *work)
266 229
267 switch (wait) { 230 switch (wait) {
268 case UMH_NO_WAIT: 231 case UMH_NO_WAIT:
232 call_usermodehelper_freeinfo(sub_info);
269 break; 233 break;
270 234
271 case UMH_WAIT_PROC: 235 case UMH_WAIT_PROC:
272 if (pid > 0) 236 if (pid > 0)
273 break; 237 break;
274 sub_info->retval = pid;
275 /* FALLTHROUGH */ 238 /* FALLTHROUGH */
276
277 case UMH_WAIT_EXEC: 239 case UMH_WAIT_EXEC:
240 if (pid < 0)
241 sub_info->retval = pid;
278 complete(sub_info->complete); 242 complete(sub_info->complete);
279 } 243 }
280} 244}
@@ -376,80 +340,37 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
376 sub_info->path = path; 340 sub_info->path = path;
377 sub_info->argv = argv; 341 sub_info->argv = argv;
378 sub_info->envp = envp; 342 sub_info->envp = envp;
379 sub_info->cred = prepare_usermodehelper_creds();
380 if (!sub_info->cred) {
381 kfree(sub_info);
382 return NULL;
383 }
384
385 out: 343 out:
386 return sub_info; 344 return sub_info;
387} 345}
388EXPORT_SYMBOL(call_usermodehelper_setup); 346EXPORT_SYMBOL(call_usermodehelper_setup);
389 347
390/** 348/**
391 * call_usermodehelper_setkeys - set the session keys for usermode helper 349 * call_usermodehelper_setfns - set a cleanup/init function
392 * @info: a subprocess_info returned by call_usermodehelper_setup
393 * @session_keyring: the session keyring for the process
394 */
395void call_usermodehelper_setkeys(struct subprocess_info *info,
396 struct key *session_keyring)
397{
398#ifdef CONFIG_KEYS
399 struct thread_group_cred *tgcred = info->cred->tgcred;
400 key_put(tgcred->session_keyring);
401 tgcred->session_keyring = key_get(session_keyring);
402#else
403 BUG();
404#endif
405}
406EXPORT_SYMBOL(call_usermodehelper_setkeys);
407
408/**
409 * call_usermodehelper_setcleanup - set a cleanup function
410 * @info: a subprocess_info returned by call_usermodehelper_setup 350 * @info: a subprocess_info returned by call_usermodehelper_setup
411 * @cleanup: a cleanup function 351 * @cleanup: a cleanup function
352 * @init: an init function
353 * @data: arbitrary context sensitive data
412 * 354 *
413 * The cleanup function is just befor ethe subprocess_info is about to 355 * The init function is used to customize the helper process prior to
356 * exec. A non-zero return code causes the process to error out, exit,
357 * and return the failure to the calling process
358 *
359 * The cleanup function is just before the subprocess_info is about to
414 * be freed. This can be used for freeing the argv and envp. The 360 * be freed. This can be used for freeing the argv and envp. The
415 * Function must be runnable in either a process context or the 361 * Function must be runnable in either a process context or the
416 * context in which call_usermodehelper_exec is called. 362 * context in which call_usermodehelper_exec is called.
417 */ 363 */
418void call_usermodehelper_setcleanup(struct subprocess_info *info, 364void call_usermodehelper_setfns(struct subprocess_info *info,
419 void (*cleanup)(char **argv, char **envp)) 365 int (*init)(struct subprocess_info *info),
366 void (*cleanup)(struct subprocess_info *info),
367 void *data)
420{ 368{
421 info->cleanup = cleanup; 369 info->cleanup = cleanup;
370 info->init = init;
371 info->data = data;
422} 372}
423EXPORT_SYMBOL(call_usermodehelper_setcleanup); 373EXPORT_SYMBOL(call_usermodehelper_setfns);
424
425/**
426 * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
427 * @sub_info: a subprocess_info returned by call_usermodehelper_setup
428 * @filp: set to the write-end of a pipe
429 *
430 * This constructs a pipe, and sets the read end to be the stdin of the
431 * subprocess, and returns the write-end in *@filp.
432 */
433int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
434 struct file **filp)
435{
436 struct file *f;
437
438 f = create_write_pipe(0);
439 if (IS_ERR(f))
440 return PTR_ERR(f);
441 *filp = f;
442
443 f = create_read_pipe(f, 0);
444 if (IS_ERR(f)) {
445 free_write_pipe(*filp);
446 return PTR_ERR(f);
447 }
448 sub_info->stdin = f;
449
450 return 0;
451}
452EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
453 374
454/** 375/**
455 * call_usermodehelper_exec - start a usermode application 376 * call_usermodehelper_exec - start a usermode application
@@ -469,9 +390,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
469 DECLARE_COMPLETION_ONSTACK(done); 390 DECLARE_COMPLETION_ONSTACK(done);
470 int retval = 0; 391 int retval = 0;
471 392
472 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
473 validate_creds(sub_info->cred);
474
475 helper_lock(); 393 helper_lock();
476 if (sub_info->path[0] == '\0') 394 if (sub_info->path[0] == '\0')
477 goto out; 395 goto out;
@@ -498,41 +416,6 @@ unlock:
498} 416}
499EXPORT_SYMBOL(call_usermodehelper_exec); 417EXPORT_SYMBOL(call_usermodehelper_exec);
500 418
501/**
502 * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
503 * @path: path to usermode executable
504 * @argv: arg vector for process
505 * @envp: environment for process
506 * @filp: set to the write-end of a pipe
507 *
508 * This is a simple wrapper which executes a usermode-helper function
509 * with a pipe as stdin. It is implemented entirely in terms of
510 * lower-level call_usermodehelper_* functions.
511 */
512int call_usermodehelper_pipe(char *path, char **argv, char **envp,
513 struct file **filp)
514{
515 struct subprocess_info *sub_info;
516 int ret;
517
518 sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
519 if (sub_info == NULL)
520 return -ENOMEM;
521
522 ret = call_usermodehelper_stdinpipe(sub_info, filp);
523 if (ret < 0) {
524 call_usermodehelper_freeinfo(sub_info);
525 return ret;
526 }
527
528 ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
529 if (ret < 0) /* Failed to execute helper, close pipe */
530 filp_close(*filp, NULL);
531
532 return ret;
533}
534EXPORT_SYMBOL(call_usermodehelper_pipe);
535
536void __init usermodehelper_init(void) 419void __init usermodehelper_init(void)
537{ 420{
538 khelper_wq = create_singlethread_workqueue("khelper"); 421 khelper_wq = create_singlethread_workqueue("khelper");
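
The net effect of this kmod.c rework: per-call customisation (credentials, stdin pipes) no longer has dedicated setup helpers; instead a single init() callback runs in the helper task just before kernel_execve(), and cleanup()/data travel inside the subprocess_info. A hedged sketch of a caller of the new API, using only the signatures visible in this diff (call_usermodehelper_setup/_setfns/_exec and UMH_WAIT_EXEC); the helper path, arguments and callback bodies are made up for illustration:

#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>

static int my_helper_init(struct subprocess_info *info)
{
	/* Runs in the helper task right before exec; a non-zero return
	 * aborts the exec and is reported back through retval. */
	pr_info("about to exec %s\n", info->path);
	return 0;
}

static void my_helper_cleanup(struct subprocess_info *info)
{
	/* Called just before the subprocess_info is freed. */
	kfree(info->data);
}

static int run_my_helper(void)
{
	char *argv[] = { "/sbin/my-helper", "--oneshot", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
	struct subprocess_info *info;
	void *ctx;

	ctx = kzalloc(16, GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL);
	if (!info) {
		kfree(ctx);
		return -ENOMEM;
	}

	call_usermodehelper_setfns(info, my_helper_init, my_helper_cleanup, ctx);

	/* UMH_WAIT_PROC would instead wait for the helper to exit. */
	return call_usermodehelper_exec(info, UMH_WAIT_EXEC);
}

__request_module() in the first hunk of this file shows the shorthand for the same three steps: the new call_usermodehelper_fns() wrapper takes the init/cleanup/data triple directly.
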
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b7df302a0204..282035f3ae96 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -42,8 +42,11 @@
42#include <linux/freezer.h> 42#include <linux/freezer.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/sysctl.h>
45#include <linux/kdebug.h> 46#include <linux/kdebug.h>
46#include <linux/memory.h> 47#include <linux/memory.h>
48#include <linux/ftrace.h>
49#include <linux/cpu.h>
47 50
48#include <asm-generic/sections.h> 51#include <asm-generic/sections.h>
49#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
@@ -93,6 +96,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
93 {"native_get_debugreg",}, 96 {"native_get_debugreg",},
94 {"irq_entries_start",}, 97 {"irq_entries_start",},
95 {"common_interrupt",}, 98 {"common_interrupt",},
99 {"mcount",}, /* mcount can be called from everywhere */
96 {NULL} /* Terminator */ 100 {NULL} /* Terminator */
97}; 101};
98 102
@@ -103,81 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
103 * stepping on the instruction on a vmalloced/kmalloced/data page 107 * stepping on the instruction on a vmalloced/kmalloced/data page
104 * is a recipe for disaster 108 * is a recipe for disaster
105 */ 109 */
106#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
107
108struct kprobe_insn_page { 110struct kprobe_insn_page {
109 struct list_head list; 111 struct list_head list;
110 kprobe_opcode_t *insns; /* Page of instruction slots */ 112 kprobe_opcode_t *insns; /* Page of instruction slots */
111 char slot_used[INSNS_PER_PAGE];
112 int nused; 113 int nused;
113 int ngarbage; 114 int ngarbage;
115 char slot_used[];
116};
117
118#define KPROBE_INSN_PAGE_SIZE(slots) \
119 (offsetof(struct kprobe_insn_page, slot_used) + \
120 (sizeof(char) * (slots)))
121
122struct kprobe_insn_cache {
123 struct list_head pages; /* list of kprobe_insn_page */
124 size_t insn_size; /* size of instruction slot */
125 int nr_garbage;
114}; 126};
115 127
128static int slots_per_page(struct kprobe_insn_cache *c)
129{
130 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
131}
132
116enum kprobe_slot_state { 133enum kprobe_slot_state {
117 SLOT_CLEAN = 0, 134 SLOT_CLEAN = 0,
118 SLOT_DIRTY = 1, 135 SLOT_DIRTY = 1,
119 SLOT_USED = 2, 136 SLOT_USED = 2,
120}; 137};
121 138
122static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 139static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */
123static LIST_HEAD(kprobe_insn_pages); 140static struct kprobe_insn_cache kprobe_insn_slots = {
124static int kprobe_garbage_slots; 141 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
125static int collect_garbage_slots(void); 142 .insn_size = MAX_INSN_SIZE,
126 143 .nr_garbage = 0,
127static int __kprobes check_safety(void) 144};
128{ 145static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
129 int ret = 0;
130#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
131 ret = freeze_processes();
132 if (ret == 0) {
133 struct task_struct *p, *q;
134 do_each_thread(p, q) {
135 if (p != current && p->state == TASK_RUNNING &&
136 p->pid != 0) {
137 printk("Check failed: %s is running\n",p->comm);
138 ret = -1;
139 goto loop_end;
140 }
141 } while_each_thread(p, q);
142 }
143loop_end:
144 thaw_processes();
145#else
146 synchronize_sched();
147#endif
148 return ret;
149}
150 146
151/** 147/**
152 * __get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
153 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
154 */ 150 */
155static kprobe_opcode_t __kprobes *__get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
156{ 152{
157 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
158 154
159 retry: 155 retry:
160 list_for_each_entry(kip, &kprobe_insn_pages, list) { 156 list_for_each_entry(kip, &c->pages, list) {
161 if (kip->nused < INSNS_PER_PAGE) { 157 if (kip->nused < slots_per_page(c)) {
162 int i; 158 int i;
163 for (i = 0; i < INSNS_PER_PAGE; i++) { 159 for (i = 0; i < slots_per_page(c); i++) {
164 if (kip->slot_used[i] == SLOT_CLEAN) { 160 if (kip->slot_used[i] == SLOT_CLEAN) {
165 kip->slot_used[i] = SLOT_USED; 161 kip->slot_used[i] = SLOT_USED;
166 kip->nused++; 162 kip->nused++;
167 return kip->insns + (i * MAX_INSN_SIZE); 163 return kip->insns + (i * c->insn_size);
168 } 164 }
169 } 165 }
170 /* Surprise! No unused slots. Fix kip->nused. */ 166 /* kip->nused is broken. Fix it. */
171 kip->nused = INSNS_PER_PAGE; 167 kip->nused = slots_per_page(c);
168 WARN_ON(1);
172 } 169 }
173 } 170 }
174 171
175 /* If there are any garbage slots, collect it and try again. */ 172 /* If there are any garbage slots, collect it and try again. */
176 if (kprobe_garbage_slots && collect_garbage_slots() == 0) { 173 if (c->nr_garbage && collect_garbage_slots(c) == 0)
177 goto retry; 174 goto retry;
178 } 175
179 /* All out of space. Need to allocate a new page. Use slot 0. */ 176 /* All out of space. Need to allocate a new page. */
180 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 177 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
181 if (!kip) 178 if (!kip)
182 return NULL; 179 return NULL;
183 180
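
The page header now ends in a flexible array member sized per cache, so a single allocation covers the header plus one state byte per slot. A rough mirror of that sizing pattern (the struct and macro names here are illustrative, not the kernel's):

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/stddef.h>

struct example_insn_page {
	struct list_head list;
	void *insns;		/* backing page of instruction slots */
	int nused;
	int ngarbage;
	char slot_used[];	/* one state byte per slot, sized at alloc time */
};

#define EXAMPLE_PAGE_SIZE(slots) \
	(offsetof(struct example_insn_page, slot_used) + sizeof(char) * (slots))

static struct example_insn_page *alloc_example_page(int slots)
{
	/* header and slot-state array come from one allocation */
	return kmalloc(EXAMPLE_PAGE_SIZE(slots), GFP_KERNEL);
}
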
@@ -192,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
192 return NULL; 189 return NULL;
193 } 190 }
194 INIT_LIST_HEAD(&kip->list); 191 INIT_LIST_HEAD(&kip->list);
195 list_add(&kip->list, &kprobe_insn_pages); 192 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
196 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
197 kip->slot_used[0] = SLOT_USED; 193 kip->slot_used[0] = SLOT_USED;
198 kip->nused = 1; 194 kip->nused = 1;
199 kip->ngarbage = 0; 195 kip->ngarbage = 0;
196 list_add(&kip->list, &c->pages);
200 return kip->insns; 197 return kip->insns;
201} 198}
202 199
200
203kprobe_opcode_t __kprobes *get_insn_slot(void) 201kprobe_opcode_t __kprobes *get_insn_slot(void)
204{ 202{
205 kprobe_opcode_t *ret; 203 kprobe_opcode_t *ret = NULL;
204
206 mutex_lock(&kprobe_insn_mutex); 205 mutex_lock(&kprobe_insn_mutex);
207 ret = __get_insn_slot(); 206 ret = __get_insn_slot(&kprobe_insn_slots);
208 mutex_unlock(&kprobe_insn_mutex); 207 mutex_unlock(&kprobe_insn_mutex);
208
209 return ret; 209 return ret;
210} 210}
211 211
@@ -221,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
221 * so as not to have to set it up again the 221 * so as not to have to set it up again the
222 * next time somebody inserts a probe. 222 * next time somebody inserts a probe.
223 */ 223 */
224 if (!list_is_singular(&kprobe_insn_pages)) { 224 if (!list_is_singular(&kip->list)) {
225 list_del(&kip->list); 225 list_del(&kip->list);
226 module_free(NULL, kip->insns); 226 module_free(NULL, kip->insns);
227 kfree(kip); 227 kfree(kip);
@@ -231,52 +231,85 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
231 return 0; 231 return 0;
232} 232}
233 233
234static int __kprobes collect_garbage_slots(void) 234static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
235{ 235{
236 struct kprobe_insn_page *kip, *next; 236 struct kprobe_insn_page *kip, *next;
237 237
238 /* Ensure no-one is preepmted on the garbages */ 238 /* Ensure no-one is interrupted on the garbages */
239 if (check_safety()) 239 synchronize_sched();
240 return -EAGAIN;
241 240
242 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { 241 list_for_each_entry_safe(kip, next, &c->pages, list) {
243 int i; 242 int i;
244 if (kip->ngarbage == 0) 243 if (kip->ngarbage == 0)
245 continue; 244 continue;
246 kip->ngarbage = 0; /* we will collect all garbages */ 245 kip->ngarbage = 0; /* we will collect all garbages */
247 for (i = 0; i < INSNS_PER_PAGE; i++) { 246 for (i = 0; i < slots_per_page(c); i++) {
248 if (kip->slot_used[i] == SLOT_DIRTY && 247 if (kip->slot_used[i] == SLOT_DIRTY &&
249 collect_one_slot(kip, i)) 248 collect_one_slot(kip, i))
250 break; 249 break;
251 } 250 }
252 } 251 }
253 kprobe_garbage_slots = 0; 252 c->nr_garbage = 0;
254 return 0; 253 return 0;
255} 254}
256 255
257void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 256static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
257 kprobe_opcode_t *slot, int dirty)
258{ 258{
259 struct kprobe_insn_page *kip; 259 struct kprobe_insn_page *kip;
260 260
261 mutex_lock(&kprobe_insn_mutex); 261 list_for_each_entry(kip, &c->pages, list) {
262 list_for_each_entry(kip, &kprobe_insn_pages, list) { 262 long idx = ((long)slot - (long)kip->insns) /
263 if (kip->insns <= slot && 263 (c->insn_size * sizeof(kprobe_opcode_t));
264 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 264 if (idx >= 0 && idx < slots_per_page(c)) {
265 int i = (slot - kip->insns) / MAX_INSN_SIZE; 265 WARN_ON(kip->slot_used[idx] != SLOT_USED);
266 if (dirty) { 266 if (dirty) {
267 kip->slot_used[i] = SLOT_DIRTY; 267 kip->slot_used[idx] = SLOT_DIRTY;
268 kip->ngarbage++; 268 kip->ngarbage++;
269 if (++c->nr_garbage > slots_per_page(c))
270 collect_garbage_slots(c);
269 } else 271 } else
270 collect_one_slot(kip, i); 272 collect_one_slot(kip, idx);
271 break; 273 return;
272 } 274 }
273 } 275 }
276 /* Could not free this slot. */
277 WARN_ON(1);
278}
274 279
275 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 280void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
276 collect_garbage_slots(); 281{
277 282 mutex_lock(&kprobe_insn_mutex);
283 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
278 mutex_unlock(&kprobe_insn_mutex); 284 mutex_unlock(&kprobe_insn_mutex);
279} 285}
286#ifdef CONFIG_OPTPROBES
287/* For optimized_kprobe buffer */
288static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */
289static struct kprobe_insn_cache kprobe_optinsn_slots = {
290 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
291 /* .insn_size is initialized later */
292 .nr_garbage = 0,
293};
294/* Get a slot for optimized_kprobe buffer */
295kprobe_opcode_t __kprobes *get_optinsn_slot(void)
296{
297 kprobe_opcode_t *ret = NULL;
298
299 mutex_lock(&kprobe_optinsn_mutex);
300 ret = __get_insn_slot(&kprobe_optinsn_slots);
301 mutex_unlock(&kprobe_optinsn_mutex);
302
303 return ret;
304}
305
306void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
307{
308 mutex_lock(&kprobe_optinsn_mutex);
309 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
310 mutex_unlock(&kprobe_optinsn_mutex);
311}
312#endif
280#endif 313#endif
281 314
282/* We have preemption disabled.. so it is safe to use __ versions */ 315/* We have preemption disabled.. so it is safe to use __ versions */
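
For context, this is roughly how arch code consumes the slot caches. The ->ainsn.insn field matches x86 but is an assumption here, and the function names are illustrative:

#include <linux/errno.h>
#include <linux/kprobes.h>

static int example_arch_prepare(struct kprobe *p)
{
	p->ainsn.insn = get_insn_slot();
	if (!p->ainsn.insn)
		return -ENOMEM;
	/* the real arch code copies and fixes up the probed instruction here */
	return 0;
}

static void example_arch_remove(struct kprobe *p)
{
	if (p->ainsn.insn) {
		/* dirty == 0: reclaim the slot immediately instead of deferring */
		free_insn_slot(p->ainsn.insn, 0);
		p->ainsn.insn = NULL;
	}
}
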
@@ -307,23 +340,401 @@ struct kprobe __kprobes *get_kprobe(void *addr)
307 if (p->addr == addr) 340 if (p->addr == addr)
308 return p; 341 return p;
309 } 342 }
343
310 return NULL; 344 return NULL;
311} 345}
312 346
347static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
348
349/* Return true if the kprobe is an aggregator */
350static inline int kprobe_aggrprobe(struct kprobe *p)
351{
352 return p->pre_handler == aggr_pre_handler;
353}
354
355/*
356 * Keep all fields in the kprobe consistent
357 */
358static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
359{
360 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
361 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
362}
363
364#ifdef CONFIG_OPTPROBES
365/* NOTE: change this value only with kprobe_mutex held */
366static bool kprobes_allow_optimization;
367
368/*
369 * Call all pre_handlers on the list, ignoring their return values.
370 * This must be called from the arch-dependent optimized caller.
371 */
372void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
373{
374 struct kprobe *kp;
375
376 list_for_each_entry_rcu(kp, &p->list, list) {
377 if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
378 set_kprobe_instance(kp);
379 kp->pre_handler(kp, regs);
380 }
381 reset_kprobe_instance();
382 }
383}
384
385/* Return true(!0) if the kprobe is ready for optimization. */
386static inline int kprobe_optready(struct kprobe *p)
387{
388 struct optimized_kprobe *op;
389
390 if (kprobe_aggrprobe(p)) {
391 op = container_of(p, struct optimized_kprobe, kp);
392 return arch_prepared_optinsn(&op->optinsn);
393 }
394
395 return 0;
396}
397
398/*
399 * Return an optimized kprobe whose optimizing code replaces
400 * instructions including addr (exclude breakpoint).
401 */
402struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
403{
404 int i;
405 struct kprobe *p = NULL;
406 struct optimized_kprobe *op;
407
408 /* Don't check i == 0, since that is a breakpoint case. */
409 for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
410 p = get_kprobe((void *)(addr - i));
411
412 if (p && kprobe_optready(p)) {
413 op = container_of(p, struct optimized_kprobe, kp);
414 if (arch_within_optimized_kprobe(op, addr))
415 return p;
416 }
417
418 return NULL;
419}
420
421/* Optimization staging list, protected by kprobe_mutex */
422static LIST_HEAD(optimizing_list);
423
424static void kprobe_optimizer(struct work_struct *work);
425static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
426#define OPTIMIZE_DELAY 5
427
428/* Kprobe jump optimizer */
429static __kprobes void kprobe_optimizer(struct work_struct *work)
430{
431 struct optimized_kprobe *op, *tmp;
432
433 /* Lock modules while optimizing kprobes */
434 mutex_lock(&module_mutex);
435 mutex_lock(&kprobe_mutex);
436 if (kprobes_all_disarmed || !kprobes_allow_optimization)
437 goto end;
438
439 /*
440 * Wait for a quiescence period to ensure all running interrupts
441 * are done. Because an optprobe may modify multiple instructions,
442 * there is a chance that the Nth instruction is interrupted. In that
443 * case, a running interrupt can return into the 2nd-Nth byte of the
444 * jump instruction. This wait avoids that.
445 */
446 synchronize_sched();
447
448 /*
449 * The optimization/unoptimization path reads online_cpus via
450 * stop_machine(), while cpu-hotplug modifies online_cpus.
451 * At the same time, text_mutex is held both by cpu-hotplug and here.
452 * That combination can deadlock: cpu-hotplug tries to take
453 * text_mutex, but stop_machine() cannot run because online_cpus
454 * has changed.
455 * To avoid this deadlock, call get_online_cpus() so that
456 * cpu-hotplug cannot run while text_mutex is held.
457 */
458 get_online_cpus();
459 mutex_lock(&text_mutex);
460 list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
461 WARN_ON(kprobe_disabled(&op->kp));
462 if (arch_optimize_kprobe(op) < 0)
463 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
464 list_del_init(&op->list);
465 }
466 mutex_unlock(&text_mutex);
467 put_online_cpus();
468end:
469 mutex_unlock(&kprobe_mutex);
470 mutex_unlock(&module_mutex);
471}
472
473/* Optimize kprobe if p is ready to be optimized */
474static __kprobes void optimize_kprobe(struct kprobe *p)
475{
476 struct optimized_kprobe *op;
477
478 /* Check if the kprobe is disabled or not ready for optimization. */
479 if (!kprobe_optready(p) || !kprobes_allow_optimization ||
480 (kprobe_disabled(p) || kprobes_all_disarmed))
481 return;
482
483 /* Both of break_handler and post_handler are not supported. */
484 if (p->break_handler || p->post_handler)
485 return;
486
487 op = container_of(p, struct optimized_kprobe, kp);
488
489 /* Check there is no other kprobes at the optimized instructions */
490 if (arch_check_optimized_kprobe(op) < 0)
491 return;
492
493 /* Check if it is already optimized. */
494 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
495 return;
496
497 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
498 list_add(&op->list, &optimizing_list);
499 if (!delayed_work_pending(&optimizing_work))
500 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
501}
502
503/* Unoptimize a kprobe if p is optimized */
504static __kprobes void unoptimize_kprobe(struct kprobe *p)
505{
506 struct optimized_kprobe *op;
507
508 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
509 op = container_of(p, struct optimized_kprobe, kp);
510 if (!list_empty(&op->list))
511 /* Dequeue from the optimization queue */
512 list_del_init(&op->list);
513 else
514 /* Replace jump with break */
515 arch_unoptimize_kprobe(op);
516 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
517 }
518}
519
520/* Remove optimized instructions */
521static void __kprobes kill_optimized_kprobe(struct kprobe *p)
522{
523 struct optimized_kprobe *op;
524
525 op = container_of(p, struct optimized_kprobe, kp);
526 if (!list_empty(&op->list)) {
527 /* Dequeue from the optimization queue */
528 list_del_init(&op->list);
529 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
530 }
531 /* Don't unoptimize, because the target code will be freed. */
532 arch_remove_optimized_kprobe(op);
533}
534
535/* Try to prepare optimized instructions */
536static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
537{
538 struct optimized_kprobe *op;
539
540 op = container_of(p, struct optimized_kprobe, kp);
541 arch_prepare_optimized_kprobe(op);
542}
543
544/* Free optimized instructions and optimized_kprobe */
545static __kprobes void free_aggr_kprobe(struct kprobe *p)
546{
547 struct optimized_kprobe *op;
548
549 op = container_of(p, struct optimized_kprobe, kp);
550 arch_remove_optimized_kprobe(op);
551 kfree(op);
552}
553
554/* Allocate new optimized_kprobe and try to prepare optimized instructions */
555static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
556{
557 struct optimized_kprobe *op;
558
559 op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
560 if (!op)
561 return NULL;
562
563 INIT_LIST_HEAD(&op->list);
564 op->kp.addr = p->addr;
565 arch_prepare_optimized_kprobe(op);
566
567 return &op->kp;
568}
569
570static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
571
572/*
573 * Prepare an optimized_kprobe and optimize it
574 * NOTE: p must be a normal registered kprobe
575 */
576static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
577{
578 struct kprobe *ap;
579 struct optimized_kprobe *op;
580
581 ap = alloc_aggr_kprobe(p);
582 if (!ap)
583 return;
584
585 op = container_of(ap, struct optimized_kprobe, kp);
586 if (!arch_prepared_optinsn(&op->optinsn)) {
587 /* If failed to setup optimizing, fallback to kprobe */
588 free_aggr_kprobe(ap);
589 return;
590 }
591
592 init_aggr_kprobe(ap, p);
593 optimize_kprobe(ap);
594}
595
596#ifdef CONFIG_SYSCTL
597static void __kprobes optimize_all_kprobes(void)
598{
599 struct hlist_head *head;
600 struct hlist_node *node;
601 struct kprobe *p;
602 unsigned int i;
603
604 /* If optimization is already allowed, just return */
605 if (kprobes_allow_optimization)
606 return;
607
608 kprobes_allow_optimization = true;
609 mutex_lock(&text_mutex);
610 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
611 head = &kprobe_table[i];
612 hlist_for_each_entry_rcu(p, node, head, hlist)
613 if (!kprobe_disabled(p))
614 optimize_kprobe(p);
615 }
616 mutex_unlock(&text_mutex);
617 printk(KERN_INFO "Kprobes globally optimized\n");
618}
619
620static void __kprobes unoptimize_all_kprobes(void)
621{
622 struct hlist_head *head;
623 struct hlist_node *node;
624 struct kprobe *p;
625 unsigned int i;
626
627 /* If optimization is already prohibited, just return */
628 if (!kprobes_allow_optimization)
629 return;
630
631 kprobes_allow_optimization = false;
632 printk(KERN_INFO "Kprobes globally unoptimized\n");
633 get_online_cpus(); /* For avoiding text_mutex deadlock */
634 mutex_lock(&text_mutex);
635 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
636 head = &kprobe_table[i];
637 hlist_for_each_entry_rcu(p, node, head, hlist) {
638 if (!kprobe_disabled(p))
639 unoptimize_kprobe(p);
640 }
641 }
642
643 mutex_unlock(&text_mutex);
644 put_online_cpus();
645 /* Allow all currently running kprobes to complete */
646 synchronize_sched();
647}
648
649int sysctl_kprobes_optimization;
650int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
651 void __user *buffer, size_t *length,
652 loff_t *ppos)
653{
654 int ret;
655
656 mutex_lock(&kprobe_mutex);
657 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
658 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
659
660 if (sysctl_kprobes_optimization)
661 optimize_all_kprobes();
662 else
663 unoptimize_all_kprobes();
664 mutex_unlock(&kprobe_mutex);
665
666 return ret;
667}
668#endif /* CONFIG_SYSCTL */
669
670static void __kprobes __arm_kprobe(struct kprobe *p)
671{
672 struct kprobe *old_p;
673
674 /* Check collision with other optimized kprobes */
675 old_p = get_optimized_kprobe((unsigned long)p->addr);
676 if (unlikely(old_p))
677 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
678
679 arch_arm_kprobe(p);
680 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
681}
682
683static void __kprobes __disarm_kprobe(struct kprobe *p)
684{
685 struct kprobe *old_p;
686
687 unoptimize_kprobe(p); /* Try to unoptimize */
688 arch_disarm_kprobe(p);
689
690 /* If another kprobe was blocked, optimize it. */
691 old_p = get_optimized_kprobe((unsigned long)p->addr);
692 if (unlikely(old_p))
693 optimize_kprobe(old_p);
694}
695
696#else /* !CONFIG_OPTPROBES */
697
698#define optimize_kprobe(p) do {} while (0)
699#define unoptimize_kprobe(p) do {} while (0)
700#define kill_optimized_kprobe(p) do {} while (0)
701#define prepare_optimized_kprobe(p) do {} while (0)
702#define try_to_optimize_kprobe(p) do {} while (0)
703#define __arm_kprobe(p) arch_arm_kprobe(p)
704#define __disarm_kprobe(p) arch_disarm_kprobe(p)
705
706static __kprobes void free_aggr_kprobe(struct kprobe *p)
707{
708 kfree(p);
709}
710
711static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
712{
713 return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
714}
715#endif /* CONFIG_OPTPROBES */
716
313/* Arm a kprobe with text_mutex */ 717/* Arm a kprobe with text_mutex */
314static void __kprobes arm_kprobe(struct kprobe *kp) 718static void __kprobes arm_kprobe(struct kprobe *kp)
315{ 719{
720 /*
721 * Here, since __arm_kprobe() doesn't use stop_machine(),
722 * this doesn't cause deadlock on text_mutex. So, we don't
723 * need get_online_cpus().
724 */
316 mutex_lock(&text_mutex); 725 mutex_lock(&text_mutex);
317 arch_arm_kprobe(kp); 726 __arm_kprobe(kp);
318 mutex_unlock(&text_mutex); 727 mutex_unlock(&text_mutex);
319} 728}
320 729
321/* Disarm a kprobe with text_mutex */ 730/* Disarm a kprobe with text_mutex */
322static void __kprobes disarm_kprobe(struct kprobe *kp) 731static void __kprobes disarm_kprobe(struct kprobe *kp)
323{ 732{
733 get_online_cpus(); /* For avoiding text_mutex deadlock */
324 mutex_lock(&text_mutex); 734 mutex_lock(&text_mutex);
325 arch_disarm_kprobe(kp); 735 __disarm_kprobe(kp);
326 mutex_unlock(&text_mutex); 736 mutex_unlock(&text_mutex);
737 put_online_cpus();
327} 738}
328 739
329/* 740/*
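
A probe only qualifies for this optimizer when it carries nothing but a pre_handler. A minimal sketch of such a probe, modeled on the in-tree kprobe sample; the "do_fork" target is just an illustrative choice:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("kprobe hit at %p\n", p->addr);
	return 0;
}

static struct kprobe example_kp = {
	.symbol_name	= "do_fork",
	.pre_handler	= example_pre,
	/* no post_handler/break_handler, so optimize_kprobe() may convert it */
};

static int __init example_init(void)
{
	return register_kprobe(&example_kp);
}

static void __exit example_exit(void)
{
	unregister_kprobe(&example_kp);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
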
@@ -392,7 +803,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
392void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 803void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
393{ 804{
394 struct kprobe *kp; 805 struct kprobe *kp;
395 if (p->pre_handler != aggr_pre_handler) { 806 if (!kprobe_aggrprobe(p)) {
396 p->nmissed++; 807 p->nmissed++;
397 } else { 808 } else {
398 list_for_each_entry_rcu(kp, &p->list, list) 809 list_for_each_entry_rcu(kp, &p->list, list)
@@ -516,21 +927,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
516} 927}
517 928
518/* 929/*
519 * Keep all fields in the kprobe consistent
520 */
521static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
522{
523 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
524 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
525}
526
527/*
528* Add the new probe to ap->list. Fail if this is the 930* Add the new probe to ap->list. Fail if this is the
529* second jprobe at the address - two jprobes can't coexist 931* second jprobe at the address - two jprobes can't coexist
530*/ 932*/
531static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) 933static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
532{ 934{
533 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 935 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
936
937 if (p->break_handler || p->post_handler)
938 unoptimize_kprobe(ap); /* Fall back to normal kprobe */
939
534 if (p->break_handler) { 940 if (p->break_handler) {
535 if (ap->break_handler) 941 if (ap->break_handler)
536 return -EEXIST; 942 return -EEXIST;
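
A jprobe is the common case that trips this fallback: register_jprobe() installs a break_handler on the aggregator, so the probe stays a plain breakpoint. A sketch modeled on the in-tree jprobe sample (the do_fork signature is assumed from that sample):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

static long example_jdo_fork(unsigned long clone_flags, unsigned long stack_start,
			     struct pt_regs *regs, unsigned long stack_size,
			     int __user *parent_tidptr, int __user *child_tidptr)
{
	pr_info("clone_flags = 0x%lx\n", clone_flags);
	jprobe_return();	/* mandatory: hand control back to the probed code */
	return 0;
}

static struct jprobe example_jp = {
	.entry	= example_jdo_fork,
	.kp	= {
		.symbol_name	= "do_fork",
	},
};

static int __init example_jp_init(void)
{
	return register_jprobe(&example_jp);	/* forces the unoptimized path */
}

static void __exit example_jp_exit(void)
{
	unregister_jprobe(&example_jp);
}

module_init(example_jp_init);
module_exit(example_jp_exit);
MODULE_LICENSE("GPL");
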
@@ -545,7 +951,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
545 ap->flags &= ~KPROBE_FLAG_DISABLED; 951 ap->flags &= ~KPROBE_FLAG_DISABLED;
546 if (!kprobes_all_disarmed) 952 if (!kprobes_all_disarmed)
547 /* Arm the breakpoint again. */ 953 /* Arm the breakpoint again. */
548 arm_kprobe(ap); 954 __arm_kprobe(ap);
549 } 955 }
550 return 0; 956 return 0;
551} 957}
@@ -554,12 +960,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
554 * Fill in the required fields of the "manager kprobe". Replace the 960 * Fill in the required fields of the "manager kprobe". Replace the
555 * earlier kprobe in the hlist with the manager kprobe 961 * earlier kprobe in the hlist with the manager kprobe
556 */ 962 */
557static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 963static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
558{ 964{
965 /* Copy p's insn slot to ap */
559 copy_kprobe(p, ap); 966 copy_kprobe(p, ap);
560 flush_insn_slot(ap); 967 flush_insn_slot(ap);
561 ap->addr = p->addr; 968 ap->addr = p->addr;
562 ap->flags = p->flags; 969 ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
563 ap->pre_handler = aggr_pre_handler; 970 ap->pre_handler = aggr_pre_handler;
564 ap->fault_handler = aggr_fault_handler; 971 ap->fault_handler = aggr_fault_handler;
565 /* We don't care the kprobe which has gone. */ 972 /* We don't care the kprobe which has gone. */
@@ -569,8 +976,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
569 ap->break_handler = aggr_break_handler; 976 ap->break_handler = aggr_break_handler;
570 977
571 INIT_LIST_HEAD(&ap->list); 978 INIT_LIST_HEAD(&ap->list);
572 list_add_rcu(&p->list, &ap->list); 979 INIT_HLIST_NODE(&ap->hlist);
573 980
981 list_add_rcu(&p->list, &ap->list);
574 hlist_replace_rcu(&p->hlist, &ap->hlist); 982 hlist_replace_rcu(&p->hlist, &ap->hlist);
575} 983}
576 984
@@ -584,12 +992,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
584 int ret = 0; 992 int ret = 0;
585 struct kprobe *ap = old_p; 993 struct kprobe *ap = old_p;
586 994
587 if (old_p->pre_handler != aggr_pre_handler) { 995 if (!kprobe_aggrprobe(old_p)) {
588 /* If old_p is not an aggr_probe, create new aggr_kprobe. */ 996 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
589 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 997 ap = alloc_aggr_kprobe(old_p);
590 if (!ap) 998 if (!ap)
591 return -ENOMEM; 999 return -ENOMEM;
592 add_aggr_kprobe(ap, old_p); 1000 init_aggr_kprobe(ap, old_p);
593 } 1001 }
594 1002
595 if (kprobe_gone(ap)) { 1003 if (kprobe_gone(ap)) {
@@ -608,6 +1016,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
608 */ 1016 */
609 return ret; 1017 return ret;
610 1018
1019 /* Prepare optimized instructions if possible. */
1020 prepare_optimized_kprobe(ap);
1021
611 /* 1022 /*
612 * Clear gone flag to prevent allocating new slot again, and 1023 * Clear gone flag to prevent allocating new slot again, and
613 * set disabled flag because it is not armed yet. 1024 * set disabled flag because it is not armed yet.
@@ -616,6 +1027,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
616 | KPROBE_FLAG_DISABLED; 1027 | KPROBE_FLAG_DISABLED;
617 } 1028 }
618 1029
1030 /* Copy ap's insn slot to p */
619 copy_kprobe(ap, p); 1031 copy_kprobe(ap, p);
620 return add_new_kprobe(ap, p); 1032 return add_new_kprobe(ap, p);
621} 1033}
@@ -728,7 +1140,8 @@ int __kprobes register_kprobe(struct kprobe *p)
728 1140
729 preempt_disable(); 1141 preempt_disable();
730 if (!kernel_text_address((unsigned long) p->addr) || 1142 if (!kernel_text_address((unsigned long) p->addr) ||
731 in_kprobes_functions((unsigned long) p->addr)) { 1143 in_kprobes_functions((unsigned long) p->addr) ||
1144 ftrace_text_reserved(p->addr, p->addr)) {
732 preempt_enable(); 1145 preempt_enable();
733 return -EINVAL; 1146 return -EINVAL;
734 } 1147 }
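
Callers are unaffected by the extra check: probing an address inside ftrace-reserved text now simply fails like any other rejected address. A small illustrative wrapper:

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

static int example_try_probe(struct kprobe *kp)
{
	int ret = register_kprobe(kp);

	if (ret == -EINVAL)
		pr_info("kprobes: address rejected (blacklisted or ftrace-reserved)\n");
	return ret;
}
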
@@ -765,27 +1178,34 @@ int __kprobes register_kprobe(struct kprobe *p)
765 p->nmissed = 0; 1178 p->nmissed = 0;
766 INIT_LIST_HEAD(&p->list); 1179 INIT_LIST_HEAD(&p->list);
767 mutex_lock(&kprobe_mutex); 1180 mutex_lock(&kprobe_mutex);
1181
1182 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1183 mutex_lock(&text_mutex);
1184
768 old_p = get_kprobe(p->addr); 1185 old_p = get_kprobe(p->addr);
769 if (old_p) { 1186 if (old_p) {
1187 /* Since this may unoptimize old_p, locking text_mutex. */
770 ret = register_aggr_kprobe(old_p, p); 1188 ret = register_aggr_kprobe(old_p, p);
771 goto out; 1189 goto out;
772 } 1190 }
773 1191
774 mutex_lock(&text_mutex);
775 ret = arch_prepare_kprobe(p); 1192 ret = arch_prepare_kprobe(p);
776 if (ret) 1193 if (ret)
777 goto out_unlock_text; 1194 goto out;
778 1195
779 INIT_HLIST_NODE(&p->hlist); 1196 INIT_HLIST_NODE(&p->hlist);
780 hlist_add_head_rcu(&p->hlist, 1197 hlist_add_head_rcu(&p->hlist,
781 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 1198 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
782 1199
783 if (!kprobes_all_disarmed && !kprobe_disabled(p)) 1200 if (!kprobes_all_disarmed && !kprobe_disabled(p))
784 arch_arm_kprobe(p); 1201 __arm_kprobe(p);
1202
1203 /* Try to optimize kprobe */
1204 try_to_optimize_kprobe(p);
785 1205
786out_unlock_text:
787 mutex_unlock(&text_mutex);
788out: 1206out:
1207 mutex_unlock(&text_mutex);
1208 put_online_cpus();
789 mutex_unlock(&kprobe_mutex); 1209 mutex_unlock(&kprobe_mutex);
790 1210
791 if (probed_mod) 1211 if (probed_mod)
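
The nesting order this hunk establishes, as a stand-alone sketch (the two mutexes below are stand-ins for kprobe_mutex and text_mutex, not the real symbols):

#include <linux/cpu.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(outer_lock);	/* stands in for kprobe_mutex */
static DEFINE_MUTEX(patch_lock);	/* stands in for text_mutex */

static void example_lock_order(void)
{
	mutex_lock(&outer_lock);
	get_online_cpus();		/* pin cpu-hotplug before taking the patch lock */
	mutex_lock(&patch_lock);

	/* code patching / optimization would happen here */

	mutex_unlock(&patch_lock);
	put_online_cpus();
	mutex_unlock(&outer_lock);
}
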
@@ -807,7 +1227,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
807 return -EINVAL; 1227 return -EINVAL;
808 1228
809 if (old_p == p || 1229 if (old_p == p ||
810 (old_p->pre_handler == aggr_pre_handler && 1230 (kprobe_aggrprobe(old_p) &&
811 list_is_singular(&old_p->list))) { 1231 list_is_singular(&old_p->list))) {
812 /* 1232 /*
813 * Only probe on the hash list. Disarm only if kprobes are 1233 * Only probe on the hash list. Disarm only if kprobes are
@@ -815,7 +1235,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
815 * already have been removed. We save on flushing icache. 1235 * already have been removed. We save on flushing icache.
816 */ 1236 */
817 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1237 if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
818 disarm_kprobe(p); 1238 disarm_kprobe(old_p);
819 hlist_del_rcu(&old_p->hlist); 1239 hlist_del_rcu(&old_p->hlist);
820 } else { 1240 } else {
821 if (p->break_handler && !kprobe_gone(p)) 1241 if (p->break_handler && !kprobe_gone(p))
@@ -831,8 +1251,13 @@ noclean:
831 list_del_rcu(&p->list); 1251 list_del_rcu(&p->list);
832 if (!kprobe_disabled(old_p)) { 1252 if (!kprobe_disabled(old_p)) {
833 try_to_disable_aggr_kprobe(old_p); 1253 try_to_disable_aggr_kprobe(old_p);
834 if (!kprobes_all_disarmed && kprobe_disabled(old_p)) 1254 if (!kprobes_all_disarmed) {
835 disarm_kprobe(old_p); 1255 if (kprobe_disabled(old_p))
1256 disarm_kprobe(old_p);
1257 else
1258 /* Try to optimize this probe again */
1259 optimize_kprobe(old_p);
1260 }
836 } 1261 }
837 } 1262 }
838 return 0; 1263 return 0;
@@ -849,7 +1274,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
849 old_p = list_entry(p->list.next, struct kprobe, list); 1274 old_p = list_entry(p->list.next, struct kprobe, list);
850 list_del(&p->list); 1275 list_del(&p->list);
851 arch_remove_kprobe(old_p); 1276 arch_remove_kprobe(old_p);
852 kfree(old_p); 1277 free_aggr_kprobe(old_p);
853 } 1278 }
854} 1279}
855 1280
@@ -1145,7 +1570,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1145 struct kprobe *kp; 1570 struct kprobe *kp;
1146 1571
1147 p->flags |= KPROBE_FLAG_GONE; 1572 p->flags |= KPROBE_FLAG_GONE;
1148 if (p->pre_handler == aggr_pre_handler) { 1573 if (kprobe_aggrprobe(p)) {
1149 /* 1574 /*
1150 * If this is an aggr_kprobe, we have to list all the 1575 * If this is an aggr_kprobe, we have to list all the
1151 * chained probes and mark them GONE. 1576 * chained probes and mark them GONE.
@@ -1154,6 +1579,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1154 kp->flags |= KPROBE_FLAG_GONE; 1579 kp->flags |= KPROBE_FLAG_GONE;
1155 p->post_handler = NULL; 1580 p->post_handler = NULL;
1156 p->break_handler = NULL; 1581 p->break_handler = NULL;
1582 kill_optimized_kprobe(p);
1157 } 1583 }
1158 /* 1584 /*
1159 * Here, we can remove insn_slot safely, because no thread calls 1585 * Here, we can remove insn_slot safely, because no thread calls
@@ -1162,6 +1588,72 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1162 arch_remove_kprobe(p); 1588 arch_remove_kprobe(p);
1163} 1589}
1164 1590
1591/* Disable one kprobe */
1592int __kprobes disable_kprobe(struct kprobe *kp)
1593{
1594 int ret = 0;
1595 struct kprobe *p;
1596
1597 mutex_lock(&kprobe_mutex);
1598
1599 /* Check whether specified probe is valid. */
1600 p = __get_valid_kprobe(kp);
1601 if (unlikely(p == NULL)) {
1602 ret = -EINVAL;
1603 goto out;
1604 }
1605
1606 /* If the probe is already disabled (or gone), just return */
1607 if (kprobe_disabled(kp))
1608 goto out;
1609
1610 kp->flags |= KPROBE_FLAG_DISABLED;
1611 if (p != kp)
1612 /* When kp != p, p is always enabled. */
1613 try_to_disable_aggr_kprobe(p);
1614
1615 if (!kprobes_all_disarmed && kprobe_disabled(p))
1616 disarm_kprobe(p);
1617out:
1618 mutex_unlock(&kprobe_mutex);
1619 return ret;
1620}
1621EXPORT_SYMBOL_GPL(disable_kprobe);
1622
1623/* Enable one kprobe */
1624int __kprobes enable_kprobe(struct kprobe *kp)
1625{
1626 int ret = 0;
1627 struct kprobe *p;
1628
1629 mutex_lock(&kprobe_mutex);
1630
1631 /* Check whether specified probe is valid. */
1632 p = __get_valid_kprobe(kp);
1633 if (unlikely(p == NULL)) {
1634 ret = -EINVAL;
1635 goto out;
1636 }
1637
1638 if (kprobe_gone(kp)) {
1639 /* This kprobe has gone; we cannot enable it. */
1640 ret = -EINVAL;
1641 goto out;
1642 }
1643
1644 if (p != kp)
1645 kp->flags &= ~KPROBE_FLAG_DISABLED;
1646
1647 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1648 p->flags &= ~KPROBE_FLAG_DISABLED;
1649 arm_kprobe(p);
1650 }
1651out:
1652 mutex_unlock(&kprobe_mutex);
1653 return ret;
1654}
1655EXPORT_SYMBOL_GPL(enable_kprobe);
1656
1165void __kprobes dump_kprobe(struct kprobe *kp) 1657void __kprobes dump_kprobe(struct kprobe *kp)
1166{ 1658{
1167 printk(KERN_WARNING "Dumping kprobe:\n"); 1659 printk(KERN_WARNING "Dumping kprobe:\n");
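
Moving disable_kprobe()/enable_kprobe() out of the CONFIG_DEBUG_FS block makes them usable from any kernel code. A minimal sketch of toggling an already-registered probe:

#include <linux/kprobes.h>
#include <linux/types.h>

/* kp must already have been registered with register_kprobe() */
static int example_set_probe_active(struct kprobe *kp, bool active)
{
	return active ? enable_kprobe(kp) : disable_kprobe(kp);
}
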
@@ -1263,6 +1755,15 @@ static int __init init_kprobes(void)
1263 } 1755 }
1264 } 1756 }
1265 1757
1758#if defined(CONFIG_OPTPROBES)
1759#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
1760 /* Init kprobe_optinsn_slots */
1761 kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
1762#endif
1763 /* By default, kprobes can be optimized */
1764 kprobes_allow_optimization = true;
1765#endif
1766
1266 /* By default, kprobes are armed */ 1767 /* By default, kprobes are armed */
1267 kprobes_all_disarmed = false; 1768 kprobes_all_disarmed = false;
1268 1769
@@ -1281,7 +1782,7 @@ static int __init init_kprobes(void)
1281 1782
1282#ifdef CONFIG_DEBUG_FS 1783#ifdef CONFIG_DEBUG_FS
1283static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 1784static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1284 const char *sym, int offset,char *modname) 1785 const char *sym, int offset, char *modname, struct kprobe *pp)
1285{ 1786{
1286 char *kprobe_type; 1787 char *kprobe_type;
1287 1788
@@ -1291,19 +1792,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1291 kprobe_type = "j"; 1792 kprobe_type = "j";
1292 else 1793 else
1293 kprobe_type = "k"; 1794 kprobe_type = "k";
1795
1294 if (sym) 1796 if (sym)
1295 seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", 1797 seq_printf(pi, "%p %s %s+0x%x %s ",
1296 p->addr, kprobe_type, sym, offset, 1798 p->addr, kprobe_type, sym, offset,
1297 (modname ? modname : " "), 1799 (modname ? modname : " "));
1298 (kprobe_gone(p) ? "[GONE]" : ""),
1299 ((kprobe_disabled(p) && !kprobe_gone(p)) ?
1300 "[DISABLED]" : ""));
1301 else 1800 else
1302 seq_printf(pi, "%p %s %p %s%s\n", 1801 seq_printf(pi, "%p %s %p ",
1303 p->addr, kprobe_type, p->addr, 1802 p->addr, kprobe_type, p->addr);
1304 (kprobe_gone(p) ? "[GONE]" : ""), 1803
1305 ((kprobe_disabled(p) && !kprobe_gone(p)) ? 1804 if (!pp)
1306 "[DISABLED]" : "")); 1805 pp = p;
1806 seq_printf(pi, "%s%s%s\n",
1807 (kprobe_gone(p) ? "[GONE]" : ""),
1808 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
1809 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
1307} 1810}
1308 1811
1309static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1812static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1339,11 +1842,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1339 hlist_for_each_entry_rcu(p, node, head, hlist) { 1842 hlist_for_each_entry_rcu(p, node, head, hlist) {
1340 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 1843 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
1341 &offset, &modname, namebuf); 1844 &offset, &modname, namebuf);
1342 if (p->pre_handler == aggr_pre_handler) { 1845 if (kprobe_aggrprobe(p)) {
1343 list_for_each_entry_rcu(kp, &p->list, list) 1846 list_for_each_entry_rcu(kp, &p->list, list)
1344 report_probe(pi, kp, sym, offset, modname); 1847 report_probe(pi, kp, sym, offset, modname, p);
1345 } else 1848 } else
1346 report_probe(pi, p, sym, offset, modname); 1849 report_probe(pi, p, sym, offset, modname, NULL);
1347 } 1850 }
1348 preempt_enable(); 1851 preempt_enable();
1349 return 0; 1852 return 0;
@@ -1368,71 +1871,6 @@ static const struct file_operations debugfs_kprobes_operations = {
1368 .release = seq_release, 1871 .release = seq_release,
1369}; 1872};
1370 1873
1371/* Disable one kprobe */
1372int __kprobes disable_kprobe(struct kprobe *kp)
1373{
1374 int ret = 0;
1375 struct kprobe *p;
1376
1377 mutex_lock(&kprobe_mutex);
1378
1379 /* Check whether specified probe is valid. */
1380 p = __get_valid_kprobe(kp);
1381 if (unlikely(p == NULL)) {
1382 ret = -EINVAL;
1383 goto out;
1384 }
1385
1386 /* If the probe is already disabled (or gone), just return */
1387 if (kprobe_disabled(kp))
1388 goto out;
1389
1390 kp->flags |= KPROBE_FLAG_DISABLED;
1391 if (p != kp)
1392 /* When kp != p, p is always enabled. */
1393 try_to_disable_aggr_kprobe(p);
1394
1395 if (!kprobes_all_disarmed && kprobe_disabled(p))
1396 disarm_kprobe(p);
1397out:
1398 mutex_unlock(&kprobe_mutex);
1399 return ret;
1400}
1401EXPORT_SYMBOL_GPL(disable_kprobe);
1402
1403/* Enable one kprobe */
1404int __kprobes enable_kprobe(struct kprobe *kp)
1405{
1406 int ret = 0;
1407 struct kprobe *p;
1408
1409 mutex_lock(&kprobe_mutex);
1410
1411 /* Check whether specified probe is valid. */
1412 p = __get_valid_kprobe(kp);
1413 if (unlikely(p == NULL)) {
1414 ret = -EINVAL;
1415 goto out;
1416 }
1417
1418 if (kprobe_gone(kp)) {
1419 /* This kprobe has gone, we couldn't enable it. */
1420 ret = -EINVAL;
1421 goto out;
1422 }
1423
1424 if (!kprobes_all_disarmed && kprobe_disabled(p))
1425 arm_kprobe(p);
1426
1427 p->flags &= ~KPROBE_FLAG_DISABLED;
1428 if (p != kp)
1429 kp->flags &= ~KPROBE_FLAG_DISABLED;
1430out:
1431 mutex_unlock(&kprobe_mutex);
1432 return ret;
1433}
1434EXPORT_SYMBOL_GPL(enable_kprobe);
1435
1436static void __kprobes arm_all_kprobes(void) 1874static void __kprobes arm_all_kprobes(void)
1437{ 1875{
1438 struct hlist_head *head; 1876 struct hlist_head *head;
@@ -1446,12 +1884,13 @@ static void __kprobes arm_all_kprobes(void)
1446 if (!kprobes_all_disarmed) 1884 if (!kprobes_all_disarmed)
1447 goto already_enabled; 1885 goto already_enabled;
1448 1886
1887 /* Arming kprobes doesn't optimize kprobe itself */
1449 mutex_lock(&text_mutex); 1888 mutex_lock(&text_mutex);
1450 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1889 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1451 head = &kprobe_table[i]; 1890 head = &kprobe_table[i];
1452 hlist_for_each_entry_rcu(p, node, head, hlist) 1891 hlist_for_each_entry_rcu(p, node, head, hlist)
1453 if (!kprobe_disabled(p)) 1892 if (!kprobe_disabled(p))
1454 arch_arm_kprobe(p); 1893 __arm_kprobe(p);
1455 } 1894 }
1456 mutex_unlock(&text_mutex); 1895 mutex_unlock(&text_mutex);
1457 1896
@@ -1478,16 +1917,23 @@ static void __kprobes disarm_all_kprobes(void)
1478 1917
1479 kprobes_all_disarmed = true; 1918 kprobes_all_disarmed = true;
1480 printk(KERN_INFO "Kprobes globally disabled\n"); 1919 printk(KERN_INFO "Kprobes globally disabled\n");
1920
1921 /*
1922 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1923 * because disarming may also unoptimize kprobes.
1924 */
1925 get_online_cpus();
1481 mutex_lock(&text_mutex); 1926 mutex_lock(&text_mutex);
1482 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1927 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1483 head = &kprobe_table[i]; 1928 head = &kprobe_table[i];
1484 hlist_for_each_entry_rcu(p, node, head, hlist) { 1929 hlist_for_each_entry_rcu(p, node, head, hlist) {
1485 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 1930 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1486 arch_disarm_kprobe(p); 1931 __disarm_kprobe(p);
1487 } 1932 }
1488 } 1933 }
1489 1934
1490 mutex_unlock(&text_mutex); 1935 mutex_unlock(&text_mutex);
1936 put_online_cpus();
1491 mutex_unlock(&kprobe_mutex); 1937 mutex_unlock(&kprobe_mutex);
1492 /* Allow all currently running kprobes to complete */ 1938 /* Allow all currently running kprobes to complete */
1493 synchronize_sched(); 1939 synchronize_sched();
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3feaf5a74514..0b624e791805 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
33} 33}
34KERNEL_ATTR_RO(uevent_seqnum); 34KERNEL_ATTR_RO(uevent_seqnum);
35 35
36/* uevent helper program, used during early boo */ 36/* uevent helper program, used during early boot */
37static ssize_t uevent_helper_show(struct kobject *kobj, 37static ssize_t uevent_helper_show(struct kobject *kobj,
38 struct kobj_attribute *attr, char *buf) 38 struct kobj_attribute *attr, char *buf)
39{ 39{
@@ -138,7 +138,8 @@ extern const void __start_notes __attribute__((weak));
138extern const void __stop_notes __attribute__((weak)); 138extern const void __stop_notes __attribute__((weak));
139#define notes_size (&__stop_notes - &__start_notes) 139#define notes_size (&__stop_notes - &__start_notes)
140 140
141static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, 141static ssize_t notes_read(struct file *filp, struct kobject *kobj,
142 struct bin_attribute *bin_attr,
142 char *buf, loff_t off, size_t count) 143 char *buf, loff_t off, size_t count)
143{ 144{
144 memcpy(buf, &__start_notes + off, count); 145 memcpy(buf, &__start_notes + off, count);
@@ -197,16 +198,8 @@ static int __init ksysfs_init(void)
197 goto group_exit; 198 goto group_exit;
198 } 199 }
199 200
200 /* create the /sys/kernel/uids/ directory */
201 error = uids_sysfs_init();
202 if (error)
203 goto notes_exit;
204
205 return 0; 201 return 0;
206 202
207notes_exit:
208 if (notes_size > 0)
209 sysfs_remove_bin_file(kernel_kobj, &notes_attr);
210group_exit: 203group_exit:
211 sysfs_remove_group(kernel_kobj, &kernel_attr_group); 204 sysfs_remove_group(kernel_kobj, &kernel_attr_group);
212kset_exit: 205kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fbb6222fe7e0..83911c780175 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
101 * 101 *
102 * Description: This helper function creates and names a kernel 102 * Description: This helper function creates and names a kernel
103 * thread. The thread will be stopped: use wake_up_process() to start 103 * thread. The thread will be stopped: use wake_up_process() to start
104 * it. See also kthread_run(), kthread_create_on_cpu(). 104 * it. See also kthread_run().
105 * 105 *
106 * When woken, the thread will run @threadfn() with @data as its 106 * When woken, the thread will run @threadfn() with @data as its
107 * argument. @threadfn() can either call do_exit() directly if it is a 107 * argument. @threadfn() can either call do_exit() directly if it is a
@@ -219,7 +219,7 @@ int kthreadd(void *unused)
219 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
220 ignore_signals(tsk); 220 ignore_signals(tsk);
221 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
222 set_mems_allowed(node_possible_map); 222 set_mems_allowed(node_states[N_HIGH_MEMORY]);
223 223
224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
225 225
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index ca07c5c0c914..877fb306d415 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -56,7 +56,6 @@
56#include <linux/module.h> 56#include <linux/module.h>
57#include <linux/sched.h> 57#include <linux/sched.h>
58#include <linux/list.h> 58#include <linux/list.h>
59#include <linux/slab.h>
60#include <linux/stacktrace.h> 59#include <linux/stacktrace.h>
61 60
62static DEFINE_SPINLOCK(latency_lock); 61static DEFINE_SPINLOCK(latency_lock);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 5feaddcdbe49..54286798c37b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -43,6 +43,7 @@
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h> 45#include <linux/bitops.h>
46#include <linux/gfp.h>
46 47
47#include <asm/sections.h> 48#include <asm/sections.h>
48 49
@@ -430,20 +431,7 @@ static struct stack_trace lockdep_init_trace = {
430/* 431/*
431 * Various lockdep statistics: 432 * Various lockdep statistics:
432 */ 433 */
433atomic_t chain_lookup_hits; 434DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
434atomic_t chain_lookup_misses;
435atomic_t hardirqs_on_events;
436atomic_t hardirqs_off_events;
437atomic_t redundant_hardirqs_on;
438atomic_t redundant_hardirqs_off;
439atomic_t softirqs_on_events;
440atomic_t softirqs_off_events;
441atomic_t redundant_softirqs_on;
442atomic_t redundant_softirqs_off;
443atomic_t nr_unused_locks;
444atomic_t nr_cyclic_checks;
445atomic_t nr_find_usage_forwards_checks;
446atomic_t nr_find_usage_backwards_checks;
447#endif 435#endif
448 436
449/* 437/*
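
The pattern the stats switch to, in a self-contained sketch (the struct and field names below are illustrative; the real ones live in lockdep_internals.h):

#include <linux/cpumask.h>
#include <linux/percpu.h>

struct example_stats {
	int	lookup_hits;
	int	lookup_misses;
};

static DEFINE_PER_CPU(struct example_stats, example_stats);

static inline void example_inc_hits(void)
{
	/* no atomic op: each CPU only ever touches its own copy */
	__this_cpu_inc(example_stats.lookup_hits);
}

static int example_sum_hits(void)
{
	int cpu, sum = 0;

	/* readers fold the per-cpu copies together when reporting */
	for_each_possible_cpu(cpu)
		sum += per_cpu(example_stats, cpu).lookup_hits;
	return sum;
}
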
@@ -582,9 +570,6 @@ static int static_obj(void *obj)
582 unsigned long start = (unsigned long) &_stext, 570 unsigned long start = (unsigned long) &_stext,
583 end = (unsigned long) &_end, 571 end = (unsigned long) &_end,
584 addr = (unsigned long) obj; 572 addr = (unsigned long) obj;
585#ifdef CONFIG_SMP
586 int i;
587#endif
588 573
589 /* 574 /*
590 * static variable? 575 * static variable?
@@ -595,24 +580,16 @@ static int static_obj(void *obj)
595 if (arch_is_kernel_data(addr)) 580 if (arch_is_kernel_data(addr))
596 return 1; 581 return 1;
597 582
598#ifdef CONFIG_SMP
599 /* 583 /*
600 * percpu var? 584 * in-kernel percpu var?
601 */ 585 */
602 for_each_possible_cpu(i) { 586 if (is_kernel_percpu_address(addr))
603 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); 587 return 1;
604 end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
605 + per_cpu_offset(i);
606
607 if ((addr >= start) && (addr < end))
608 return 1;
609 }
610#endif
611 588
612 /* 589 /*
613 * module var? 590 * module static or percpu var?
614 */ 591 */
615 return is_module_address(addr); 592 return is_module_address(addr) || is_module_percpu_address(addr);
616} 593}
617 594
618/* 595/*
@@ -758,7 +735,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
758 return NULL; 735 return NULL;
759 } 736 }
760 class = lock_classes + nr_lock_classes++; 737 class = lock_classes + nr_lock_classes++;
761 debug_atomic_inc(&nr_unused_locks); 738 debug_atomic_inc(nr_unused_locks);
762 class->key = key; 739 class->key = key;
763 class->name = lock->name; 740 class->name = lock->name;
764 class->subclass = subclass; 741 class->subclass = subclass;
@@ -828,7 +805,8 @@ static struct lock_list *alloc_list_entry(void)
828 * Add a new dependency to the head of the list: 805 * Add a new dependency to the head of the list:
829 */ 806 */
830static int add_lock_to_list(struct lock_class *class, struct lock_class *this, 807static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
831 struct list_head *head, unsigned long ip, int distance) 808 struct list_head *head, unsigned long ip,
809 int distance, struct stack_trace *trace)
832{ 810{
833 struct lock_list *entry; 811 struct lock_list *entry;
834 /* 812 /*
@@ -839,11 +817,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
839 if (!entry) 817 if (!entry)
840 return 0; 818 return 0;
841 819
842 if (!save_trace(&entry->trace))
843 return 0;
844
845 entry->class = this; 820 entry->class = this;
846 entry->distance = distance; 821 entry->distance = distance;
822 entry->trace = *trace;
847 /* 823 /*
848 * Since we never remove from the dependency list, the list can 824 * Since we never remove from the dependency list, the list can
849 * be walked lockless by other CPUs, it's only allocation 825 * be walked lockless by other CPUs, it's only allocation
@@ -1215,7 +1191,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
1215{ 1191{
1216 int result; 1192 int result;
1217 1193
1218 debug_atomic_inc(&nr_cyclic_checks); 1194 debug_atomic_inc(nr_cyclic_checks);
1219 1195
1220 result = __bfs_forwards(root, target, class_equal, target_entry); 1196 result = __bfs_forwards(root, target, class_equal, target_entry);
1221 1197
@@ -1252,7 +1228,7 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1252{ 1228{
1253 int result; 1229 int result;
1254 1230
1255 debug_atomic_inc(&nr_find_usage_forwards_checks); 1231 debug_atomic_inc(nr_find_usage_forwards_checks);
1256 1232
1257 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); 1233 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
1258 1234
@@ -1275,7 +1251,7 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
1275{ 1251{
1276 int result; 1252 int result;
1277 1253
1278 debug_atomic_inc(&nr_find_usage_backwards_checks); 1254 debug_atomic_inc(nr_find_usage_backwards_checks);
1279 1255
1280 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); 1256 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
1281 1257
@@ -1645,12 +1621,20 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1645 */ 1621 */
1646static int 1622static int
1647check_prev_add(struct task_struct *curr, struct held_lock *prev, 1623check_prev_add(struct task_struct *curr, struct held_lock *prev,
1648 struct held_lock *next, int distance) 1624 struct held_lock *next, int distance, int trylock_loop)
1649{ 1625{
1650 struct lock_list *entry; 1626 struct lock_list *entry;
1651 int ret; 1627 int ret;
1652 struct lock_list this; 1628 struct lock_list this;
1653 struct lock_list *uninitialized_var(target_entry); 1629 struct lock_list *uninitialized_var(target_entry);
1630 /*
1631 * Static variable, serialized by the graph_lock().
1632 *
1633 * We use this static variable to save the stack trace in case
1634 * we call into this function multiple times due to encountering
1635 * trylocks in the held lock stack.
1636 */
1637 static struct stack_trace trace;
1654 1638
1655 /* 1639 /*
1656 * Prove that the new <prev> -> <next> dependency would not 1640 * Prove that the new <prev> -> <next> dependency would not
@@ -1698,20 +1682,23 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1698 } 1682 }
1699 } 1683 }
1700 1684
1685 if (!trylock_loop && !save_trace(&trace))
1686 return 0;
1687
1701 /* 1688 /*
1702 * Ok, all validations passed, add the new lock 1689 * Ok, all validations passed, add the new lock
1703 * to the previous lock's dependency list: 1690 * to the previous lock's dependency list:
1704 */ 1691 */
1705 ret = add_lock_to_list(hlock_class(prev), hlock_class(next), 1692 ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
1706 &hlock_class(prev)->locks_after, 1693 &hlock_class(prev)->locks_after,
1707 next->acquire_ip, distance); 1694 next->acquire_ip, distance, &trace);
1708 1695
1709 if (!ret) 1696 if (!ret)
1710 return 0; 1697 return 0;
1711 1698
1712 ret = add_lock_to_list(hlock_class(next), hlock_class(prev), 1699 ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
1713 &hlock_class(next)->locks_before, 1700 &hlock_class(next)->locks_before,
1714 next->acquire_ip, distance); 1701 next->acquire_ip, distance, &trace);
1715 if (!ret) 1702 if (!ret)
1716 return 0; 1703 return 0;
1717 1704
@@ -1741,6 +1728,7 @@ static int
1741check_prevs_add(struct task_struct *curr, struct held_lock *next) 1728check_prevs_add(struct task_struct *curr, struct held_lock *next)
1742{ 1729{
1743 int depth = curr->lockdep_depth; 1730 int depth = curr->lockdep_depth;
1731 int trylock_loop = 0;
1744 struct held_lock *hlock; 1732 struct held_lock *hlock;
1745 1733
1746 /* 1734 /*
@@ -1766,7 +1754,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1766 * added: 1754 * added:
1767 */ 1755 */
1768 if (hlock->read != 2) { 1756 if (hlock->read != 2) {
1769 if (!check_prev_add(curr, hlock, next, distance)) 1757 if (!check_prev_add(curr, hlock, next,
1758 distance, trylock_loop))
1770 return 0; 1759 return 0;
1771 /* 1760 /*
1772 * Stop after the first non-trylock entry, 1761 * Stop after the first non-trylock entry,
@@ -1789,6 +1778,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1789 if (curr->held_locks[depth].irq_context != 1778 if (curr->held_locks[depth].irq_context !=
1790 curr->held_locks[depth-1].irq_context) 1779 curr->held_locks[depth-1].irq_context)
1791 break; 1780 break;
1781 trylock_loop = 1;
1792 } 1782 }
1793 return 1; 1783 return 1;
1794out_bug: 1784out_bug:
@@ -1835,7 +1825,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1835 list_for_each_entry(chain, hash_head, entry) { 1825 list_for_each_entry(chain, hash_head, entry) {
1836 if (chain->chain_key == chain_key) { 1826 if (chain->chain_key == chain_key) {
1837cache_hit: 1827cache_hit:
1838 debug_atomic_inc(&chain_lookup_hits); 1828 debug_atomic_inc(chain_lookup_hits);
1839 if (very_verbose(class)) 1829 if (very_verbose(class))
1840 printk("\nhash chain already cached, key: " 1830 printk("\nhash chain already cached, key: "
1841 "%016Lx tail class: [%p] %s\n", 1831 "%016Lx tail class: [%p] %s\n",
@@ -1900,7 +1890,7 @@ cache_hit:
1900 chain_hlocks[chain->base + j] = class - lock_classes; 1890 chain_hlocks[chain->base + j] = class - lock_classes;
1901 } 1891 }
1902 list_add_tail_rcu(&chain->entry, hash_head); 1892 list_add_tail_rcu(&chain->entry, hash_head);
1903 debug_atomic_inc(&chain_lookup_misses); 1893 debug_atomic_inc(chain_lookup_misses);
1904 inc_chains(); 1894 inc_chains();
1905 1895
1906 return 1; 1896 return 1;
@@ -2147,7 +2137,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2147 return ret; 2137 return ret;
2148 2138
2149 return print_irq_inversion_bug(curr, &root, target_entry, 2139 return print_irq_inversion_bug(curr, &root, target_entry,
2150 this, 1, irqclass); 2140 this, 0, irqclass);
2151} 2141}
2152 2142
2153void print_irqtrace_events(struct task_struct *curr) 2143void print_irqtrace_events(struct task_struct *curr)
@@ -2321,7 +2311,12 @@ void trace_hardirqs_on_caller(unsigned long ip)
2321 return; 2311 return;
2322 2312
2323 if (unlikely(curr->hardirqs_enabled)) { 2313 if (unlikely(curr->hardirqs_enabled)) {
2324 debug_atomic_inc(&redundant_hardirqs_on); 2314 /*
2315 * Neither irq nor preemption are disabled here
2316 * so this is racy by nature but losing one hit
2317 * in a stat is not a big deal.
2318 */
2319 __debug_atomic_inc(redundant_hardirqs_on);
2325 return; 2320 return;
2326 } 2321 }
2327 /* we'll do an OFF -> ON transition: */ 2322 /* we'll do an OFF -> ON transition: */
@@ -2348,7 +2343,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2348 2343
2349 curr->hardirq_enable_ip = ip; 2344 curr->hardirq_enable_ip = ip;
2350 curr->hardirq_enable_event = ++curr->irq_events; 2345 curr->hardirq_enable_event = ++curr->irq_events;
2351 debug_atomic_inc(&hardirqs_on_events); 2346 debug_atomic_inc(hardirqs_on_events);
2352} 2347}
2353EXPORT_SYMBOL(trace_hardirqs_on_caller); 2348EXPORT_SYMBOL(trace_hardirqs_on_caller);
2354 2349
@@ -2380,9 +2375,9 @@ void trace_hardirqs_off_caller(unsigned long ip)
2380 curr->hardirqs_enabled = 0; 2375 curr->hardirqs_enabled = 0;
2381 curr->hardirq_disable_ip = ip; 2376 curr->hardirq_disable_ip = ip;
2382 curr->hardirq_disable_event = ++curr->irq_events; 2377 curr->hardirq_disable_event = ++curr->irq_events;
2383 debug_atomic_inc(&hardirqs_off_events); 2378 debug_atomic_inc(hardirqs_off_events);
2384 } else 2379 } else
2385 debug_atomic_inc(&redundant_hardirqs_off); 2380 debug_atomic_inc(redundant_hardirqs_off);
2386} 2381}
2387EXPORT_SYMBOL(trace_hardirqs_off_caller); 2382EXPORT_SYMBOL(trace_hardirqs_off_caller);
2388 2383
@@ -2406,7 +2401,7 @@ void trace_softirqs_on(unsigned long ip)
2406 return; 2401 return;
2407 2402
2408 if (curr->softirqs_enabled) { 2403 if (curr->softirqs_enabled) {
2409 debug_atomic_inc(&redundant_softirqs_on); 2404 debug_atomic_inc(redundant_softirqs_on);
2410 return; 2405 return;
2411 } 2406 }
2412 2407
@@ -2416,7 +2411,7 @@ void trace_softirqs_on(unsigned long ip)
2416 curr->softirqs_enabled = 1; 2411 curr->softirqs_enabled = 1;
2417 curr->softirq_enable_ip = ip; 2412 curr->softirq_enable_ip = ip;
2418 curr->softirq_enable_event = ++curr->irq_events; 2413 curr->softirq_enable_event = ++curr->irq_events;
2419 debug_atomic_inc(&softirqs_on_events); 2414 debug_atomic_inc(softirqs_on_events);
2420 /* 2415 /*
2421 * We are going to turn softirqs on, so set the 2416 * We are going to turn softirqs on, so set the
2422 * usage bit for all held locks, if hardirqs are 2417 * usage bit for all held locks, if hardirqs are
@@ -2446,10 +2441,10 @@ void trace_softirqs_off(unsigned long ip)
2446 curr->softirqs_enabled = 0; 2441 curr->softirqs_enabled = 0;
2447 curr->softirq_disable_ip = ip; 2442 curr->softirq_disable_ip = ip;
2448 curr->softirq_disable_event = ++curr->irq_events; 2443 curr->softirq_disable_event = ++curr->irq_events;
2449 debug_atomic_inc(&softirqs_off_events); 2444 debug_atomic_inc(softirqs_off_events);
2450 DEBUG_LOCKS_WARN_ON(!softirq_count()); 2445 DEBUG_LOCKS_WARN_ON(!softirq_count());
2451 } else 2446 } else
2452 debug_atomic_inc(&redundant_softirqs_off); 2447 debug_atomic_inc(redundant_softirqs_off);
2453} 2448}
2454 2449
2455static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) 2450static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
@@ -2654,7 +2649,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2654 return 0; 2649 return 0;
2655 break; 2650 break;
2656 case LOCK_USED: 2651 case LOCK_USED:
2657 debug_atomic_dec(&nr_unused_locks); 2652 debug_atomic_dec(nr_unused_locks);
2658 break; 2653 break;
2659 default: 2654 default:
2660 if (!debug_locks_off_graph_unlock()) 2655 if (!debug_locks_off_graph_unlock())
@@ -2716,6 +2711,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2716} 2711}
2717EXPORT_SYMBOL_GPL(lockdep_init_map); 2712EXPORT_SYMBOL_GPL(lockdep_init_map);
2718 2713
2714struct lock_class_key __lockdep_no_validate__;
2715
2719/* 2716/*
2720 * This gets called for every mutex_lock*()/spin_lock*() operation. 2717 * This gets called for every mutex_lock*()/spin_lock*() operation.
2721 * We maintain the dependency maps and validate the locking attempt: 2718 * We maintain the dependency maps and validate the locking attempt:
@@ -2750,6 +2747,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2750 return 0; 2747 return 0;
2751 } 2748 }
2752 2749
2750 if (lock->key == &__lockdep_no_validate__)
2751 check = 1;
2752
2753 if (!subclass) 2753 if (!subclass)
2754 class = lock->class_cache; 2754 class = lock->class_cache;
2755 /* 2755 /*
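
The two hunks above introduce __lockdep_no_validate__, a dedicated lock_class_key whose address serves as a sentinel: when __lock_acquire() sees a lock registered with that key it drops the check level to 1, so the dependency engine does only basic bookkeeping for that class (the driver core's device mutexes were the intended first user). A minimal userspace sketch of the sentinel-key idea, with illustrative names only, not the kernel API:

#include <stdio.h>

/* Illustrative sketch: a distinguished key object whose address means
 * "do not fully validate"; real lockdep uses struct lock_class_key. */
struct class_key { char dummy; };

static struct class_key no_validate_key;   /* sentinel: skip deep checks */
static struct class_key normal_key;

struct fake_lock {
    const char *name;
    struct class_key *key;
};

static void acquire(struct fake_lock *l, int check)
{
    if (l->key == &no_validate_key)
        check = 1;              /* downgrade: basic bookkeeping only */
    printf("%s: %s validation\n", l->name,
           check == 2 ? "full" : "reduced");
}

int main(void)
{
    struct fake_lock a = { "dev->mutex-like", &no_validate_key };
    struct fake_lock b = { "ordinary lock",   &normal_key };

    acquire(&a, 2);   /* prints "reduced validation" */
    acquire(&b, 2);   /* prints "full validation" */
    return 0;
}
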
@@ -2760,7 +2760,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2760 if (!class) 2760 if (!class)
2761 return 0; 2761 return 0;
2762 } 2762 }
2763 debug_atomic_inc((atomic_t *)&class->ops); 2763 atomic_inc((atomic_t *)&class->ops);
2764 if (very_verbose(class)) { 2764 if (very_verbose(class)) {
2765 printk("\nacquire class [%p] %s", class->key, class->name); 2765 printk("\nacquire class [%p] %s", class->key, class->name);
2766 if (class->name_version > 1) 2766 if (class->name_version > 1)
@@ -3211,8 +3211,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3211{ 3211{
3212 unsigned long flags; 3212 unsigned long flags;
3213 3213
3214 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
3215
3216 if (unlikely(current->lockdep_recursion)) 3214 if (unlikely(current->lockdep_recursion))
3217 return; 3215 return;
3218 3216
@@ -3220,6 +3218,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3220 check_flags(flags); 3218 check_flags(flags);
3221 3219
3222 current->lockdep_recursion = 1; 3220 current->lockdep_recursion = 1;
3221 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
3223 __lock_acquire(lock, subclass, trylock, read, check, 3222 __lock_acquire(lock, subclass, trylock, read, check,
3224 irqs_disabled_flags(flags), nest_lock, ip, 0); 3223 irqs_disabled_flags(flags), nest_lock, ip, 0);
3225 current->lockdep_recursion = 0; 3224 current->lockdep_recursion = 0;
@@ -3232,14 +3231,13 @@ void lock_release(struct lockdep_map *lock, int nested,
3232{ 3231{
3233 unsigned long flags; 3232 unsigned long flags;
3234 3233
3235 trace_lock_release(lock, nested, ip);
3236
3237 if (unlikely(current->lockdep_recursion)) 3234 if (unlikely(current->lockdep_recursion))
3238 return; 3235 return;
3239 3236
3240 raw_local_irq_save(flags); 3237 raw_local_irq_save(flags);
3241 check_flags(flags); 3238 check_flags(flags);
3242 current->lockdep_recursion = 1; 3239 current->lockdep_recursion = 1;
3240 trace_lock_release(lock, ip);
3243 __lock_release(lock, nested, ip); 3241 __lock_release(lock, nested, ip);
3244 current->lockdep_recursion = 0; 3242 current->lockdep_recursion = 0;
3245 raw_local_irq_restore(flags); 3243 raw_local_irq_restore(flags);
@@ -3392,7 +3390,7 @@ found_it:
3392 hlock->holdtime_stamp = now; 3390 hlock->holdtime_stamp = now;
3393 } 3391 }
3394 3392
3395 trace_lock_acquired(lock, ip, waittime); 3393 trace_lock_acquired(lock, ip);
3396 3394
3397 stats = get_lock_stats(hlock_class(hlock)); 3395 stats = get_lock_stats(hlock_class(hlock));
3398 if (waittime) { 3396 if (waittime) {
@@ -3413,8 +3411,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3413{ 3411{
3414 unsigned long flags; 3412 unsigned long flags;
3415 3413
3416 trace_lock_contended(lock, ip);
3417
3418 if (unlikely(!lock_stat)) 3414 if (unlikely(!lock_stat))
3419 return; 3415 return;
3420 3416
@@ -3424,6 +3420,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3424 raw_local_irq_save(flags); 3420 raw_local_irq_save(flags);
3425 check_flags(flags); 3421 check_flags(flags);
3426 current->lockdep_recursion = 1; 3422 current->lockdep_recursion = 1;
3423 trace_lock_contended(lock, ip);
3427 __lock_contended(lock, ip); 3424 __lock_contended(lock, ip);
3428 current->lockdep_recursion = 0; 3425 current->lockdep_recursion = 0;
3429 raw_local_irq_restore(flags); 3426 raw_local_irq_restore(flags);
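
The lock_acquire()/lock_release()/lock_contended() hunks above all make the same change: the trace_lock_*() tracepoints now fire only after current->lockdep_recursion has been set, so any locking done by the tracing machinery returns early instead of re-entering lockdep. The shape is an ordinary per-task reentrancy guard; a self-contained sketch with a thread-local flag (illustrative only, relies on the common __thread extension):

#include <stdio.h>

static __thread int in_lockdep;   /* per-thread recursion guard */

static void emit_trace(const char *what);

static void lock_acquire_annotated(const char *name)
{
    if (in_lockdep)               /* instrumentation re-entered us: bail */
        return;

    in_lockdep = 1;
    emit_trace(name);             /* safe: a nested call returns early */
    /* ... real dependency tracking would go here ... */
    in_lockdep = 0;
}

static void emit_trace(const char *what)
{
    /* imagine this takes locks of its own and re-enters the annotation */
    lock_acquire_annotated("internal");
    printf("trace: %s\n", what);
}

int main(void)
{
    lock_acquire_annotated("my_lock");
    return 0;
}
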
@@ -3809,3 +3806,25 @@ void lockdep_sys_exit(void)
3809 lockdep_print_held_locks(curr); 3806 lockdep_print_held_locks(curr);
3810 } 3807 }
3811} 3808}
3809
3810void lockdep_rcu_dereference(const char *file, const int line)
3811{
3812 struct task_struct *curr = current;
3813
3814#ifndef CONFIG_PROVE_RCU_REPEATEDLY
3815 if (!debug_locks_off())
3816 return;
3817#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
3818 /* Note: the following can be executed concurrently, so be careful. */
3819 printk("\n===================================================\n");
3820 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3821 printk( "---------------------------------------------------\n");
3822 printk("%s:%d invoked rcu_dereference_check() without protection!\n",
3823 file, line);
3824 printk("\nother info that might help us debug this:\n\n");
3825 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
3826 lockdep_print_held_locks(curr);
3827 printk("\nstack backtrace:\n");
3828 dump_stack();
3829}
3830EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
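
lockdep_rcu_dereference(), added above, is the splat printed when rcu_dereference_check() is used without the protection its condition claims. Unless CONFIG_PROVE_RCU_REPEATEDLY is set it reports only once, because debug_locks_off() returns nonzero only for the caller that actually turned debugging off, and concurrent offenders are tolerated rather than locked out. A compact sketch of that report-once idiom using C11 atomics (not the kernel implementation):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int debug_on = 1;

/* Returns 1 only for the caller that actually flipped the flag,
 * so concurrent offenders produce a single report. */
static int debug_off_once(void)
{
    return atomic_exchange(&debug_on, 0);
}

static void report_suspicious_use(const char *file, int line)
{
    if (!debug_off_once())
        return;                       /* someone already reported */
    fprintf(stderr, "%s:%d: unprotected dereference\n", file, line);
}

int main(void)
{
    report_suspicious_use("example.c", 42);   /* prints */
    report_suspicious_use("example.c", 43);   /* silent */
    return 0;
}
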
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index a2ee95ad1313..4f560cfedc8f 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -110,30 +110,60 @@ lockdep_count_backward_deps(struct lock_class *class)
110#endif 110#endif
111 111
112#ifdef CONFIG_DEBUG_LOCKDEP 112#ifdef CONFIG_DEBUG_LOCKDEP
113
114#include <asm/local.h>
113/* 115/*
114 * Various lockdep statistics: 116 * Various lockdep statistics.
117 * We want them per cpu as they are often accessed in fast path
118 * and we want to avoid too much cache bouncing.
115 */ 119 */
116extern atomic_t chain_lookup_hits; 120struct lockdep_stats {
117extern atomic_t chain_lookup_misses; 121 int chain_lookup_hits;
118extern atomic_t hardirqs_on_events; 122 int chain_lookup_misses;
119extern atomic_t hardirqs_off_events; 123 int hardirqs_on_events;
120extern atomic_t redundant_hardirqs_on; 124 int hardirqs_off_events;
121extern atomic_t redundant_hardirqs_off; 125 int redundant_hardirqs_on;
122extern atomic_t softirqs_on_events; 126 int redundant_hardirqs_off;
123extern atomic_t softirqs_off_events; 127 int softirqs_on_events;
124extern atomic_t redundant_softirqs_on; 128 int softirqs_off_events;
125extern atomic_t redundant_softirqs_off; 129 int redundant_softirqs_on;
126extern atomic_t nr_unused_locks; 130 int redundant_softirqs_off;
127extern atomic_t nr_cyclic_checks; 131 int nr_unused_locks;
128extern atomic_t nr_cyclic_check_recursions; 132 int nr_cyclic_checks;
129extern atomic_t nr_find_usage_forwards_checks; 133 int nr_cyclic_check_recursions;
130extern atomic_t nr_find_usage_forwards_recursions; 134 int nr_find_usage_forwards_checks;
131extern atomic_t nr_find_usage_backwards_checks; 135 int nr_find_usage_forwards_recursions;
132extern atomic_t nr_find_usage_backwards_recursions; 136 int nr_find_usage_backwards_checks;
133# define debug_atomic_inc(ptr) atomic_inc(ptr) 137 int nr_find_usage_backwards_recursions;
134# define debug_atomic_dec(ptr) atomic_dec(ptr) 138};
135# define debug_atomic_read(ptr) atomic_read(ptr) 139
140DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
141
142#define __debug_atomic_inc(ptr) \
143 this_cpu_inc(lockdep_stats.ptr);
144
145#define debug_atomic_inc(ptr) { \
146 WARN_ON_ONCE(!irqs_disabled()); \
147 __this_cpu_inc(lockdep_stats.ptr); \
148}
149
150#define debug_atomic_dec(ptr) { \
151 WARN_ON_ONCE(!irqs_disabled()); \
152 __this_cpu_dec(lockdep_stats.ptr); \
153}
154
155#define debug_atomic_read(ptr) ({ \
156 struct lockdep_stats *__cpu_lockdep_stats; \
157 unsigned long long __total = 0; \
158 int __cpu; \
159 for_each_possible_cpu(__cpu) { \
160 __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \
161 __total += __cpu_lockdep_stats->ptr; \
162 } \
163 __total; \
164})
136#else 165#else
166# define __debug_atomic_inc(ptr) do { } while (0)
137# define debug_atomic_inc(ptr) do { } while (0) 167# define debug_atomic_inc(ptr) do { } while (0)
138# define debug_atomic_dec(ptr) do { } while (0) 168# define debug_atomic_dec(ptr) do { } while (0)
139# define debug_atomic_read(ptr) 0 169# define debug_atomic_read(ptr) 0
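
The lockdep_internals.h hunk replaces the global atomic_t counters with one struct lockdep_stats per CPU: the fast paths increment only their own CPU's copy, and debug_atomic_read() folds all copies together when the statistics are read, which is why lockdep_proc.c below switches the sums to unsigned long long. A userspace sketch of the same increment-locally, sum-on-read pattern, using per-thread slots in place of per-CPU data (assumes only C99 and pthreads):

#include <pthread.h>
#include <stdio.h>

#define NR_SLOTS 4                      /* stand-in for NR_CPUS */

struct stats { unsigned long events; };
static struct stats per_slot[NR_SLOTS];

static void *worker(void *arg)
{
    struct stats *mine = &per_slot[(long)arg];
    for (int i = 0; i < 1000000; i++)
        mine->events++;                 /* private slot: no contention */
    return NULL;
}

static unsigned long long read_events(void)
{
    unsigned long long total = 0;       /* folded only when reporting */
    for (int s = 0; s < NR_SLOTS; s++)
        total += per_slot[s].events;
    return total;
}

int main(void)
{
    pthread_t t[NR_SLOTS];
    for (long s = 0; s < NR_SLOTS; s++)
        pthread_create(&t[s], NULL, worker, (void *)s);
    for (int s = 0; s < NR_SLOTS; s++)
        pthread_join(t[s], NULL);
    printf("events: %llu\n", read_events());
    return 0;
}

The trade-off is the same as in the patch: updates become cheap and bounce no cache lines, while reads become a little more expensive and only approximately consistent, which is acceptable for statistics.
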
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d4aba4f3584c..59b76c8ce9d7 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -184,34 +184,34 @@ static const struct file_operations proc_lockdep_chains_operations = {
184static void lockdep_stats_debug_show(struct seq_file *m) 184static void lockdep_stats_debug_show(struct seq_file *m)
185{ 185{
186#ifdef CONFIG_DEBUG_LOCKDEP 186#ifdef CONFIG_DEBUG_LOCKDEP
187 unsigned int hi1 = debug_atomic_read(&hardirqs_on_events), 187 unsigned long long hi1 = debug_atomic_read(hardirqs_on_events),
188 hi2 = debug_atomic_read(&hardirqs_off_events), 188 hi2 = debug_atomic_read(hardirqs_off_events),
189 hr1 = debug_atomic_read(&redundant_hardirqs_on), 189 hr1 = debug_atomic_read(redundant_hardirqs_on),
190 hr2 = debug_atomic_read(&redundant_hardirqs_off), 190 hr2 = debug_atomic_read(redundant_hardirqs_off),
191 si1 = debug_atomic_read(&softirqs_on_events), 191 si1 = debug_atomic_read(softirqs_on_events),
192 si2 = debug_atomic_read(&softirqs_off_events), 192 si2 = debug_atomic_read(softirqs_off_events),
193 sr1 = debug_atomic_read(&redundant_softirqs_on), 193 sr1 = debug_atomic_read(redundant_softirqs_on),
194 sr2 = debug_atomic_read(&redundant_softirqs_off); 194 sr2 = debug_atomic_read(redundant_softirqs_off);
195 195
196 seq_printf(m, " chain lookup misses: %11u\n", 196 seq_printf(m, " chain lookup misses: %11llu\n",
197 debug_atomic_read(&chain_lookup_misses)); 197 debug_atomic_read(chain_lookup_misses));
198 seq_printf(m, " chain lookup hits: %11u\n", 198 seq_printf(m, " chain lookup hits: %11llu\n",
199 debug_atomic_read(&chain_lookup_hits)); 199 debug_atomic_read(chain_lookup_hits));
200 seq_printf(m, " cyclic checks: %11u\n", 200 seq_printf(m, " cyclic checks: %11llu\n",
201 debug_atomic_read(&nr_cyclic_checks)); 201 debug_atomic_read(nr_cyclic_checks));
202 seq_printf(m, " find-mask forwards checks: %11u\n", 202 seq_printf(m, " find-mask forwards checks: %11llu\n",
203 debug_atomic_read(&nr_find_usage_forwards_checks)); 203 debug_atomic_read(nr_find_usage_forwards_checks));
204 seq_printf(m, " find-mask backwards checks: %11u\n", 204 seq_printf(m, " find-mask backwards checks: %11llu\n",
205 debug_atomic_read(&nr_find_usage_backwards_checks)); 205 debug_atomic_read(nr_find_usage_backwards_checks));
206 206
207 seq_printf(m, " hardirq on events: %11u\n", hi1); 207 seq_printf(m, " hardirq on events: %11llu\n", hi1);
208 seq_printf(m, " hardirq off events: %11u\n", hi2); 208 seq_printf(m, " hardirq off events: %11llu\n", hi2);
209 seq_printf(m, " redundant hardirq ons: %11u\n", hr1); 209 seq_printf(m, " redundant hardirq ons: %11llu\n", hr1);
210 seq_printf(m, " redundant hardirq offs: %11u\n", hr2); 210 seq_printf(m, " redundant hardirq offs: %11llu\n", hr2);
211 seq_printf(m, " softirq on events: %11u\n", si1); 211 seq_printf(m, " softirq on events: %11llu\n", si1);
212 seq_printf(m, " softirq off events: %11u\n", si2); 212 seq_printf(m, " softirq off events: %11llu\n", si2);
213 seq_printf(m, " redundant softirq ons: %11u\n", sr1); 213 seq_printf(m, " redundant softirq ons: %11llu\n", sr1);
214 seq_printf(m, " redundant softirq offs: %11u\n", sr2); 214 seq_printf(m, " redundant softirq offs: %11llu\n", sr2);
215#endif 215#endif
216} 216}
217 217
@@ -263,7 +263,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
263#endif 263#endif
264 } 264 }
265#ifdef CONFIG_DEBUG_LOCKDEP 265#ifdef CONFIG_DEBUG_LOCKDEP
266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); 266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
267#endif 267#endif
268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n", 268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
269 nr_lock_classes, MAX_LOCKDEP_KEYS); 269 nr_lock_classes, MAX_LOCKDEP_KEYS);
diff --git a/kernel/module.c b/kernel/module.c
index 5daf0abd63c1..6c562828c85c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -59,8 +59,6 @@
59#define CREATE_TRACE_POINTS 59#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 60#include <trace/events/module.h>
61 61
62EXPORT_TRACEPOINT_SYMBOL(module_get);
63
64#if 0 62#if 0
65#define DEBUGP printk 63#define DEBUGP printk
66#else 64#else
@@ -74,11 +72,19 @@ EXPORT_TRACEPOINT_SYMBOL(module_get);
74/* If this is set, the section belongs in the init part of the module */ 72/* If this is set, the section belongs in the init part of the module */
75#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 73#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
76 74
77/* List of modules, protected by module_mutex or preempt_disable 75/*
76 * Mutex protects:
77 * 1) List of modules (also safely readable with preempt_disable),
78 * 2) module_use links,
79 * 3) module_addr_min/module_addr_max.
78 * (delete uses stop_machine/add uses RCU list operations). */ 80 * (delete uses stop_machine/add uses RCU list operations). */
79DEFINE_MUTEX(module_mutex); 81DEFINE_MUTEX(module_mutex);
80EXPORT_SYMBOL_GPL(module_mutex); 82EXPORT_SYMBOL_GPL(module_mutex);
81static LIST_HEAD(modules); 83static LIST_HEAD(modules);
84#ifdef CONFIG_KGDB_KDB
85struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
86#endif /* CONFIG_KGDB_KDB */
87
82 88
83/* Block module loading/unloading? */ 89/* Block module loading/unloading? */
84int modules_disabled = 0; 90int modules_disabled = 0;
@@ -88,7 +94,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
88 94
89static BLOCKING_NOTIFIER_HEAD(module_notify_list); 95static BLOCKING_NOTIFIER_HEAD(module_notify_list);
90 96
91/* Bounds of module allocation, for speeding __module_address */ 97/* Bounds of module allocation, for speeding __module_address.
98 * Protected by module_mutex. */
92static unsigned long module_addr_min = -1UL, module_addr_max = 0; 99static unsigned long module_addr_min = -1UL, module_addr_max = 0;
93 100
94int register_module_notifier(struct notifier_block * nb) 101int register_module_notifier(struct notifier_block * nb)
@@ -178,8 +185,6 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
178extern const struct kernel_symbol __stop___ksymtab_gpl[]; 185extern const struct kernel_symbol __stop___ksymtab_gpl[];
179extern const struct kernel_symbol __start___ksymtab_gpl_future[]; 186extern const struct kernel_symbol __start___ksymtab_gpl_future[];
180extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; 187extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
181extern const struct kernel_symbol __start___ksymtab_gpl_future[];
182extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
183extern const unsigned long __start___kcrctab[]; 188extern const unsigned long __start___kcrctab[];
184extern const unsigned long __start___kcrctab_gpl[]; 189extern const unsigned long __start___kcrctab_gpl[];
185extern const unsigned long __start___kcrctab_gpl_future[]; 190extern const unsigned long __start___kcrctab_gpl_future[];
@@ -329,7 +334,7 @@ static bool find_symbol_in_section(const struct symsearch *syms,
329} 334}
330 335
331/* Find a symbol and return it, along with, (optional) crc and 336/* Find a symbol and return it, along with, (optional) crc and
332 * (optional) module which owns it */ 337 * (optional) module which owns it. Needs preempt disabled or module_mutex. */
333const struct kernel_symbol *find_symbol(const char *name, 338const struct kernel_symbol *find_symbol(const char *name,
334 struct module **owner, 339 struct module **owner,
335 const unsigned long **crc, 340 const unsigned long **crc,
@@ -370,27 +375,33 @@ EXPORT_SYMBOL_GPL(find_module);
370 375
371#ifdef CONFIG_SMP 376#ifdef CONFIG_SMP
372 377
373static void *percpu_modalloc(unsigned long size, unsigned long align, 378static inline void __percpu *mod_percpu(struct module *mod)
374 const char *name)
375{ 379{
376 void *ptr; 380 return mod->percpu;
381}
377 382
383static int percpu_modalloc(struct module *mod,
384 unsigned long size, unsigned long align)
385{
378 if (align > PAGE_SIZE) { 386 if (align > PAGE_SIZE) {
379 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 387 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
380 name, align, PAGE_SIZE); 388 mod->name, align, PAGE_SIZE);
381 align = PAGE_SIZE; 389 align = PAGE_SIZE;
382 } 390 }
383 391
384 ptr = __alloc_reserved_percpu(size, align); 392 mod->percpu = __alloc_reserved_percpu(size, align);
385 if (!ptr) 393 if (!mod->percpu) {
386 printk(KERN_WARNING 394 printk(KERN_WARNING
387 "Could not allocate %lu bytes percpu data\n", size); 395 "Could not allocate %lu bytes percpu data\n", size);
388 return ptr; 396 return -ENOMEM;
397 }
398 mod->percpu_size = size;
399 return 0;
389} 400}
390 401
391static void percpu_modfree(void *freeme) 402static void percpu_modfree(struct module *mod)
392{ 403{
393 free_percpu(freeme); 404 free_percpu(mod->percpu);
394} 405}
395 406
396static unsigned int find_pcpusec(Elf_Ehdr *hdr, 407static unsigned int find_pcpusec(Elf_Ehdr *hdr,
@@ -400,24 +411,62 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
400 return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); 411 return find_sec(hdr, sechdrs, secstrings, ".data..percpu");
401} 412}
402 413
403static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) 414static void percpu_modcopy(struct module *mod,
415 const void *from, unsigned long size)
404{ 416{
405 int cpu; 417 int cpu;
406 418
407 for_each_possible_cpu(cpu) 419 for_each_possible_cpu(cpu)
408 memcpy(pcpudest + per_cpu_offset(cpu), from, size); 420 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
421}
422
423/**
424 * is_module_percpu_address - test whether address is from module static percpu
425 * @addr: address to test
426 *
427 * Test whether @addr belongs to module static percpu area.
428 *
429 * RETURNS:
430 * %true if @addr is from module static percpu area
431 */
432bool is_module_percpu_address(unsigned long addr)
433{
434 struct module *mod;
435 unsigned int cpu;
436
437 preempt_disable();
438
439 list_for_each_entry_rcu(mod, &modules, list) {
440 if (!mod->percpu_size)
441 continue;
442 for_each_possible_cpu(cpu) {
443 void *start = per_cpu_ptr(mod->percpu, cpu);
444
445 if ((void *)addr >= start &&
446 (void *)addr < start + mod->percpu_size) {
447 preempt_enable();
448 return true;
449 }
450 }
451 }
452
453 preempt_enable();
454 return false;
409} 455}
410 456
411#else /* ... !CONFIG_SMP */ 457#else /* ... !CONFIG_SMP */
412 458
413static inline void *percpu_modalloc(unsigned long size, unsigned long align, 459static inline void __percpu *mod_percpu(struct module *mod)
414 const char *name)
415{ 460{
416 return NULL; 461 return NULL;
417} 462}
418static inline void percpu_modfree(void *pcpuptr) 463static inline int percpu_modalloc(struct module *mod,
464 unsigned long size, unsigned long align)
465{
466 return -ENOMEM;
467}
468static inline void percpu_modfree(struct module *mod)
419{ 469{
420 BUG();
421} 470}
422static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, 471static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
423 Elf_Shdr *sechdrs, 472 Elf_Shdr *sechdrs,
@@ -425,12 +474,16 @@ static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
425{ 474{
426 return 0; 475 return 0;
427} 476}
428static inline void percpu_modcopy(void *pcpudst, const void *src, 477static inline void percpu_modcopy(struct module *mod,
429 unsigned long size) 478 const void *from, unsigned long size)
430{ 479{
431 /* pcpusec should be 0, and size of that section should be 0. */ 480 /* pcpusec should be 0, and size of that section should be 0. */
432 BUG_ON(size != 0); 481 BUG_ON(size != 0);
433} 482}
483bool is_module_percpu_address(unsigned long addr)
484{
485 return false;
486}
434 487
435#endif /* CONFIG_SMP */ 488#endif /* CONFIG_SMP */
436 489
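
is_module_percpu_address(), added in the hunks above, answers a single question: does the given address fall inside any module's static per-cpu chunk on any CPU, i.e. inside one of the [start, start + percpu_size) ranges. Reduced to that core, the test looks like this (illustrative sketch, not the kernel code):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define NR_SLOTS 2

struct fake_mod {
    void  *chunk[NR_SLOTS];   /* one per-cpu copy per "CPU" */
    size_t chunk_size;        /* 0 means: no per-cpu data */
};

static bool addr_in_percpu(const struct fake_mod *mod, const void *addr)
{
    if (!mod->chunk_size)
        return false;
    for (int s = 0; s < NR_SLOTS; s++) {
        const char *start = mod->chunk[s];
        if ((const char *)addr >= start &&
            (const char *)addr <  start + mod->chunk_size)
            return true;
    }
    return false;
}

int main(void)
{
    static char cpu0[64], cpu1[64];
    struct fake_mod m = { { cpu0, cpu1 }, sizeof(cpu0) };

    printf("%d %d\n", addr_in_percpu(&m, cpu1 + 8),    /* 1 */
                      addr_in_percpu(&m, (char *)&m)); /* 0 */
    return 0;
}
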
@@ -467,34 +520,34 @@ MODINFO_ATTR(srcversion);
467static char last_unloaded_module[MODULE_NAME_LEN+1]; 520static char last_unloaded_module[MODULE_NAME_LEN+1];
468 521
469#ifdef CONFIG_MODULE_UNLOAD 522#ifdef CONFIG_MODULE_UNLOAD
523
524EXPORT_TRACEPOINT_SYMBOL(module_get);
525
470/* Init the unload section of the module. */ 526/* Init the unload section of the module. */
471static void module_unload_init(struct module *mod) 527static void module_unload_init(struct module *mod)
472{ 528{
473 int cpu; 529 int cpu;
474 530
475 INIT_LIST_HEAD(&mod->modules_which_use_me); 531 INIT_LIST_HEAD(&mod->source_list);
476 for_each_possible_cpu(cpu) 532 INIT_LIST_HEAD(&mod->target_list);
477 local_set(__module_ref_addr(mod, cpu), 0); 533 for_each_possible_cpu(cpu) {
534 per_cpu_ptr(mod->refptr, cpu)->incs = 0;
535 per_cpu_ptr(mod->refptr, cpu)->decs = 0;
536 }
537
478 /* Hold reference count during initialization. */ 538 /* Hold reference count during initialization. */
479 local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); 539 __this_cpu_write(mod->refptr->incs, 1);
480 /* Backwards compatibility macros put refcount during init. */ 540 /* Backwards compatibility macros put refcount during init. */
481 mod->waiter = current; 541 mod->waiter = current;
482} 542}
483 543
484/* modules using other modules */
485struct module_use
486{
487 struct list_head list;
488 struct module *module_which_uses;
489};
490
491/* Does a already use b? */ 544/* Does a already use b? */
492static int already_uses(struct module *a, struct module *b) 545static int already_uses(struct module *a, struct module *b)
493{ 546{
494 struct module_use *use; 547 struct module_use *use;
495 548
496 list_for_each_entry(use, &b->modules_which_use_me, list) { 549 list_for_each_entry(use, &b->source_list, source_list) {
497 if (use->module_which_uses == a) { 550 if (use->source == a) {
498 DEBUGP("%s uses %s!\n", a->name, b->name); 551 DEBUGP("%s uses %s!\n", a->name, b->name);
499 return 1; 552 return 1;
500 } 553 }
@@ -503,62 +556,68 @@ static int already_uses(struct module *a, struct module *b)
503 return 0; 556 return 0;
504} 557}
505 558
506/* Module a uses b */ 559/*
507int use_module(struct module *a, struct module *b) 560 * Module a uses b
561 * - we add 'a' as a "source", 'b' as a "target" of module use
562 * - the module_use is added to the list of 'b' sources (so
563 * 'b' can walk the list to see who sourced them), and of 'a'
564 * targets (so 'a' can see what modules it targets).
565 */
566static int add_module_usage(struct module *a, struct module *b)
508{ 567{
509 struct module_use *use; 568 struct module_use *use;
510 int no_warn, err;
511 569
512 if (b == NULL || already_uses(a, b)) return 1; 570 DEBUGP("Allocating new usage for %s.\n", a->name);
571 use = kmalloc(sizeof(*use), GFP_ATOMIC);
572 if (!use) {
573 printk(KERN_WARNING "%s: out of memory loading\n", a->name);
574 return -ENOMEM;
575 }
513 576
514 /* If we're interrupted or time out, we fail. */ 577 use->source = a;
515 if (wait_event_interruptible_timeout( 578 use->target = b;
516 module_wq, (err = strong_try_module_get(b)) != -EBUSY, 579 list_add(&use->source_list, &b->source_list);
517 30 * HZ) <= 0) { 580 list_add(&use->target_list, &a->target_list);
518 printk("%s: gave up waiting for init of module %s.\n", 581 return 0;
519 a->name, b->name); 582}
583
584/* Module a uses b: caller needs module_mutex() */
585int ref_module(struct module *a, struct module *b)
586{
587 int err;
588
589 if (b == NULL || already_uses(a, b))
520 return 0; 590 return 0;
521 }
522 591
523 /* If strong_try_module_get() returned a different error, we fail. */ 592 /* If module isn't available, we fail. */
593 err = strong_try_module_get(b);
524 if (err) 594 if (err)
525 return 0; 595 return err;
526 596
527 DEBUGP("Allocating new usage for %s.\n", a->name); 597 err = add_module_usage(a, b);
528 use = kmalloc(sizeof(*use), GFP_ATOMIC); 598 if (err) {
529 if (!use) {
530 printk("%s: out of memory loading\n", a->name);
531 module_put(b); 599 module_put(b);
532 return 0; 600 return err;
533 } 601 }
534 602 return 0;
535 use->module_which_uses = a;
536 list_add(&use->list, &b->modules_which_use_me);
537 no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
538 return 1;
539} 603}
540EXPORT_SYMBOL_GPL(use_module); 604EXPORT_SYMBOL_GPL(ref_module);
541 605
542/* Clear the unload stuff of the module. */ 606/* Clear the unload stuff of the module. */
543static void module_unload_free(struct module *mod) 607static void module_unload_free(struct module *mod)
544{ 608{
545 struct module *i; 609 struct module_use *use, *tmp;
546
547 list_for_each_entry(i, &modules, list) {
548 struct module_use *use;
549 610
550 list_for_each_entry(use, &i->modules_which_use_me, list) { 611 mutex_lock(&module_mutex);
551 if (use->module_which_uses == mod) { 612 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
552 DEBUGP("%s unusing %s\n", mod->name, i->name); 613 struct module *i = use->target;
553 module_put(i); 614 DEBUGP("%s unusing %s\n", mod->name, i->name);
554 list_del(&use->list); 615 module_put(i);
555 kfree(use); 616 list_del(&use->source_list);
556 sysfs_remove_link(i->holders_dir, mod->name); 617 list_del(&use->target_list);
557 /* There can be at most one match. */ 618 kfree(use);
558 break;
559 }
560 }
561 } 619 }
620 mutex_unlock(&module_mutex);
562} 621}
563 622
564#ifdef CONFIG_MODULE_FORCE_UNLOAD 623#ifdef CONFIG_MODULE_FORCE_UNLOAD
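
The unload bookkeeping above threads each struct module_use edge onto two lists at once: the used module's source_list ("who uses me") and the using module's target_list ("whom do I use"). module_unload_free() then only walks the dying module's own target_list instead of scanning every module in the system. A stripped-down sketch of such a double-threaded edge, using plain pointers instead of list_head (names are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct mod;

struct use_edge {                 /* "a uses b": one edge, two lists */
    struct mod *source, *target;
    struct use_edge *next_source; /* link in target->sources */
    struct use_edge *next_target; /* link in source->targets */
};

struct mod {
    const char *name;
    struct use_edge *sources;     /* who uses me */
    struct use_edge *targets;     /* whom I use  */
};

static int add_usage(struct mod *a, struct mod *b)
{
    struct use_edge *use = malloc(sizeof(*use));
    if (!use)
        return -1;
    use->source = a;
    use->target = b;
    use->next_source = b->sources;  b->sources = use;  /* b: "a uses me" */
    use->next_target = a->targets;  a->targets = use;  /* a: "I use b"  */
    return 0;
}

int main(void)
{
    struct mod a = { "a", NULL, NULL }, b = { "b", NULL, NULL };
    add_usage(&a, &b);
    for (struct use_edge *u = a.targets; u; u = u->next_target)
        printf("%s uses %s\n", u->source->name, u->target->name);
    for (struct use_edge *u = b.sources; u; u = u->next_source)
        printf("%s is used by %s\n", u->target->name, u->source->name);
    return 0;
}
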
@@ -615,12 +674,28 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
615 674
616unsigned int module_refcount(struct module *mod) 675unsigned int module_refcount(struct module *mod)
617{ 676{
618 unsigned int total = 0; 677 unsigned int incs = 0, decs = 0;
619 int cpu; 678 int cpu;
620 679
621 for_each_possible_cpu(cpu) 680 for_each_possible_cpu(cpu)
622 total += local_read(__module_ref_addr(mod, cpu)); 681 decs += per_cpu_ptr(mod->refptr, cpu)->decs;
623 return total; 682 /*
683 * ensure the incs are added up after the decs.
684 * module_put ensures incs are visible before decs with smp_wmb.
685 *
686 * This 2-count scheme avoids the situation where the refcount
687 * for CPU0 is read, then CPU0 increments the module refcount,
688 * then CPU1 drops that refcount, then the refcount for CPU1 is
689 * read. We would record a decrement but not its corresponding
690 * increment so we would see a low count (disaster).
691 *
692 * Rare situation? But module_refcount can be preempted, and we
693 * might be tallying up 4096+ CPUs. So it is not impossible.
694 */
695 smp_rmb();
696 for_each_possible_cpu(cpu)
697 incs += per_cpu_ptr(mod->refptr, cpu)->incs;
698 return incs - decs;
624} 699}
625EXPORT_SYMBOL(module_refcount); 700EXPORT_SYMBOL(module_refcount);
626 701
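
The new module_refcount() keeps separate incs and decs counters per CPU and relies on ordering rather than locking: module_put() publishes its increments before bumping decs (the smp_wmb further down), and the reader sums all decs first, issues smp_rmb, then sums all incs, so it never counts a decrement whose increment it has not counted as well. A much-simplified sketch of that read ordering with C11 atomics, collapsing the per-CPU pairs into a single pair; the kernel's full argument also depends on how references are handed between CPUs:

#include <stdatomic.h>
#include <stdio.h>

/* One pair shown for brevity; the kernel keeps a pair per CPU. */
static atomic_ulong incs, decs;

static void ref_get(void)
{
    atomic_fetch_add_explicit(&incs, 1, memory_order_relaxed);
}

static void ref_put(void)
{
    /* release: the matching inc is visible before this dec is */
    atomic_fetch_add_explicit(&decs, 1, memory_order_release);
}

static unsigned long ref_count(void)
{
    /* read decs first, then incs, so we never count a dec
     * whose inc we have not counted as well */
    unsigned long d = atomic_load_explicit(&decs, memory_order_acquire);
    unsigned long i = atomic_load_explicit(&incs, memory_order_relaxed);
    return i - d;
}

int main(void)
{
    ref_get();
    ref_get();
    ref_put();
    printf("count = %lu\n", ref_count());   /* 1 */
    return 0;
}
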
@@ -656,16 +731,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
656 return -EFAULT; 731 return -EFAULT;
657 name[MODULE_NAME_LEN-1] = '\0'; 732 name[MODULE_NAME_LEN-1] = '\0';
658 733
659 /* Create stop_machine threads since free_module relies on 734 if (mutex_lock_interruptible(&module_mutex) != 0)
660 * a non-failing stop_machine call. */ 735 return -EINTR;
661 ret = stop_machine_create();
662 if (ret)
663 return ret;
664
665 if (mutex_lock_interruptible(&module_mutex) != 0) {
666 ret = -EINTR;
667 goto out_stop;
668 }
669 736
670 mod = find_module(name); 737 mod = find_module(name);
671 if (!mod) { 738 if (!mod) {
@@ -673,7 +740,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
673 goto out; 740 goto out;
674 } 741 }
675 742
676 if (!list_empty(&mod->modules_which_use_me)) { 743 if (!list_empty(&mod->source_list)) {
677 /* Other modules depend on us: get rid of them first. */ 744 /* Other modules depend on us: get rid of them first. */
678 ret = -EWOULDBLOCK; 745 ret = -EWOULDBLOCK;
679 goto out; 746 goto out;
@@ -717,16 +784,14 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
717 blocking_notifier_call_chain(&module_notify_list, 784 blocking_notifier_call_chain(&module_notify_list,
718 MODULE_STATE_GOING, mod); 785 MODULE_STATE_GOING, mod);
719 async_synchronize_full(); 786 async_synchronize_full();
720 mutex_lock(&module_mutex); 787
721 /* Store the name of the last unloaded module for diagnostic purposes */ 788 /* Store the name of the last unloaded module for diagnostic purposes */
722 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 789 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
723 ddebug_remove_module(mod->name);
724 free_module(mod);
725 790
726 out: 791 free_module(mod);
792 return 0;
793out:
727 mutex_unlock(&module_mutex); 794 mutex_unlock(&module_mutex);
728out_stop:
729 stop_machine_destroy();
730 return ret; 795 return ret;
731} 796}
732 797
@@ -739,9 +804,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
739 804
740 /* Always include a trailing , so userspace can differentiate 805 /* Always include a trailing , so userspace can differentiate
741 between this and the old multi-field proc format. */ 806 between this and the old multi-field proc format. */
742 list_for_each_entry(use, &mod->modules_which_use_me, list) { 807 list_for_each_entry(use, &mod->source_list, source_list) {
743 printed_something = 1; 808 printed_something = 1;
744 seq_printf(m, "%s,", use->module_which_uses->name); 809 seq_printf(m, "%s,", use->source->name);
745 } 810 }
746 811
747 if (mod->init != NULL && mod->exit == NULL) { 812 if (mod->init != NULL && mod->exit == NULL) {
@@ -796,14 +861,15 @@ static struct module_attribute refcnt = {
796void module_put(struct module *module) 861void module_put(struct module *module)
797{ 862{
798 if (module) { 863 if (module) {
799 unsigned int cpu = get_cpu(); 864 preempt_disable();
800 local_dec(__module_ref_addr(module, cpu)); 865 smp_wmb(); /* see comment in module_refcount */
801 trace_module_put(module, _RET_IP_, 866 __this_cpu_inc(module->refptr->decs);
802 local_read(__module_ref_addr(module, cpu))); 867
868 trace_module_put(module, _RET_IP_);
803 /* Maybe they're waiting for us to drop reference? */ 869 /* Maybe they're waiting for us to drop reference? */
804 if (unlikely(!module_is_live(module))) 870 if (unlikely(!module_is_live(module)))
805 wake_up_process(module->waiter); 871 wake_up_process(module->waiter);
806 put_cpu(); 872 preempt_enable();
807 } 873 }
808} 874}
809EXPORT_SYMBOL(module_put); 875EXPORT_SYMBOL(module_put);
@@ -819,11 +885,11 @@ static inline void module_unload_free(struct module *mod)
819{ 885{
820} 886}
821 887
822int use_module(struct module *a, struct module *b) 888int ref_module(struct module *a, struct module *b)
823{ 889{
824 return strong_try_module_get(b) == 0; 890 return strong_try_module_get(b);
825} 891}
826EXPORT_SYMBOL_GPL(use_module); 892EXPORT_SYMBOL_GPL(ref_module);
827 893
828static inline void module_unload_init(struct module *mod) 894static inline void module_unload_init(struct module *mod)
829{ 895{
@@ -940,6 +1006,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
940{ 1006{
941 const unsigned long *crc; 1007 const unsigned long *crc;
942 1008
1009 /* Since this should be found in kernel (which can't be removed),
1010 * no locking is necessary. */
943 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, 1011 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
944 &crc, true, false)) 1012 &crc, true, false))
945 BUG(); 1013 BUG();
@@ -982,29 +1050,62 @@ static inline int same_magic(const char *amagic, const char *bmagic,
982} 1050}
983#endif /* CONFIG_MODVERSIONS */ 1051#endif /* CONFIG_MODVERSIONS */
984 1052
985/* Resolve a symbol for this module. I.e. if we find one, record usage. 1053/* Resolve a symbol for this module. I.e. if we find one, record usage. */
986 Must be holding module_mutex. */
987static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, 1054static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
988 unsigned int versindex, 1055 unsigned int versindex,
989 const char *name, 1056 const char *name,
990 struct module *mod) 1057 struct module *mod,
1058 char ownername[])
991{ 1059{
992 struct module *owner; 1060 struct module *owner;
993 const struct kernel_symbol *sym; 1061 const struct kernel_symbol *sym;
994 const unsigned long *crc; 1062 const unsigned long *crc;
1063 int err;
995 1064
1065 mutex_lock(&module_mutex);
996 sym = find_symbol(name, &owner, &crc, 1066 sym = find_symbol(name, &owner, &crc,
997 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); 1067 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
998 /* use_module can fail due to OOM, 1068 if (!sym)
999 or module initialization or unloading */ 1069 goto unlock;
1000 if (sym) { 1070
1001 if (!check_version(sechdrs, versindex, name, mod, crc, owner) 1071 if (!check_version(sechdrs, versindex, name, mod, crc, owner)) {
1002 || !use_module(mod, owner)) 1072 sym = ERR_PTR(-EINVAL);
1003 sym = NULL; 1073 goto getname;
1074 }
1075
1076 err = ref_module(mod, owner);
1077 if (err) {
1078 sym = ERR_PTR(err);
1079 goto getname;
1004 } 1080 }
1081
1082getname:
1083 /* We must make copy under the lock if we failed to get ref. */
1084 strncpy(ownername, module_name(owner), MODULE_NAME_LEN);
1085unlock:
1086 mutex_unlock(&module_mutex);
1005 return sym; 1087 return sym;
1006} 1088}
1007 1089
1090static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs,
1091 unsigned int versindex,
1092 const char *name,
1093 struct module *mod)
1094{
1095 const struct kernel_symbol *ksym;
1096 char ownername[MODULE_NAME_LEN];
1097
1098 if (wait_event_interruptible_timeout(module_wq,
1099 !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name,
1100 mod, ownername)) ||
1101 PTR_ERR(ksym) != -EBUSY,
1102 30 * HZ) <= 0) {
1103 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
1104 mod->name, ownername);
1105 }
1106 return ksym;
1107}
1108
1008/* 1109/*
1009 * /sys/module/foo/sections stuff 1110 * /sys/module/foo/sections stuff
1010 * J. Corbet <corbet@lwn.net> 1111 * J. Corbet <corbet@lwn.net>
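
resolve_symbol() now runs its lookup under module_mutex, reports failures as ERR_PTR() values instead of NULL, and copies the owner's name out while still holding the mutex, since the owning module may disappear as soon as the lock is dropped; resolve_symbol_wait() then retries for up to 30 seconds while the answer is -EBUSY, i.e. while that owner is still initializing. A userspace sketch of the copy-under-the-lock, retry-on-EBUSY-with-a-deadline shape (pthreads; every name here is illustrative):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static int owner_ready;                       /* toggled elsewhere */
static const char *owner_name = "provider";

/* Returns 0, or -EBUSY while the owner is still initializing. */
static int lookup(char *ownername, size_t len)
{
    int err;

    pthread_mutex_lock(&table_lock);
    err = owner_ready ? 0 : -EBUSY;
    /* copy what we need while the owner cannot go away */
    strncpy(ownername, owner_name, len - 1);
    ownername[len - 1] = '\0';
    pthread_mutex_unlock(&table_lock);
    return err;
}

static int lookup_wait(char *ownername, size_t len, int timeout_sec)
{
    time_t deadline = time(NULL) + timeout_sec;
    int err;

    while ((err = lookup(ownername, len)) == -EBUSY) {
        if (time(NULL) >= deadline) {
            fprintf(stderr, "gave up waiting for init of %s\n", ownername);
            break;
        }
        struct timespec ts = { 0, 10 * 1000 * 1000 };   /* 10 ms */
        nanosleep(&ts, NULL);
    }
    return err;
}

int main(void)
{
    char name[32];
    owner_ready = 1;                  /* pretend init already finished */
    printf("lookup: %d (%s)\n", lookup_wait(name, sizeof(name), 30), name);
    return 0;
}
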
@@ -1083,6 +1184,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1083 if (sattr->name == NULL) 1184 if (sattr->name == NULL)
1084 goto out; 1185 goto out;
1085 sect_attrs->nsections++; 1186 sect_attrs->nsections++;
1187 sysfs_attr_init(&sattr->mattr.attr);
1086 sattr->mattr.show = module_sect_show; 1188 sattr->mattr.show = module_sect_show;
1087 sattr->mattr.store = NULL; 1189 sattr->mattr.store = NULL;
1088 sattr->mattr.attr.name = sattr->name; 1190 sattr->mattr.attr.name = sattr->name;
@@ -1122,7 +1224,7 @@ struct module_notes_attrs {
1122 struct bin_attribute attrs[0]; 1224 struct bin_attribute attrs[0];
1123}; 1225};
1124 1226
1125static ssize_t module_notes_read(struct kobject *kobj, 1227static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
1126 struct bin_attribute *bin_attr, 1228 struct bin_attribute *bin_attr,
1127 char *buf, loff_t pos, size_t count) 1229 char *buf, loff_t pos, size_t count)
1128{ 1230{
@@ -1178,6 +1280,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1178 if (sect_empty(&sechdrs[i])) 1280 if (sect_empty(&sechdrs[i]))
1179 continue; 1281 continue;
1180 if (sechdrs[i].sh_type == SHT_NOTE) { 1282 if (sechdrs[i].sh_type == SHT_NOTE) {
1283 sysfs_bin_attr_init(nattr);
1181 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1284 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
1182 nattr->attr.mode = S_IRUGO; 1285 nattr->attr.mode = S_IRUGO;
1183 nattr->size = sechdrs[i].sh_size; 1286 nattr->size = sechdrs[i].sh_size;
@@ -1232,7 +1335,34 @@ static inline void remove_notes_attrs(struct module *mod)
1232#endif 1335#endif
1233 1336
1234#ifdef CONFIG_SYSFS 1337#ifdef CONFIG_SYSFS
1235int module_add_modinfo_attrs(struct module *mod) 1338static void add_usage_links(struct module *mod)
1339{
1340#ifdef CONFIG_MODULE_UNLOAD
1341 struct module_use *use;
1342 int nowarn;
1343
1344 mutex_lock(&module_mutex);
1345 list_for_each_entry(use, &mod->target_list, target_list) {
1346 nowarn = sysfs_create_link(use->target->holders_dir,
1347 &mod->mkobj.kobj, mod->name);
1348 }
1349 mutex_unlock(&module_mutex);
1350#endif
1351}
1352
1353static void del_usage_links(struct module *mod)
1354{
1355#ifdef CONFIG_MODULE_UNLOAD
1356 struct module_use *use;
1357
1358 mutex_lock(&module_mutex);
1359 list_for_each_entry(use, &mod->target_list, target_list)
1360 sysfs_remove_link(use->target->holders_dir, mod->name);
1361 mutex_unlock(&module_mutex);
1362#endif
1363}
1364
1365static int module_add_modinfo_attrs(struct module *mod)
1236{ 1366{
1237 struct module_attribute *attr; 1367 struct module_attribute *attr;
1238 struct module_attribute *temp_attr; 1368 struct module_attribute *temp_attr;
@@ -1250,6 +1380,7 @@ int module_add_modinfo_attrs(struct module *mod)
1250 if (!attr->test || 1380 if (!attr->test ||
1251 (attr->test && attr->test(mod))) { 1381 (attr->test && attr->test(mod))) {
1252 memcpy(temp_attr, attr, sizeof(*temp_attr)); 1382 memcpy(temp_attr, attr, sizeof(*temp_attr));
1383 sysfs_attr_init(&temp_attr->attr);
1253 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); 1384 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
1254 ++temp_attr; 1385 ++temp_attr;
1255 } 1386 }
@@ -1257,7 +1388,7 @@ int module_add_modinfo_attrs(struct module *mod)
1257 return error; 1388 return error;
1258} 1389}
1259 1390
1260void module_remove_modinfo_attrs(struct module *mod) 1391static void module_remove_modinfo_attrs(struct module *mod)
1261{ 1392{
1262 struct module_attribute *attr; 1393 struct module_attribute *attr;
1263 int i; 1394 int i;
@@ -1273,7 +1404,7 @@ void module_remove_modinfo_attrs(struct module *mod)
1273 kfree(mod->modinfo_attrs); 1404 kfree(mod->modinfo_attrs);
1274} 1405}
1275 1406
1276int mod_sysfs_init(struct module *mod) 1407static int mod_sysfs_init(struct module *mod)
1277{ 1408{
1278 int err; 1409 int err;
1279 struct kobject *kobj; 1410 struct kobject *kobj;
@@ -1307,12 +1438,16 @@ out:
1307 return err; 1438 return err;
1308} 1439}
1309 1440
1310int mod_sysfs_setup(struct module *mod, 1441static int mod_sysfs_setup(struct module *mod,
1311 struct kernel_param *kparam, 1442 struct kernel_param *kparam,
1312 unsigned int num_params) 1443 unsigned int num_params)
1313{ 1444{
1314 int err; 1445 int err;
1315 1446
1447 err = mod_sysfs_init(mod);
1448 if (err)
1449 goto out;
1450
1316 mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); 1451 mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
1317 if (!mod->holders_dir) { 1452 if (!mod->holders_dir) {
1318 err = -ENOMEM; 1453 err = -ENOMEM;
@@ -1327,6 +1462,8 @@ int mod_sysfs_setup(struct module *mod,
1327 if (err) 1462 if (err)
1328 goto out_unreg_param; 1463 goto out_unreg_param;
1329 1464
1465 add_usage_links(mod);
1466
1330 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); 1467 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1331 return 0; 1468 return 0;
1332 1469
@@ -1336,6 +1473,7 @@ out_unreg_holders:
1336 kobject_put(mod->holders_dir); 1473 kobject_put(mod->holders_dir);
1337out_unreg: 1474out_unreg:
1338 kobject_put(&mod->mkobj.kobj); 1475 kobject_put(&mod->mkobj.kobj);
1476out:
1339 return err; 1477 return err;
1340} 1478}
1341 1479
@@ -1346,14 +1484,40 @@ static void mod_sysfs_fini(struct module *mod)
1346 1484
1347#else /* CONFIG_SYSFS */ 1485#else /* CONFIG_SYSFS */
1348 1486
1487static inline int mod_sysfs_init(struct module *mod)
1488{
1489 return 0;
1490}
1491
1492static inline int mod_sysfs_setup(struct module *mod,
1493 struct kernel_param *kparam,
1494 unsigned int num_params)
1495{
1496 return 0;
1497}
1498
1499static inline int module_add_modinfo_attrs(struct module *mod)
1500{
1501 return 0;
1502}
1503
1504static inline void module_remove_modinfo_attrs(struct module *mod)
1505{
1506}
1507
1349static void mod_sysfs_fini(struct module *mod) 1508static void mod_sysfs_fini(struct module *mod)
1350{ 1509{
1351} 1510}
1352 1511
1512static void del_usage_links(struct module *mod)
1513{
1514}
1515
1353#endif /* CONFIG_SYSFS */ 1516#endif /* CONFIG_SYSFS */
1354 1517
1355static void mod_kobject_remove(struct module *mod) 1518static void mod_kobject_remove(struct module *mod)
1356{ 1519{
1520 del_usage_links(mod);
1357 module_remove_modinfo_attrs(mod); 1521 module_remove_modinfo_attrs(mod);
1358 module_param_sysfs_remove(mod); 1522 module_param_sysfs_remove(mod);
1359 kobject_put(mod->mkobj.drivers_dir); 1523 kobject_put(mod->mkobj.drivers_dir);
@@ -1372,17 +1536,22 @@ static int __unlink_module(void *_mod)
1372 return 0; 1536 return 0;
1373} 1537}
1374 1538
1375/* Free a module, remove from lists, etc (must hold module_mutex). */ 1539/* Free a module, remove from lists, etc. */
1376static void free_module(struct module *mod) 1540static void free_module(struct module *mod)
1377{ 1541{
1378 trace_module_free(mod); 1542 trace_module_free(mod);
1379 1543
1380 /* Delete from various lists */ 1544 /* Delete from various lists */
1545 mutex_lock(&module_mutex);
1381 stop_machine(__unlink_module, mod, NULL); 1546 stop_machine(__unlink_module, mod, NULL);
1547 mutex_unlock(&module_mutex);
1382 remove_notes_attrs(mod); 1548 remove_notes_attrs(mod);
1383 remove_sect_attrs(mod); 1549 remove_sect_attrs(mod);
1384 mod_kobject_remove(mod); 1550 mod_kobject_remove(mod);
1385 1551
1552 /* Remove dynamic debug info */
1553 ddebug_remove_module(mod->name);
1554
1386 /* Arch-specific cleanup. */ 1555 /* Arch-specific cleanup. */
1387 module_arch_cleanup(mod); 1556 module_arch_cleanup(mod);
1388 1557
@@ -1395,11 +1564,10 @@ static void free_module(struct module *mod)
1395 /* This may be NULL, but that's OK */ 1564 /* This may be NULL, but that's OK */
1396 module_free(mod, mod->module_init); 1565 module_free(mod, mod->module_init);
1397 kfree(mod->args); 1566 kfree(mod->args);
1398 if (mod->percpu) 1567 percpu_modfree(mod);
1399 percpu_modfree(mod->percpu); 1568#if defined(CONFIG_MODULE_UNLOAD)
1400#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
1401 if (mod->refptr) 1569 if (mod->refptr)
1402 percpu_modfree(mod->refptr); 1570 free_percpu(mod->refptr);
1403#endif 1571#endif
1404 /* Free lock-classes: */ 1572 /* Free lock-classes: */
1405 lockdep_free_key_range(mod->module_core, mod->core_size); 1573 lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -1430,6 +1598,8 @@ EXPORT_SYMBOL_GPL(__symbol_get);
1430/* 1598/*
1431 * Ensure that an exported symbol [global namespace] does not already exist 1599 * Ensure that an exported symbol [global namespace] does not already exist
1432 * in the kernel or in some other module's exported symbol table. 1600 * in the kernel or in some other module's exported symbol table.
1601 *
1602 * You must hold the module_mutex.
1433 */ 1603 */
1434static int verify_export_symbols(struct module *mod) 1604static int verify_export_symbols(struct module *mod)
1435{ 1605{
@@ -1495,27 +1665,29 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1495 break; 1665 break;
1496 1666
1497 case SHN_UNDEF: 1667 case SHN_UNDEF:
1498 ksym = resolve_symbol(sechdrs, versindex, 1668 ksym = resolve_symbol_wait(sechdrs, versindex,
1499 strtab + sym[i].st_name, mod); 1669 strtab + sym[i].st_name,
1670 mod);
1500 /* Ok if resolved. */ 1671 /* Ok if resolved. */
1501 if (ksym) { 1672 if (ksym && !IS_ERR(ksym)) {
1502 sym[i].st_value = ksym->value; 1673 sym[i].st_value = ksym->value;
1503 break; 1674 break;
1504 } 1675 }
1505 1676
1506 /* Ok if weak. */ 1677 /* Ok if weak. */
1507 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1678 if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
1508 break; 1679 break;
1509 1680
1510 printk(KERN_WARNING "%s: Unknown symbol %s\n", 1681 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
1511 mod->name, strtab + sym[i].st_name); 1682 mod->name, strtab + sym[i].st_name,
1512 ret = -ENOENT; 1683 PTR_ERR(ksym));
1684 ret = PTR_ERR(ksym) ?: -ENOENT;
1513 break; 1685 break;
1514 1686
1515 default: 1687 default:
1516 /* Divert to percpu allocation if a percpu var. */ 1688 /* Divert to percpu allocation if a percpu var. */
1517 if (sym[i].st_shndx == pcpuindex) 1689 if (sym[i].st_shndx == pcpuindex)
1518 secbase = (unsigned long)mod->percpu; 1690 secbase = (unsigned long)mod_percpu(mod);
1519 else 1691 else
1520 secbase = sechdrs[sym[i].st_shndx].sh_addr; 1692 secbase = sechdrs[sym[i].st_shndx].sh_addr;
1521 sym[i].st_value += secbase; 1693 sym[i].st_value += secbase;
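
simplify_symbols() can now tell "symbol absent" (NULL, acceptable for weak symbols) apart from "resolution failed with an errno", because resolve_symbol() encodes small negative error numbers directly in the returned pointer, the usual ERR_PTR()/IS_ERR()/PTR_ERR() convention from linux/err.h. A freestanding, simplified sketch of that convention:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_ERRNO 4095   /* top 4095 addresses are reserved for errors */

static inline void *err_ptr(long err)      { return (void *)err; }
static inline long  ptr_err(const void *p) { return (long)p; }
static inline bool  is_err(const void *p)
{
    return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static void *resolve(const char *name)
{
    if (!name)
        return NULL;                 /* absent: caller may accept (weak) */
    if (name[0] == '_')
        return err_ptr(-EBUSY);      /* owner still initializing */
    static int value = 42;
    return &value;                   /* resolved */
}

int main(void)
{
    const char *names[] = { "ok_symbol", "_not_ready", NULL };
    for (int i = 0; i < 3; i++) {
        void *sym = resolve(names[i]);
        if (sym && !is_err(sym))
            printf("%s: resolved\n", names[i]);
        else if (!sym)
            printf("(null): absent, ok if weak\n");
        else
            printf("%s: error %ld\n", names[i], ptr_err(sym));
    }
    return 0;
}
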
@@ -1892,16 +2064,24 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
1892#endif 2064#endif
1893} 2065}
1894 2066
2067static void dynamic_debug_remove(struct _ddebug *debug)
2068{
2069 if (debug)
2070 ddebug_remove_module(debug->modname);
2071}
2072
1895static void *module_alloc_update_bounds(unsigned long size) 2073static void *module_alloc_update_bounds(unsigned long size)
1896{ 2074{
1897 void *ret = module_alloc(size); 2075 void *ret = module_alloc(size);
1898 2076
1899 if (ret) { 2077 if (ret) {
2078 mutex_lock(&module_mutex);
1900 /* Update module bounds. */ 2079 /* Update module bounds. */
1901 if ((unsigned long)ret < module_addr_min) 2080 if ((unsigned long)ret < module_addr_min)
1902 module_addr_min = (unsigned long)ret; 2081 module_addr_min = (unsigned long)ret;
1903 if ((unsigned long)ret + size > module_addr_max) 2082 if ((unsigned long)ret + size > module_addr_max)
1904 module_addr_max = (unsigned long)ret + size; 2083 module_addr_max = (unsigned long)ret + size;
2084 mutex_unlock(&module_mutex);
1905 } 2085 }
1906 return ret; 2086 return ret;
1907} 2087}
@@ -1949,8 +2129,11 @@ static noinline struct module *load_module(void __user *umod,
1949 unsigned int modindex, versindex, infoindex, pcpuindex; 2129 unsigned int modindex, versindex, infoindex, pcpuindex;
1950 struct module *mod; 2130 struct module *mod;
1951 long err = 0; 2131 long err = 0;
1952 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 2132 void *ptr = NULL; /* Stops spurious gcc warning */
1953 unsigned long symoffs, stroffs, *strmap; 2133 unsigned long symoffs, stroffs, *strmap;
2134 void __percpu *percpu;
2135 struct _ddebug *debug = NULL;
2136 unsigned int num_debug = 0;
1954 2137
1955 mm_segment_t old_fs; 2138 mm_segment_t old_fs;
1956 2139
@@ -2075,11 +2258,6 @@ static noinline struct module *load_module(void __user *umod,
2075 goto free_mod; 2258 goto free_mod;
2076 } 2259 }
2077 2260
2078 if (find_module(mod->name)) {
2079 err = -EEXIST;
2080 goto free_mod;
2081 }
2082
2083 mod->state = MODULE_STATE_COMING; 2261 mod->state = MODULE_STATE_COMING;
2084 2262
2085 /* Allow arches to frob section contents and sizes. */ 2263 /* Allow arches to frob section contents and sizes. */
@@ -2089,16 +2267,14 @@ static noinline struct module *load_module(void __user *umod,
2089 2267
2090 if (pcpuindex) { 2268 if (pcpuindex) {
2091 /* We have a special allocation for this section. */ 2269 /* We have a special allocation for this section. */
2092 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, 2270 err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size,
2093 sechdrs[pcpuindex].sh_addralign, 2271 sechdrs[pcpuindex].sh_addralign);
2094 mod->name); 2272 if (err)
2095 if (!percpu) {
2096 err = -ENOMEM;
2097 goto free_mod; 2273 goto free_mod;
2098 }
2099 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2274 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2100 mod->percpu = percpu;
2101 } 2275 }
2276 /* Keep this around for failure path. */
2277 percpu = mod_percpu(mod);
2102 2278
2103 /* Determine total sizes, and put offsets in sh_entsize. For now 2279 /* Determine total sizes, and put offsets in sh_entsize. For now
2104 this is done generically; there doesn't appear to be any 2280 this is done generically; there doesn't appear to be any
@@ -2162,9 +2338,8 @@ static noinline struct module *load_module(void __user *umod,
2162 mod = (void *)sechdrs[modindex].sh_addr; 2338 mod = (void *)sechdrs[modindex].sh_addr;
2163 kmemleak_load_module(mod, hdr, sechdrs, secstrings); 2339 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2164 2340
2165#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2341#if defined(CONFIG_MODULE_UNLOAD)
2166 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2342 mod->refptr = alloc_percpu(struct module_ref);
2167 mod->name);
2168 if (!mod->refptr) { 2343 if (!mod->refptr) {
2169 err = -ENOMEM; 2344 err = -ENOMEM;
2170 goto free_init; 2345 goto free_init;
@@ -2173,11 +2348,6 @@ static noinline struct module *load_module(void __user *umod,
2173 /* Now we've moved module, initialize linked lists, etc. */ 2348 /* Now we've moved module, initialize linked lists, etc. */
2174 module_unload_init(mod); 2349 module_unload_init(mod);
2175 2350
2176 /* add kobject, so we can reference it. */
2177 err = mod_sysfs_init(mod);
2178 if (err)
2179 goto free_unload;
2180
2181 /* Set up license info based on the info section */ 2351 /* Set up license info based on the info section */
2182 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 2352 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
2183 2353
@@ -2302,18 +2472,13 @@ static noinline struct module *load_module(void __user *umod,
2302 goto cleanup; 2472 goto cleanup;
2303 } 2473 }
2304 2474
2305 /* Find duplicate symbols */
2306 err = verify_export_symbols(mod);
2307 if (err < 0)
2308 goto cleanup;
2309
2310 /* Set up and sort exception table */ 2475 /* Set up and sort exception table */
2311 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", 2476 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
2312 sizeof(*mod->extable), &mod->num_exentries); 2477 sizeof(*mod->extable), &mod->num_exentries);
2313 sort_extable(mod->extable, mod->extable + mod->num_exentries); 2478 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2314 2479
2315 /* Finally, copy percpu area over. */ 2480 /* Finally, copy percpu area over. */
2316 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 2481 percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
2317 sechdrs[pcpuindex].sh_size); 2482 sechdrs[pcpuindex].sh_size);
2318 2483
2319 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, 2484 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
@@ -2321,15 +2486,9 @@ static noinline struct module *load_module(void __user *umod,
2321 kfree(strmap); 2486 kfree(strmap);
2322 strmap = NULL; 2487 strmap = NULL;
2323 2488
2324 if (!mod->taints) { 2489 if (!mod->taints)
2325 struct _ddebug *debug;
2326 unsigned int num_debug;
2327
2328 debug = section_objs(hdr, sechdrs, secstrings, "__verbose", 2490 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2329 sizeof(*debug), &num_debug); 2491 sizeof(*debug), &num_debug);
2330 if (debug)
2331 dynamic_debug_setup(debug, num_debug);
2332 }
2333 2492
2334 err = module_finalize(hdr, sechdrs, mod); 2493 err = module_finalize(hdr, sechdrs, mod);
2335 if (err < 0) 2494 if (err < 0)
@@ -2365,7 +2524,22 @@ static noinline struct module *load_module(void __user *umod,
2365 * function to insert in a way safe to concurrent readers. 2524 * function to insert in a way safe to concurrent readers.
2366 * The mutex protects against concurrent writers. 2525 * The mutex protects against concurrent writers.
2367 */ 2526 */
2527 mutex_lock(&module_mutex);
2528 if (find_module(mod->name)) {
2529 err = -EEXIST;
2530 goto unlock;
2531 }
2532
2533 if (debug)
2534 dynamic_debug_setup(debug, num_debug);
2535
2536 /* Find duplicate symbols */
2537 err = verify_export_symbols(mod);
2538 if (err < 0)
2539 goto ddebug;
2540
2368 list_add_rcu(&mod->list, &modules); 2541 list_add_rcu(&mod->list, &modules);
2542 mutex_unlock(&module_mutex);
2369 2543
2370 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2544 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
2371 if (err < 0) 2545 if (err < 0)
@@ -2374,6 +2548,7 @@ static noinline struct module *load_module(void __user *umod,
2374 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); 2548 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
2375 if (err < 0) 2549 if (err < 0)
2376 goto unlink; 2550 goto unlink;
2551
2377 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2552 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2378 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2553 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2379 2554
@@ -2386,18 +2561,20 @@ static noinline struct module *load_module(void __user *umod,
2386 return mod; 2561 return mod;
2387 2562
2388 unlink: 2563 unlink:
2564 mutex_lock(&module_mutex);
2389 /* Unlink carefully: kallsyms could be walking list. */ 2565 /* Unlink carefully: kallsyms could be walking list. */
2390 list_del_rcu(&mod->list); 2566 list_del_rcu(&mod->list);
2567 ddebug:
2568 dynamic_debug_remove(debug);
2569 unlock:
2570 mutex_unlock(&module_mutex);
2391 synchronize_sched(); 2571 synchronize_sched();
2392 module_arch_cleanup(mod); 2572 module_arch_cleanup(mod);
2393 cleanup: 2573 cleanup:
2394 free_modinfo(mod); 2574 free_modinfo(mod);
2395 kobject_del(&mod->mkobj.kobj);
2396 kobject_put(&mod->mkobj.kobj);
2397 free_unload:
2398 module_unload_free(mod); 2575 module_unload_free(mod);
2399#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2576#if defined(CONFIG_MODULE_UNLOAD)
2400 percpu_modfree(mod->refptr); 2577 free_percpu(mod->refptr);
2401 free_init: 2578 free_init:
2402#endif 2579#endif
2403 module_free(mod, mod->module_init); 2580 module_free(mod, mod->module_init);
@@ -2405,8 +2582,7 @@ static noinline struct module *load_module(void __user *umod,
2405 module_free(mod, mod->module_core); 2582 module_free(mod, mod->module_core);
2406 /* mod will be freed with core. Don't access it beyond this line! */ 2583 /* mod will be freed with core. Don't access it beyond this line! */
2407 free_percpu: 2584 free_percpu:
2408 if (percpu) 2585 free_percpu(percpu);
2409 percpu_modfree(percpu);
2410 free_mod: 2586 free_mod:
2411 kfree(args); 2587 kfree(args);
2412 kfree(strmap); 2588 kfree(strmap);
@@ -2442,19 +2618,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2442 if (!capable(CAP_SYS_MODULE) || modules_disabled) 2618 if (!capable(CAP_SYS_MODULE) || modules_disabled)
2443 return -EPERM; 2619 return -EPERM;
2444 2620
2445 /* Only one module load at a time, please */
2446 if (mutex_lock_interruptible(&module_mutex) != 0)
2447 return -EINTR;
2448
2449 /* Do all the hard work */ 2621 /* Do all the hard work */
2450 mod = load_module(umod, len, uargs); 2622 mod = load_module(umod, len, uargs);
2451 if (IS_ERR(mod)) { 2623 if (IS_ERR(mod))
2452 mutex_unlock(&module_mutex);
2453 return PTR_ERR(mod); 2624 return PTR_ERR(mod);
2454 }
2455
2456 /* Drop lock so they can recurse */
2457 mutex_unlock(&module_mutex);
2458 2625
2459 blocking_notifier_call_chain(&module_notify_list, 2626 blocking_notifier_call_chain(&module_notify_list,
2460 MODULE_STATE_COMING, mod); 2627 MODULE_STATE_COMING, mod);
@@ -2471,9 +2638,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2471 module_put(mod); 2638 module_put(mod);
2472 blocking_notifier_call_chain(&module_notify_list, 2639 blocking_notifier_call_chain(&module_notify_list,
2473 MODULE_STATE_GOING, mod); 2640 MODULE_STATE_GOING, mod);
2474 mutex_lock(&module_mutex);
2475 free_module(mod); 2641 free_module(mod);
2476 mutex_unlock(&module_mutex);
2477 wake_up(&module_wq); 2642 wake_up(&module_wq);
2478 return ret; 2643 return ret;
2479 } 2644 }
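Taken together, the module.c hunks above narrow module_mutex: instead of sys_init_module() holding it across the whole load, load_module() now takes it only around the window in which the module becomes globally visible. A condensed sketch of that window, with the error unwinding simplified relative to the real unlock/ddebug labels:

	mutex_lock(&module_mutex);
	if (find_module(mod->name)) {		/* lost a race with a concurrent load */
		err = -EEXIST;
		goto unlock;
	}
	if (debug)
		dynamic_debug_setup(debug, num_debug);	/* only once the name is reserved */
	err = verify_export_symbols(mod);	/* reject duplicate exported symbols */
	if (err < 0)
		goto ddebug;
	list_add_rcu(&mod->list, &modules);	/* publish to lockless readers */
	mutex_unlock(&module_mutex);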
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 632f04c57d82..4c0b7b3e6d2e 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -172,6 +172,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
172 struct thread_info *owner; 172 struct thread_info *owner;
173 173
174 /* 174 /*
175 * If we own the BKL, then don't spin. The owner of
176 * the mutex might be waiting on us to release the BKL.
177 */
178 if (unlikely(current->lock_depth >= 0))
179 break;
180
181 /*
175 * If there's an owner, wait for it to either 182 * If there's an owner, wait for it to either
176 * release the lock or go to sleep. 183 * release the lock or go to sleep.
177 */ 184 */
diff --git a/kernel/notifier.c b/kernel/notifier.c
index acd24e7643eb..2488ba7eb568 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -78,10 +78,10 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
78 int ret = NOTIFY_DONE; 78 int ret = NOTIFY_DONE;
79 struct notifier_block *nb, *next_nb; 79 struct notifier_block *nb, *next_nb;
80 80
81 nb = rcu_dereference(*nl); 81 nb = rcu_dereference_raw(*nl);
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference_raw(nb->next);
85 85
86#ifdef CONFIG_DEBUG_NOTIFIERS 86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { 87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
309 * racy then it does not matter what the result of the test 309 * racy then it does not matter what the result of the test
310 * is, we re-check the list after having taken the lock anyway: 310 * is, we re-check the list after having taken the lock anyway:
311 */ 311 */
312 if (rcu_dereference(nh->head)) { 312 if (rcu_dereference_raw(nh->head)) {
313 down_read(&nh->rwsem); 313 down_read(&nh->rwsem);
314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, 314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
315 nr_calls); 315 nr_calls);
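The rcu_dereference_raw() conversions above reflect that notifier chains are not always walked under rcu_read_lock(): blocking chains are protected by nh->rwsem, and with CONFIG_PROVE_RCU a plain rcu_dereference() would trigger lockdep-RCU warnings there. A minimal sketch of the distinction, with my_head, my_rwsem and struct foo as hypothetical stand-ins:

	struct foo *p;

	rcu_read_lock();
	p = rcu_dereference(my_head);		/* checked: requires an RCU read-side section */
	rcu_read_unlock();

	down_read(&my_rwsem);			/* some protection other than RCU */
	p = rcu_dereference_raw(my_head);	/* unchecked: the caller vouches for safety */
	up_read(&my_rwsem);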
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 09b4ff9711b2..f74e6c00e26d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -13,6 +13,7 @@
13 * Pavel Emelianov <xemul@openvz.org> 13 * Pavel Emelianov <xemul@openvz.org>
14 */ 14 */
15 15
16#include <linux/slab.h>
16#include <linux/module.h> 17#include <linux/module.h>
17#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
18#include <linux/init_task.h> 19#include <linux/init_task.h>
@@ -24,7 +25,18 @@
24 25
25static struct kmem_cache *nsproxy_cachep; 26static struct kmem_cache *nsproxy_cachep;
26 27
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 28struct nsproxy init_nsproxy = {
29 .count = ATOMIC_INIT(1),
30 .uts_ns = &init_uts_ns,
31#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
32 .ipc_ns = &init_ipc_ns,
33#endif
34 .mnt_ns = NULL,
35 .pid_ns = &init_pid_ns,
36#ifdef CONFIG_NET
37 .net_ns = &init_net,
38#endif
39};
28 40
29static inline struct nsproxy *create_nsproxy(void) 41static inline struct nsproxy *create_nsproxy(void)
30{ 42{
diff --git a/kernel/padata.c b/kernel/padata.c
new file mode 100644
index 000000000000..fdd8ae609ce3
--- /dev/null
+++ b/kernel/padata.c
@@ -0,0 +1,774 @@
1/*
2 * padata.c - generic interface to process data streams in parallel
3 *
4 * Copyright (C) 2008, 2009 secunet Security Networks AG
5 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
21#include <linux/module.h>
22#include <linux/cpumask.h>
23#include <linux/err.h>
24#include <linux/cpu.h>
25#include <linux/padata.h>
26#include <linux/mutex.h>
27#include <linux/sched.h>
28#include <linux/slab.h>
29#include <linux/rcupdate.h>
30
31#define MAX_SEQ_NR INT_MAX - NR_CPUS
32#define MAX_OBJ_NUM 1000
33
34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
35{
36 int cpu, target_cpu;
37
38 target_cpu = cpumask_first(pd->cpumask);
39 for (cpu = 0; cpu < cpu_index; cpu++)
40 target_cpu = cpumask_next(target_cpu, pd->cpumask);
41
42 return target_cpu;
43}
44
45static int padata_cpu_hash(struct padata_priv *padata)
46{
47 int cpu_index;
48 struct parallel_data *pd;
49
50 pd = padata->pd;
51
52 /*
53 * Hash the sequence numbers to the cpus by taking
54 * seq_nr mod. number of cpus in use.
55 */
56 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask);
57
58 return padata_index_to_cpu(pd, cpu_index);
59}
60
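Worked example of the hashing above: with pd->cpumask covering CPUs {2, 5, 6} (num_cpus = 3), an object with seq_nr = 7 gets cpu_index = 7 % 3 = 1, and padata_index_to_cpu() walks the mask to CPU 5. Consecutive sequence numbers therefore round-robin across 2, 5, 6, 2, 5, 6, ..., which is what later lets padata_get_next() predict the sequence number each per-cpu reorder queue must produce next.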
61static void padata_parallel_worker(struct work_struct *work)
62{
63 struct padata_queue *queue;
64 struct parallel_data *pd;
65 struct padata_instance *pinst;
66 LIST_HEAD(local_list);
67
68 local_bh_disable();
69 queue = container_of(work, struct padata_queue, pwork);
70 pd = queue->pd;
71 pinst = pd->pinst;
72
73 spin_lock(&queue->parallel.lock);
74 list_replace_init(&queue->parallel.list, &local_list);
75 spin_unlock(&queue->parallel.lock);
76
77 while (!list_empty(&local_list)) {
78 struct padata_priv *padata;
79
80 padata = list_entry(local_list.next,
81 struct padata_priv, list);
82
83 list_del_init(&padata->list);
84
85 padata->parallel(padata);
86 }
87
88 local_bh_enable();
89}
90
91/**
92 * padata_do_parallel - padata parallelization function
93 *
94 * @pinst: padata instance
95 * @padata: object to be parallelized
96 * @cb_cpu: cpu the serialization callback function will run on,
97 * must be in the cpumask of padata.
98 *
99 * The parallelization callback function will run with BHs off.
100 * Note: Every object which is parallelized by padata_do_parallel
101 * must be seen by padata_do_serial.
102 */
103int padata_do_parallel(struct padata_instance *pinst,
104 struct padata_priv *padata, int cb_cpu)
105{
106 int target_cpu, err;
107 struct padata_queue *queue;
108 struct parallel_data *pd;
109
110 rcu_read_lock_bh();
111
112 pd = rcu_dereference(pinst->pd);
113
114 err = 0;
115 if (!(pinst->flags & PADATA_INIT))
116 goto out;
117
118 err = -EBUSY;
119 if ((pinst->flags & PADATA_RESET))
120 goto out;
121
122 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
123 goto out;
124
125 err = -EINVAL;
126 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
127 goto out;
128
129 err = -EINPROGRESS;
130 atomic_inc(&pd->refcnt);
131 padata->pd = pd;
132 padata->cb_cpu = cb_cpu;
133
134 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
135 atomic_set(&pd->seq_nr, -1);
136
137 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
138
139 target_cpu = padata_cpu_hash(padata);
140 queue = per_cpu_ptr(pd->queue, target_cpu);
141
142 spin_lock(&queue->parallel.lock);
143 list_add_tail(&padata->list, &queue->parallel.list);
144 spin_unlock(&queue->parallel.lock);
145
146 queue_work_on(target_cpu, pinst->wq, &queue->pwork);
147
148out:
149 rcu_read_unlock_bh();
150
151 return err;
152}
153EXPORT_SYMBOL(padata_do_parallel);
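A hedged usage sketch of the interface exported above, modeled on how a crypto-style user drives it; every my_* name is hypothetical and pinst is assumed to come from padata_alloc()/padata_start() further down in this file:

	struct my_request {
		struct padata_priv padata;
		void *payload;
	};

	static void my_parallel(struct padata_priv *padata)
	{
		struct my_request *req = container_of(padata, struct my_request, padata);

		my_do_heavy_work(req);		/* runs with BHs off on the hashed CPU */
		padata_do_serial(padata);	/* hand back for in-order completion */
	}

	static void my_serial(struct padata_priv *padata)
	{
		struct my_request *req = container_of(padata, struct my_request, padata);

		my_complete(req);		/* runs on cb_cpu, in submission order */
	}

	static int my_submit(struct padata_instance *pinst, struct my_request *req, int cb_cpu)
	{
		req->padata.parallel = my_parallel;
		req->padata.serial = my_serial;
		/* cb_cpu must be in the instance's cpumask; -EINPROGRESS means queued */
		return padata_do_parallel(pinst, &req->padata, cb_cpu);
	}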
154
155/*
156 * padata_get_next - Get the next object that needs serialization.
157 *
158 * Return values are:
159 *
160 * A pointer to the control struct of the next object that needs
161 * serialization, if present in one of the percpu reorder queues.
162 *
163 * NULL, if all percpu reorder queues are empty.
164 *
165 * -EINPROGRESS, if the next object that needs serialization will
166 * be parallel processed by another cpu and is not yet present in
167 * the cpu's reorder queue.
168 *
169 * -ENODATA, if this cpu has to do the parallel processing for
170 * the next object.
171 */
172static struct padata_priv *padata_get_next(struct parallel_data *pd)
173{
174 int cpu, num_cpus, empty, calc_seq_nr;
175 int seq_nr, next_nr, overrun, next_overrun;
176 struct padata_queue *queue, *next_queue;
177 struct padata_priv *padata;
178 struct padata_list *reorder;
179
180 empty = 0;
181 next_nr = -1;
182 next_overrun = 0;
183 next_queue = NULL;
184
185 num_cpus = cpumask_weight(pd->cpumask);
186
187 for_each_cpu(cpu, pd->cpumask) {
188 queue = per_cpu_ptr(pd->queue, cpu);
189 reorder = &queue->reorder;
190
191 /*
192 * Calculate the seq_nr of the object that should be
193 * next in this reorder queue.
194 */
195 overrun = 0;
196 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
197 + queue->cpu_index;
198
199 if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
200 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
201 overrun = 1;
202 }
203
204 if (!list_empty(&reorder->list)) {
205 padata = list_entry(reorder->list.next,
206 struct padata_priv, list);
207
208 seq_nr = padata->seq_nr;
209 BUG_ON(calc_seq_nr != seq_nr);
210 } else {
211 seq_nr = calc_seq_nr;
212 empty++;
213 }
214
215 if (next_nr < 0 || seq_nr < next_nr
216 || (next_overrun && !overrun)) {
217 next_nr = seq_nr;
218 next_overrun = overrun;
219 next_queue = queue;
220 }
221 }
222
223 padata = NULL;
224
225 if (empty == num_cpus)
226 goto out;
227
228 reorder = &next_queue->reorder;
229
230 if (!list_empty(&reorder->list)) {
231 padata = list_entry(reorder->list.next,
232 struct padata_priv, list);
233
234 if (unlikely(next_overrun)) {
235 for_each_cpu(cpu, pd->cpumask) {
236 queue = per_cpu_ptr(pd->queue, cpu);
237 atomic_set(&queue->num_obj, 0);
238 }
239 }
240
241 spin_lock(&reorder->lock);
242 list_del_init(&padata->list);
243 atomic_dec(&pd->reorder_objects);
244 spin_unlock(&reorder->lock);
245
246 atomic_inc(&next_queue->num_obj);
247
248 goto out;
249 }
250
251 queue = per_cpu_ptr(pd->queue, smp_processor_id());
252 if (queue->cpu_index == next_queue->cpu_index) {
253 padata = ERR_PTR(-ENODATA);
254 goto out;
255 }
256
257 padata = ERR_PTR(-EINPROGRESS);
258out:
259 return padata;
260}
261
262static void padata_reorder(struct parallel_data *pd)
263{
264 struct padata_priv *padata;
265 struct padata_queue *queue;
266 struct padata_instance *pinst = pd->pinst;
267
268 /*
269 * We need to ensure that only one cpu can work on dequeueing of
270 * the reorder queue at a time. Calculating in which percpu reorder
271 * queue the next object will arrive takes some time. A spinlock
272 * would be highly contended. Also it is not clear in which order
273 * the objects arrive to the reorder queues. So a cpu could wait to
274 * get the lock just to notice that there is nothing to do at the
275 * moment. Therefore we use a trylock and let the holder of the lock
276 * care for all the objects enqueued during the holdtime of the lock.
277 */
278 if (!spin_trylock_bh(&pd->lock))
279 return;
280
281 while (1) {
282 padata = padata_get_next(pd);
283
284 /*
285 * All reorder queues are empty, or the next object that needs
286 * serialization is parallel processed by another cpu and is
287 * still on its way to the cpu's reorder queue, nothing to
288 * do for now.
289 */
290 if (!padata || PTR_ERR(padata) == -EINPROGRESS)
291 break;
292
293 /*
294 * This cpu has to do the parallel processing of the next
295 * object. It's waiting in the cpu's parallelization queue,
296 * so exit immediately.
297 */
298 if (PTR_ERR(padata) == -ENODATA) {
299 del_timer(&pd->timer);
300 spin_unlock_bh(&pd->lock);
301 return;
302 }
303
304 queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
305
306 spin_lock(&queue->serial.lock);
307 list_add_tail(&padata->list, &queue->serial.list);
308 spin_unlock(&queue->serial.lock);
309
310 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork);
311 }
312
313 spin_unlock_bh(&pd->lock);
314
315 /*
316 * The next object that needs serialization might have arrived in
317 * the reorder queues in the meantime; we will be called again
318 * from the timer function if no one else cares for it.
319 */
320 if (atomic_read(&pd->reorder_objects)
321 && !(pinst->flags & PADATA_RESET))
322 mod_timer(&pd->timer, jiffies + HZ);
323 else
324 del_timer(&pd->timer);
325
326 return;
327}
328
329static void padata_reorder_timer(unsigned long arg)
330{
331 struct parallel_data *pd = (struct parallel_data *)arg;
332
333 padata_reorder(pd);
334}
335
336static void padata_serial_worker(struct work_struct *work)
337{
338 struct padata_queue *queue;
339 struct parallel_data *pd;
340 LIST_HEAD(local_list);
341
342 local_bh_disable();
343 queue = container_of(work, struct padata_queue, swork);
344 pd = queue->pd;
345
346 spin_lock(&queue->serial.lock);
347 list_replace_init(&queue->serial.list, &local_list);
348 spin_unlock(&queue->serial.lock);
349
350 while (!list_empty(&local_list)) {
351 struct padata_priv *padata;
352
353 padata = list_entry(local_list.next,
354 struct padata_priv, list);
355
356 list_del_init(&padata->list);
357
358 padata->serial(padata);
359 atomic_dec(&pd->refcnt);
360 }
361 local_bh_enable();
362}
363
364/**
365 * padata_do_serial - padata serialization function
366 *
367 * @padata: object to be serialized.
368 *
369 * padata_do_serial must be called for every parallelized object.
370 * The serialization callback function will run with BHs off.
371 */
372void padata_do_serial(struct padata_priv *padata)
373{
374 int cpu;
375 struct padata_queue *queue;
376 struct parallel_data *pd;
377
378 pd = padata->pd;
379
380 cpu = get_cpu();
381 queue = per_cpu_ptr(pd->queue, cpu);
382
383 spin_lock(&queue->reorder.lock);
384 atomic_inc(&pd->reorder_objects);
385 list_add_tail(&padata->list, &queue->reorder.list);
386 spin_unlock(&queue->reorder.lock);
387
388 put_cpu();
389
390 padata_reorder(pd);
391}
392EXPORT_SYMBOL(padata_do_serial);
393
394/* Allocate and initialize the internal cpumask-dependent resources. */
395static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
396 const struct cpumask *cpumask)
397{
398 int cpu, cpu_index, num_cpus;
399 struct padata_queue *queue;
400 struct parallel_data *pd;
401
402 cpu_index = 0;
403
404 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
405 if (!pd)
406 goto err;
407
408 pd->queue = alloc_percpu(struct padata_queue);
409 if (!pd->queue)
410 goto err_free_pd;
411
412 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
413 goto err_free_queue;
414
415 cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
416
417 for_each_cpu(cpu, pd->cpumask) {
418 queue = per_cpu_ptr(pd->queue, cpu);
419
420 queue->pd = pd;
421
422 queue->cpu_index = cpu_index;
423 cpu_index++;
424
425 INIT_LIST_HEAD(&queue->reorder.list);
426 INIT_LIST_HEAD(&queue->parallel.list);
427 INIT_LIST_HEAD(&queue->serial.list);
428 spin_lock_init(&queue->reorder.lock);
429 spin_lock_init(&queue->parallel.lock);
430 spin_lock_init(&queue->serial.lock);
431
432 INIT_WORK(&queue->pwork, padata_parallel_worker);
433 INIT_WORK(&queue->swork, padata_serial_worker);
434 atomic_set(&queue->num_obj, 0);
435 }
436
437 num_cpus = cpumask_weight(pd->cpumask);
438 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
439
440 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
441 atomic_set(&pd->seq_nr, -1);
442 atomic_set(&pd->reorder_objects, 0);
443 atomic_set(&pd->refcnt, 0);
444 pd->pinst = pinst;
445 spin_lock_init(&pd->lock);
446
447 return pd;
448
449err_free_queue:
450 free_percpu(pd->queue);
451err_free_pd:
452 kfree(pd);
453err:
454 return NULL;
455}
456
457static void padata_free_pd(struct parallel_data *pd)
458{
459 free_cpumask_var(pd->cpumask);
460 free_percpu(pd->queue);
461 kfree(pd);
462}
463
464/* Flush all objects out of the padata queues. */
465static void padata_flush_queues(struct parallel_data *pd)
466{
467 int cpu;
468 struct padata_queue *queue;
469
470 for_each_cpu(cpu, pd->cpumask) {
471 queue = per_cpu_ptr(pd->queue, cpu);
472 flush_work(&queue->pwork);
473 }
474
475 del_timer_sync(&pd->timer);
476
477 if (atomic_read(&pd->reorder_objects))
478 padata_reorder(pd);
479
480 for_each_cpu(cpu, pd->cpumask) {
481 queue = per_cpu_ptr(pd->queue, cpu);
482 flush_work(&queue->swork);
483 }
484
485 BUG_ON(atomic_read(&pd->refcnt) != 0);
486}
487
488/* Replace the internal control structure with a new one. */
489static void padata_replace(struct padata_instance *pinst,
490 struct parallel_data *pd_new)
491{
492 struct parallel_data *pd_old = pinst->pd;
493
494 pinst->flags |= PADATA_RESET;
495
496 rcu_assign_pointer(pinst->pd, pd_new);
497
498 synchronize_rcu();
499
500 padata_flush_queues(pd_old);
501 padata_free_pd(pd_old);
502
503 pinst->flags &= ~PADATA_RESET;
504}
505
506/**
507 * padata_set_cpumask - set the cpumask that padata should use
508 *
509 * @pinst: padata instance
510 * @cpumask: the cpumask to use
511 */
512int padata_set_cpumask(struct padata_instance *pinst,
513 cpumask_var_t cpumask)
514{
515 struct parallel_data *pd;
516 int err = 0;
517
518 mutex_lock(&pinst->lock);
519
520 get_online_cpus();
521
522 pd = padata_alloc_pd(pinst, cpumask);
523 if (!pd) {
524 err = -ENOMEM;
525 goto out;
526 }
527
528 cpumask_copy(pinst->cpumask, cpumask);
529
530 padata_replace(pinst, pd);
531
532out:
533 put_online_cpus();
534
535 mutex_unlock(&pinst->lock);
536
537 return err;
538}
539EXPORT_SYMBOL(padata_set_cpumask);
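A short sketch of reconfiguring a live instance with the call exported above (error handling trimmed, pinst assumed valid):

	cpumask_var_t new_mask;
	int err = -ENOMEM;

	if (alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
		cpumask_copy(new_mask, cpumask_of_node(0));	/* e.g. restrict to one node */
		err = padata_set_cpumask(pinst, new_mask);	/* swaps in a fresh parallel_data */
		free_cpumask_var(new_mask);
	}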
540
541static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
542{
543 struct parallel_data *pd;
544
545 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
546 pd = padata_alloc_pd(pinst, pinst->cpumask);
547 if (!pd)
548 return -ENOMEM;
549
550 padata_replace(pinst, pd);
551 }
552
553 return 0;
554}
555
556/**
557 * padata_add_cpu - add a cpu to the padata cpumask
558 *
559 * @pinst: padata instance
560 * @cpu: cpu to add
561 */
562int padata_add_cpu(struct padata_instance *pinst, int cpu)
563{
564 int err;
565
566 mutex_lock(&pinst->lock);
567
568 get_online_cpus();
569 cpumask_set_cpu(cpu, pinst->cpumask);
570 err = __padata_add_cpu(pinst, cpu);
571 put_online_cpus();
572
573 mutex_unlock(&pinst->lock);
574
575 return err;
576}
577EXPORT_SYMBOL(padata_add_cpu);
578
579static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
580{
581 struct parallel_data *pd;
582
583 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
584 pd = padata_alloc_pd(pinst, pinst->cpumask);
585 if (!pd)
586 return -ENOMEM;
587
588 padata_replace(pinst, pd);
589 }
590
591 return 0;
592}
593
594/**
595 * padata_remove_cpu - remove a cpu from the padata cpumask
596 *
597 * @pinst: padata instance
598 * @cpu: cpu to remove
599 */
600int padata_remove_cpu(struct padata_instance *pinst, int cpu)
601{
602 int err;
603
604 mutex_lock(&pinst->lock);
605
606 get_online_cpus();
607 cpumask_clear_cpu(cpu, pinst->cpumask);
608 err = __padata_remove_cpu(pinst, cpu);
609 put_online_cpus();
610
611 mutex_unlock(&pinst->lock);
612
613 return err;
614}
615EXPORT_SYMBOL(padata_remove_cpu);
616
617/**
618 * padata_start - start the parallel processing
619 *
620 * @pinst: padata instance to start
621 */
622void padata_start(struct padata_instance *pinst)
623{
624 mutex_lock(&pinst->lock);
625 pinst->flags |= PADATA_INIT;
626 mutex_unlock(&pinst->lock);
627}
628EXPORT_SYMBOL(padata_start);
629
630/**
631 * padata_stop - stop the parallel processing
632 *
633 * @pinst: padata instance to stop
634 */
635void padata_stop(struct padata_instance *pinst)
636{
637 mutex_lock(&pinst->lock);
638 pinst->flags &= ~PADATA_INIT;
639 mutex_unlock(&pinst->lock);
640}
641EXPORT_SYMBOL(padata_stop);
642
643#ifdef CONFIG_HOTPLUG_CPU
644static int padata_cpu_callback(struct notifier_block *nfb,
645 unsigned long action, void *hcpu)
646{
647 int err;
648 struct padata_instance *pinst;
649 int cpu = (unsigned long)hcpu;
650
651 pinst = container_of(nfb, struct padata_instance, cpu_notifier);
652
653 switch (action) {
654 case CPU_ONLINE:
655 case CPU_ONLINE_FROZEN:
656 if (!cpumask_test_cpu(cpu, pinst->cpumask))
657 break;
658 mutex_lock(&pinst->lock);
659 err = __padata_add_cpu(pinst, cpu);
660 mutex_unlock(&pinst->lock);
661 if (err)
662 return notifier_from_errno(err);
663 break;
664
665 case CPU_DOWN_PREPARE:
666 case CPU_DOWN_PREPARE_FROZEN:
667 if (!cpumask_test_cpu(cpu, pinst->cpumask))
668 break;
669 mutex_lock(&pinst->lock);
670 err = __padata_remove_cpu(pinst, cpu);
671 mutex_unlock(&pinst->lock);
672 if (err)
673 return notifier_from_errno(err);
674 break;
675
676 case CPU_UP_CANCELED:
677 case CPU_UP_CANCELED_FROZEN:
678 if (!cpumask_test_cpu(cpu, pinst->cpumask))
679 break;
680 mutex_lock(&pinst->lock);
681 __padata_remove_cpu(pinst, cpu);
682 mutex_unlock(&pinst->lock);
683
684 case CPU_DOWN_FAILED:
685 case CPU_DOWN_FAILED_FROZEN:
686 if (!cpumask_test_cpu(cpu, pinst->cpumask))
687 break;
688 mutex_lock(&pinst->lock);
689 __padata_add_cpu(pinst, cpu);
690 mutex_unlock(&pinst->lock);
691 }
692
693 return NOTIFY_OK;
694}
695#endif
696
697/**
698 * padata_alloc - allocate and initialize a padata instance
699 *
700 * @cpumask: cpumask that padata uses for parallelization
701 * @wq: workqueue to use for the allocated padata instance
702 */
703struct padata_instance *padata_alloc(const struct cpumask *cpumask,
704 struct workqueue_struct *wq)
705{
706 struct padata_instance *pinst;
707 struct parallel_data *pd;
708
709 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
710 if (!pinst)
711 goto err;
712
713 get_online_cpus();
714
715 pd = padata_alloc_pd(pinst, cpumask);
716 if (!pd)
717 goto err_free_inst;
718
719 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
720 goto err_free_pd;
721
722 rcu_assign_pointer(pinst->pd, pd);
723
724 pinst->wq = wq;
725
726 cpumask_copy(pinst->cpumask, cpumask);
727
728 pinst->flags = 0;
729
730#ifdef CONFIG_HOTPLUG_CPU
731 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
732 pinst->cpu_notifier.priority = 0;
733 register_hotcpu_notifier(&pinst->cpu_notifier);
734#endif
735
736 put_online_cpus();
737
738 mutex_init(&pinst->lock);
739
740 return pinst;
741
742err_free_pd:
743 padata_free_pd(pd);
744err_free_inst:
745 kfree(pinst);
746 put_online_cpus();
747err:
748 return NULL;
749}
750EXPORT_SYMBOL(padata_alloc);
751
752/**
753 * padata_free - free a padata instance
754 *
755 * @padata_inst: padata instance to free
756 */
757void padata_free(struct padata_instance *pinst)
758{
759 padata_stop(pinst);
760
761 synchronize_rcu();
762
763#ifdef CONFIG_HOTPLUG_CPU
764 unregister_hotcpu_notifier(&pinst->cpu_notifier);
765#endif
766 get_online_cpus();
767 padata_flush_queues(pinst->pd);
768 put_online_cpus();
769
770 padata_free_pd(pinst->pd);
771 free_cpumask_var(pinst->cpumask);
772 kfree(pinst);
773}
774EXPORT_SYMBOL(padata_free);
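An end-to-end sketch of instance lifetime using the calls exported above; my_wq and my_pinst are hypothetical module-level variables:

	static struct workqueue_struct *my_wq;
	static struct padata_instance *my_pinst;

	static int __init my_init(void)
	{
		my_wq = create_workqueue("my_padata");
		if (!my_wq)
			return -ENOMEM;

		my_pinst = padata_alloc(cpu_possible_mask, my_wq);
		if (!my_pinst) {
			destroy_workqueue(my_wq);
			return -ENOMEM;
		}

		padata_start(my_pinst);		/* sets PADATA_INIT; submissions may begin */
		return 0;
	}

	static void __exit my_exit(void)
	{
		padata_stop(my_pinst);		/* clears PADATA_INIT; new submissions fail */
		padata_free(my_pinst);		/* flushes queues and frees parallel_data */
		destroy_workqueue(my_wq);
	}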
diff --git a/kernel/panic.c b/kernel/panic.c
index c787333282b8..3b16cd93fa7d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -36,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
36 36
37EXPORT_SYMBOL(panic_notifier_list); 37EXPORT_SYMBOL(panic_notifier_list);
38 38
39static long no_blink(long time)
40{
41 return 0;
42}
43
44/* Returns how long it waited in ms */ 39/* Returns how long it waited in ms */
45long (*panic_blink)(long time); 40long (*panic_blink)(long time);
46EXPORT_SYMBOL(panic_blink); 41EXPORT_SYMBOL(panic_blink);
47 42
43static void panic_blink_one_second(void)
44{
45 static long i = 0, end;
46
47 if (panic_blink) {
48 end = i + MSEC_PER_SEC;
49
50 while (i < end) {
51 i += panic_blink(i);
52 mdelay(1);
53 i++;
54 }
55 } else {
56 /*
57 * When running under a hypervisor, a small mdelay may get
58 * rounded up to the hypervisor timeslice. For example, with
59 * a 1ms in 10ms hypervisor timeslice we might inflate an
60 * mdelay(1) loop by 10x.
61 *
62 * If we have nothing to blink, spin on 1 second calls to
63 * mdelay to avoid this.
64 */
65 mdelay(MSEC_PER_SEC);
66 }
67}
68
48/** 69/**
49 * panic - halt the system 70 * panic - halt the system
50 * @fmt: The text string to print 71 * @fmt: The text string to print
@@ -66,6 +87,7 @@ NORET_TYPE void panic(const char * fmt, ...)
66 */ 87 */
67 preempt_disable(); 88 preempt_disable();
68 89
90 console_verbose();
69 bust_spinlocks(1); 91 bust_spinlocks(1);
70 va_start(args, fmt); 92 va_start(args, fmt);
71 vsnprintf(buf, sizeof(buf), fmt, args); 93 vsnprintf(buf, sizeof(buf), fmt, args);
@@ -95,9 +117,6 @@ NORET_TYPE void panic(const char * fmt, ...)
95 117
96 bust_spinlocks(0); 118 bust_spinlocks(0);
97 119
98 if (!panic_blink)
99 panic_blink = no_blink;
100
101 if (panic_timeout > 0) { 120 if (panic_timeout > 0) {
102 /* 121 /*
103 * Delay timeout seconds before rebooting the machine. 122 * Delay timeout seconds before rebooting the machine.
@@ -105,11 +124,9 @@ NORET_TYPE void panic(const char * fmt, ...)
105 */ 124 */
106 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 125 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
107 126
108 for (i = 0; i < panic_timeout*1000; ) { 127 for (i = 0; i < panic_timeout; i++) {
109 touch_nmi_watchdog(); 128 touch_nmi_watchdog();
110 i += panic_blink(i); 129 panic_blink_one_second();
111 mdelay(1);
112 i++;
113 } 130 }
114 /* 131 /*
115 * This will not be a clean reboot, with everything 132 * This will not be a clean reboot, with everything
@@ -135,11 +152,9 @@ NORET_TYPE void panic(const char * fmt, ...)
135 } 152 }
136#endif 153#endif
137 local_irq_enable(); 154 local_irq_enable();
138 for (i = 0; ; ) { 155 while (1) {
139 touch_softlockup_watchdog(); 156 touch_softlockup_watchdog();
140 i += panic_blink(i); 157 panic_blink_one_second();
141 mdelay(1);
142 i++;
143 } 158 }
144} 159}
145 160
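After the refactoring above a board or LED driver hooks the same panic_blink pointer; the callback's return value still reports how many extra milliseconds it spent, so panic_blink_one_second() keeps roughly one-second pacing. A hypothetical sketch:

	static long my_board_panic_blink(long time)
	{
		my_board_led_toggle();		/* assumed board-specific helper */
		return 0;			/* extra ms consumed beyond the loop's mdelay(1) */
	}

	static int __init my_board_init(void)
	{
		panic_blink = my_board_panic_blink;
		return 0;
	}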
@@ -164,6 +179,7 @@ static const struct tnt tnts[] = {
164 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, 179 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
165 { TAINT_WARN, 'W', ' ' }, 180 { TAINT_WARN, 'W', ' ' },
166 { TAINT_CRAP, 'C', ' ' }, 181 { TAINT_CRAP, 'C', ' ' },
182 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
167}; 183};
168 184
169/** 185/**
@@ -180,6 +196,7 @@ static const struct tnt tnts[] = {
180 * 'A' - ACPI table overridden. 196 * 'A' - ACPI table overridden.
181 * 'W' - Taint on warning. 197 * 'W' - Taint on warning.
182 * 'C' - modules from drivers/staging are loaded. 198 * 'C' - modules from drivers/staging are loaded.
199 * 'I' - Working around severe firmware bug.
183 * 200 *
184 * The string is overwritten by the next call to print_tainted(). 201 * The string is overwritten by the next call to print_tainted().
185 */ 202 */
@@ -351,7 +368,8 @@ struct slowpath_args {
351 va_list args; 368 va_list args;
352}; 369};
353 370
354static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args) 371static void warn_slowpath_common(const char *file, int line, void *caller,
372 unsigned taint, struct slowpath_args *args)
355{ 373{
356 const char *board; 374 const char *board;
357 375
@@ -367,7 +385,7 @@ static void warn_slowpath_common(const char *file, int line, void *caller, struc
367 print_modules(); 385 print_modules();
368 dump_stack(); 386 dump_stack();
369 print_oops_end_marker(); 387 print_oops_end_marker();
370 add_taint(TAINT_WARN); 388 add_taint(taint);
371} 389}
372 390
373void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) 391void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
@@ -376,14 +394,29 @@ void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
376 394
377 args.fmt = fmt; 395 args.fmt = fmt;
378 va_start(args.args, fmt); 396 va_start(args.args, fmt);
379 warn_slowpath_common(file, line, __builtin_return_address(0), &args); 397 warn_slowpath_common(file, line, __builtin_return_address(0),
398 TAINT_WARN, &args);
380 va_end(args.args); 399 va_end(args.args);
381} 400}
382EXPORT_SYMBOL(warn_slowpath_fmt); 401EXPORT_SYMBOL(warn_slowpath_fmt);
383 402
403void warn_slowpath_fmt_taint(const char *file, int line,
404 unsigned taint, const char *fmt, ...)
405{
406 struct slowpath_args args;
407
408 args.fmt = fmt;
409 va_start(args.args, fmt);
410 warn_slowpath_common(file, line, __builtin_return_address(0),
411 taint, &args);
412 va_end(args.args);
413}
414EXPORT_SYMBOL(warn_slowpath_fmt_taint);
415
384void warn_slowpath_null(const char *file, int line) 416void warn_slowpath_null(const char *file, int line)
385{ 417{
386 warn_slowpath_common(file, line, __builtin_return_address(0), NULL); 418 warn_slowpath_common(file, line, __builtin_return_address(0),
419 TAINT_WARN, NULL);
387} 420}
388EXPORT_SYMBOL(warn_slowpath_null); 421EXPORT_SYMBOL(warn_slowpath_null);
389#endif 422#endif
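warn_slowpath_fmt_taint() lets a warning apply a taint other than 'W'; callers normally reach it through the WARN_TAINT() wrapper introduced alongside it (assumed available here). A hedged example with hypothetical fw_rev/MIN_FW_REV values, using the new TAINT_FIRMWARE_WORKAROUND flag:

	WARN_TAINT(fw_rev < MIN_FW_REV, TAINT_FIRMWARE_WORKAROUND,
		   "firmware rev %u is below %u, enabling workaround\n",
		   fw_rev, MIN_FW_REV);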
diff --git a/kernel/params.c b/kernel/params.c
index cf1b69183127..0b30ecd53a52 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,7 +24,6 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27#include <linux/string.h>
28 27
29#if 0 28#if 0
30#define DEBUGP printk 29#define DEBUGP printk
@@ -402,8 +401,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
402} 401}
403 402
404/* sysfs output in /sys/modules/XYZ/parameters/ */ 403/* sysfs output in /sys/modules/XYZ/parameters/ */
405#define to_module_attr(n) container_of(n, struct module_attribute, attr); 404#define to_module_attr(n) container_of(n, struct module_attribute, attr)
406#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); 405#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
407 406
408extern struct kernel_param __start___param[], __stop___param[]; 407extern struct kernel_param __start___param[], __stop___param[];
409 408
@@ -421,7 +420,7 @@ struct module_param_attrs
421}; 420};
422 421
423#ifdef CONFIG_SYSFS 422#ifdef CONFIG_SYSFS
424#define to_param_attr(n) container_of(n, struct param_attribute, mattr); 423#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
425 424
426static ssize_t param_attr_show(struct module_attribute *mattr, 425static ssize_t param_attr_show(struct module_attribute *mattr,
427 struct module *mod, char *buf) 426 struct module *mod, char *buf)
@@ -517,6 +516,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
517 new->grp.attrs = attrs; 516 new->grp.attrs = attrs;
518 517
519 /* Tack new one on the end. */ 518 /* Tack new one on the end. */
519 sysfs_attr_init(&new->attrs[num].mattr.attr);
520 new->attrs[num].param = kp; 520 new->attrs[num].param = kp;
521 new->attrs[num].mattr.show = param_attr_show; 521 new->attrs[num].mattr.show = param_attr_show;
522 new->attrs[num].mattr.store = param_attr_store; 522 new->attrs[num].mattr.store = param_attr_store;
@@ -723,7 +723,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
723 return ret; 723 return ret;
724} 724}
725 725
726static struct sysfs_ops module_sysfs_ops = { 726static const struct sysfs_ops module_sysfs_ops = {
727 .show = module_attr_show, 727 .show = module_attr_show,
728 .store = module_attr_store, 728 .store = module_attr_store,
729}; 729};
@@ -737,7 +737,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj)
737 return 0; 737 return 0;
738} 738}
739 739
740static struct kset_uevent_ops module_uevent_ops = { 740static const struct kset_uevent_ops module_uevent_ops = {
741 .filter = uevent_filter, 741 .filter = uevent_filter,
742}; 742};
743 743
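The dropped trailing semicolons in the params.c container_of() macros matter because those macros expand to expressions; with the semicolon they only work as complete statements. A standalone illustration with a hypothetical struct foo:

	struct foo {
		struct kobject kobj;
		int count;
	};

	#define to_foo_old(k)	container_of(k, struct foo, kobj);	/* trailing ';' */
	#define to_foo(k)	container_of(k, struct foo, kobj)

	static int foo_count(struct kobject *k)
	{
		return to_foo(k)->count;
		/* return to_foo_old(k)->count; would not compile: the stray ';'
		 * in the expansion ends the return statement before "->count". */
	}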
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index d27746bd3a06..ff86c558af4c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -15,6 +15,8 @@
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/file.h> 16#include <linux/file.h>
17#include <linux/poll.h> 17#include <linux/poll.h>
18#include <linux/slab.h>
19#include <linux/hash.h>
18#include <linux/sysfs.h> 20#include <linux/sysfs.h>
19#include <linux/dcache.h> 21#include <linux/dcache.h>
20#include <linux/percpu.h> 22#include <linux/percpu.h>
@@ -56,21 +58,6 @@ static atomic_t nr_task_events __read_mostly;
56 */ 58 */
57int sysctl_perf_event_paranoid __read_mostly = 1; 59int sysctl_perf_event_paranoid __read_mostly = 1;
58 60
59static inline bool perf_paranoid_tracepoint_raw(void)
60{
61 return sysctl_perf_event_paranoid > -1;
62}
63
64static inline bool perf_paranoid_cpu(void)
65{
66 return sysctl_perf_event_paranoid > 0;
67}
68
69static inline bool perf_paranoid_kernel(void)
70{
71 return sysctl_perf_event_paranoid > 1;
72}
73
74int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 61int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
75 62
76/* 63/*
@@ -96,40 +83,19 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
96void __weak hw_perf_disable(void) { barrier(); } 83void __weak hw_perf_disable(void) { barrier(); }
97void __weak hw_perf_enable(void) { barrier(); } 84void __weak hw_perf_enable(void) { barrier(); }
98 85
99void __weak hw_perf_event_setup(int cpu) { barrier(); }
100void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
101
102int __weak
103hw_perf_group_sched_in(struct perf_event *group_leader,
104 struct perf_cpu_context *cpuctx,
105 struct perf_event_context *ctx, int cpu)
106{
107 return 0;
108}
109
110void __weak perf_event_print_debug(void) { } 86void __weak perf_event_print_debug(void) { }
111 87
112static DEFINE_PER_CPU(int, perf_disable_count); 88static DEFINE_PER_CPU(int, perf_disable_count);
113 89
114void __perf_disable(void)
115{
116 __get_cpu_var(perf_disable_count)++;
117}
118
119bool __perf_enable(void)
120{
121 return !--__get_cpu_var(perf_disable_count);
122}
123
124void perf_disable(void) 90void perf_disable(void)
125{ 91{
126 __perf_disable(); 92 if (!__get_cpu_var(perf_disable_count)++)
127 hw_perf_disable(); 93 hw_perf_disable();
128} 94}
129 95
130void perf_enable(void) 96void perf_enable(void)
131{ 97{
132 if (__perf_enable()) 98 if (!--__get_cpu_var(perf_disable_count))
133 hw_perf_enable(); 99 hw_perf_enable();
134} 100}
135 101
@@ -248,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
248 214
249static inline u64 perf_clock(void) 215static inline u64 perf_clock(void)
250{ 216{
251 return cpu_clock(smp_processor_id()); 217 return cpu_clock(raw_smp_processor_id());
252} 218}
253 219
254/* 220/*
@@ -290,24 +256,49 @@ static void update_event_times(struct perf_event *event)
290} 256}
291 257
292/* 258/*
259 * Update total_time_enabled and total_time_running for all events in a group.
260 */
261static void update_group_times(struct perf_event *leader)
262{
263 struct perf_event *event;
264
265 update_event_times(leader);
266 list_for_each_entry(event, &leader->sibling_list, group_entry)
267 update_event_times(event);
268}
269
270static struct list_head *
271ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
272{
273 if (event->attr.pinned)
274 return &ctx->pinned_groups;
275 else
276 return &ctx->flexible_groups;
277}
278
279/*
293 * Add an event to the lists for its context. 280 * Add an event to the lists for its context.
294 * Must be called with ctx->mutex and ctx->lock held. 281 * Must be called with ctx->mutex and ctx->lock held.
295 */ 282 */
296static void 283static void
297list_add_event(struct perf_event *event, struct perf_event_context *ctx) 284list_add_event(struct perf_event *event, struct perf_event_context *ctx)
298{ 285{
299 struct perf_event *group_leader = event->group_leader; 286 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
287 event->attach_state |= PERF_ATTACH_CONTEXT;
300 288
301 /* 289 /*
302 * Depending on whether it is a standalone or sibling event, 290 * If we're a stand alone event or group leader, we go to the context
303 * add it straight to the context's event list, or to the group 291 * list, group events are kept attached to the group so that
304 * leader's sibling list: 292 * perf_group_detach can, at all times, locate all siblings.
305 */ 293 */
306 if (group_leader == event) 294 if (event->group_leader == event) {
307 list_add_tail(&event->group_entry, &ctx->group_list); 295 struct list_head *list;
308 else { 296
309 list_add_tail(&event->group_entry, &group_leader->sibling_list); 297 if (is_software_event(event))
310 group_leader->nr_siblings++; 298 event->group_flags |= PERF_GROUP_SOFTWARE;
299
300 list = ctx_group_list(event, ctx);
301 list_add_tail(&event->group_entry, list);
311 } 302 }
312 303
313 list_add_rcu(&event->event_entry, &ctx->event_list); 304 list_add_rcu(&event->event_entry, &ctx->event_list);
@@ -316,6 +307,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
316 ctx->nr_stat++; 307 ctx->nr_stat++;
317} 308}
318 309
310static void perf_group_attach(struct perf_event *event)
311{
312 struct perf_event *group_leader = event->group_leader;
313
314 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
315 event->attach_state |= PERF_ATTACH_GROUP;
316
317 if (group_leader == event)
318 return;
319
320 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
321 !is_software_event(event))
322 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
323
324 list_add_tail(&event->group_entry, &group_leader->sibling_list);
325 group_leader->nr_siblings++;
326}
327
319/* 328/*
320 * Remove an event from the lists for its context. 329 * Remove an event from the lists for its context.
321 * Must be called with ctx->mutex and ctx->lock held. 330 * Must be called with ctx->mutex and ctx->lock held.
@@ -323,21 +332,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
323static void 332static void
324list_del_event(struct perf_event *event, struct perf_event_context *ctx) 333list_del_event(struct perf_event *event, struct perf_event_context *ctx)
325{ 334{
326 struct perf_event *sibling, *tmp; 335 /*
327 336 * We can have double detach due to exit/hot-unplug + close.
328 if (list_empty(&event->group_entry)) 337 */
338 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
329 return; 339 return;
340
341 event->attach_state &= ~PERF_ATTACH_CONTEXT;
342
330 ctx->nr_events--; 343 ctx->nr_events--;
331 if (event->attr.inherit_stat) 344 if (event->attr.inherit_stat)
332 ctx->nr_stat--; 345 ctx->nr_stat--;
333 346
334 list_del_init(&event->group_entry);
335 list_del_rcu(&event->event_entry); 347 list_del_rcu(&event->event_entry);
336 348
337 if (event->group_leader != event) 349 if (event->group_leader == event)
338 event->group_leader->nr_siblings--; 350 list_del_init(&event->group_entry);
339 351
340 update_event_times(event); 352 update_group_times(event);
341 353
342 /* 354 /*
343 * If event was in error state, then keep it 355 * If event was in error state, then keep it
@@ -348,16 +360,45 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
348 */ 360 */
349 if (event->state > PERF_EVENT_STATE_OFF) 361 if (event->state > PERF_EVENT_STATE_OFF)
350 event->state = PERF_EVENT_STATE_OFF; 362 event->state = PERF_EVENT_STATE_OFF;
363}
364
365static void perf_group_detach(struct perf_event *event)
366{
367 struct perf_event *sibling, *tmp;
368 struct list_head *list = NULL;
369
370 /*
371 * We can have double detach due to exit/hot-unplug + close.
372 */
373 if (!(event->attach_state & PERF_ATTACH_GROUP))
374 return;
375
376 event->attach_state &= ~PERF_ATTACH_GROUP;
377
378 /*
379 * If this is a sibling, remove it from its group.
380 */
381 if (event->group_leader != event) {
382 list_del_init(&event->group_entry);
383 event->group_leader->nr_siblings--;
384 return;
385 }
386
387 if (!list_empty(&event->group_entry))
388 list = &event->group_entry;
351 389
352 /* 390 /*
353 * If this was a group event with sibling events then 391 * If this was a group event with sibling events then
354 * upgrade the siblings to singleton events by adding them 392 * upgrade the siblings to singleton events by adding them
355 * to the context list directly: 393 * to whatever list we are on.
356 */ 394 */
357 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 395 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
358 396 if (list)
359 list_move_tail(&sibling->group_entry, &ctx->group_list); 397 list_move_tail(&sibling->group_entry, list);
360 sibling->group_leader = sibling; 398 sibling->group_leader = sibling;
399
400 /* Inherit group flags from the previous leader */
401 sibling->group_flags = event->group_flags;
361 } 402 }
362} 403}
363 404
@@ -508,18 +549,6 @@ retry:
508} 549}
509 550
510/* 551/*
511 * Update total_time_enabled and total_time_running for all events in a group.
512 */
513static void update_group_times(struct perf_event *leader)
514{
515 struct perf_event *event;
516
517 update_event_times(leader);
518 list_for_each_entry(event, &leader->sibling_list, group_entry)
519 update_event_times(event);
520}
521
522/*
523 * Cross CPU call to disable a performance event 552 * Cross CPU call to disable a performance event
524 */ 553 */
525static void __perf_event_disable(void *info) 554static void __perf_event_disable(void *info)
@@ -608,14 +637,13 @@ void perf_event_disable(struct perf_event *event)
608static int 637static int
609event_sched_in(struct perf_event *event, 638event_sched_in(struct perf_event *event,
610 struct perf_cpu_context *cpuctx, 639 struct perf_cpu_context *cpuctx,
611 struct perf_event_context *ctx, 640 struct perf_event_context *ctx)
612 int cpu)
613{ 641{
614 if (event->state <= PERF_EVENT_STATE_OFF) 642 if (event->state <= PERF_EVENT_STATE_OFF)
615 return 0; 643 return 0;
616 644
617 event->state = PERF_EVENT_STATE_ACTIVE; 645 event->state = PERF_EVENT_STATE_ACTIVE;
618 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ 646 event->oncpu = smp_processor_id();
619 /* 647 /*
620 * The new state must be visible before we turn it on in the hardware: 648 * The new state must be visible before we turn it on in the hardware:
621 */ 649 */
@@ -642,33 +670,47 @@ event_sched_in(struct perf_event *event,
642static int 670static int
643group_sched_in(struct perf_event *group_event, 671group_sched_in(struct perf_event *group_event,
644 struct perf_cpu_context *cpuctx, 672 struct perf_cpu_context *cpuctx,
645 struct perf_event_context *ctx, 673 struct perf_event_context *ctx)
646 int cpu)
647{ 674{
648 struct perf_event *event, *partial_group; 675 struct perf_event *event, *partial_group = NULL;
676 const struct pmu *pmu = group_event->pmu;
677 bool txn = false;
649 int ret; 678 int ret;
650 679
651 if (group_event->state == PERF_EVENT_STATE_OFF) 680 if (group_event->state == PERF_EVENT_STATE_OFF)
652 return 0; 681 return 0;
653 682
654 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); 683 /* Check if group transaction available */
655 if (ret) 684 if (pmu->start_txn)
656 return ret < 0 ? ret : 0; 685 txn = true;
657 686
658 if (event_sched_in(group_event, cpuctx, ctx, cpu)) 687 if (txn)
688 pmu->start_txn(pmu);
689
690 if (event_sched_in(group_event, cpuctx, ctx)) {
691 if (txn)
692 pmu->cancel_txn(pmu);
659 return -EAGAIN; 693 return -EAGAIN;
694 }
660 695
661 /* 696 /*
662 * Schedule in siblings as one group (if any): 697 * Schedule in siblings as one group (if any):
663 */ 698 */
664 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 699 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
665 if (event_sched_in(event, cpuctx, ctx, cpu)) { 700 if (event_sched_in(event, cpuctx, ctx)) {
666 partial_group = event; 701 partial_group = event;
667 goto group_error; 702 goto group_error;
668 } 703 }
669 } 704 }
670 705
671 return 0; 706 if (!txn)
707 return 0;
708
709 ret = pmu->commit_txn(pmu);
710 if (!ret) {
711 pmu->cancel_txn(pmu);
712 return 0;
713 }
672 714
673group_error: 715group_error:
674 /* 716 /*
@@ -682,25 +724,10 @@ group_error:
682 } 724 }
683 event_sched_out(group_event, cpuctx, ctx); 725 event_sched_out(group_event, cpuctx, ctx);
684 726
685 return -EAGAIN; 727 if (txn)
686} 728 pmu->cancel_txn(pmu);
687
688/*
689 * Return 1 for a group consisting entirely of software events,
690 * 0 if the group contains any hardware events.
691 */
692static int is_software_only_group(struct perf_event *leader)
693{
694 struct perf_event *event;
695
696 if (!is_software_event(leader))
697 return 0;
698
699 list_for_each_entry(event, &leader->sibling_list, group_entry)
700 if (!is_software_event(event))
701 return 0;
702 729
703 return 1; 730 return -EAGAIN;
704} 731}
705 732
706/* 733/*
@@ -713,7 +740,7 @@ static int group_can_go_on(struct perf_event *event,
713 /* 740 /*
714 * Groups consisting entirely of software events can always go on. 741 * Groups consisting entirely of software events can always go on.
715 */ 742 */
716 if (is_software_only_group(event)) 743 if (event->group_flags & PERF_GROUP_SOFTWARE)
717 return 1; 744 return 1;
718 /* 745 /*
719 * If an exclusive group is already on, no other hardware 746 * If an exclusive group is already on, no other hardware
@@ -738,6 +765,7 @@ static void add_event_to_ctx(struct perf_event *event,
738 struct perf_event_context *ctx) 765 struct perf_event_context *ctx)
739{ 766{
740 list_add_event(event, ctx); 767 list_add_event(event, ctx);
768 perf_group_attach(event);
741 event->tstamp_enabled = ctx->time; 769 event->tstamp_enabled = ctx->time;
742 event->tstamp_running = ctx->time; 770 event->tstamp_running = ctx->time;
743 event->tstamp_stopped = ctx->time; 771 event->tstamp_stopped = ctx->time;
@@ -754,7 +782,6 @@ static void __perf_install_in_context(void *info)
754 struct perf_event *event = info; 782 struct perf_event *event = info;
755 struct perf_event_context *ctx = event->ctx; 783 struct perf_event_context *ctx = event->ctx;
756 struct perf_event *leader = event->group_leader; 784 struct perf_event *leader = event->group_leader;
757 int cpu = smp_processor_id();
758 int err; 785 int err;
759 786
760 /* 787 /*
@@ -801,7 +828,7 @@ static void __perf_install_in_context(void *info)
801 if (!group_can_go_on(event, cpuctx, 1)) 828 if (!group_can_go_on(event, cpuctx, 1))
802 err = -EEXIST; 829 err = -EEXIST;
803 else 830 else
804 err = event_sched_in(event, cpuctx, ctx, cpu); 831 err = event_sched_in(event, cpuctx, ctx);
805 832
806 if (err) { 833 if (err) {
807 /* 834 /*
@@ -943,11 +970,9 @@ static void __perf_event_enable(void *info)
943 } else { 970 } else {
944 perf_disable(); 971 perf_disable();
945 if (event == leader) 972 if (event == leader)
946 err = group_sched_in(event, cpuctx, ctx, 973 err = group_sched_in(event, cpuctx, ctx);
947 smp_processor_id());
948 else 974 else
949 err = event_sched_in(event, cpuctx, ctx, 975 err = event_sched_in(event, cpuctx, ctx);
950 smp_processor_id());
951 perf_enable(); 976 perf_enable();
952 } 977 }
953 978
@@ -1043,8 +1068,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1043 return 0; 1068 return 0;
1044} 1069}
1045 1070
1046void __perf_event_sched_out(struct perf_event_context *ctx, 1071enum event_type_t {
1047 struct perf_cpu_context *cpuctx) 1072 EVENT_FLEXIBLE = 0x1,
1073 EVENT_PINNED = 0x2,
1074 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1075};
1076
1077static void ctx_sched_out(struct perf_event_context *ctx,
1078 struct perf_cpu_context *cpuctx,
1079 enum event_type_t event_type)
1048{ 1080{
1049 struct perf_event *event; 1081 struct perf_event *event;
1050 1082
@@ -1055,10 +1087,18 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1055 update_context_time(ctx); 1087 update_context_time(ctx);
1056 1088
1057 perf_disable(); 1089 perf_disable();
1058 if (ctx->nr_active) { 1090 if (!ctx->nr_active)
1059 list_for_each_entry(event, &ctx->group_list, group_entry) 1091 goto out_enable;
1092
1093 if (event_type & EVENT_PINNED)
1094 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1060 group_sched_out(event, cpuctx, ctx); 1095 group_sched_out(event, cpuctx, ctx);
1061 } 1096
1097 if (event_type & EVENT_FLEXIBLE)
1098 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1099 group_sched_out(event, cpuctx, ctx);
1100
1101 out_enable:
1062 perf_enable(); 1102 perf_enable();
1063 out: 1103 out:
1064 raw_spin_unlock(&ctx->lock); 1104 raw_spin_unlock(&ctx->lock);
@@ -1170,17 +1210,15 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1170 * not restart the event. 1210 * not restart the event.
1171 */ 1211 */
1172void perf_event_task_sched_out(struct task_struct *task, 1212void perf_event_task_sched_out(struct task_struct *task,
1173 struct task_struct *next, int cpu) 1213 struct task_struct *next)
1174{ 1214{
1175 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1215 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1176 struct perf_event_context *ctx = task->perf_event_ctxp; 1216 struct perf_event_context *ctx = task->perf_event_ctxp;
1177 struct perf_event_context *next_ctx; 1217 struct perf_event_context *next_ctx;
1178 struct perf_event_context *parent; 1218 struct perf_event_context *parent;
1179 struct pt_regs *regs;
1180 int do_switch = 1; 1219 int do_switch = 1;
1181 1220
1182 regs = task_pt_regs(task); 1221 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1183 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1184 1222
1185 if (likely(!ctx || !cpuctx->task_ctx)) 1223 if (likely(!ctx || !cpuctx->task_ctx))
1186 return; 1224 return;
@@ -1220,15 +1258,13 @@ void perf_event_task_sched_out(struct task_struct *task,
1220 rcu_read_unlock(); 1258 rcu_read_unlock();
1221 1259
1222 if (do_switch) { 1260 if (do_switch) {
1223 __perf_event_sched_out(ctx, cpuctx); 1261 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1224 cpuctx->task_ctx = NULL; 1262 cpuctx->task_ctx = NULL;
1225 } 1263 }
1226} 1264}
1227 1265
1228/* 1266static void task_ctx_sched_out(struct perf_event_context *ctx,
1229 * Called with IRQs disabled 1267 enum event_type_t event_type)
1230 */
1231static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1232{ 1268{
1233 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1269 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1234 1270
@@ -1238,47 +1274,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1238 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 1274 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1239 return; 1275 return;
1240 1276
1241 __perf_event_sched_out(ctx, cpuctx); 1277 ctx_sched_out(ctx, cpuctx, event_type);
1242 cpuctx->task_ctx = NULL; 1278 cpuctx->task_ctx = NULL;
1243} 1279}
1244 1280
1245/* 1281/*
1246 * Called with IRQs disabled 1282 * Called with IRQs disabled
1247 */ 1283 */
1248static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) 1284static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1249{ 1285{
1250 __perf_event_sched_out(&cpuctx->ctx, cpuctx); 1286 task_ctx_sched_out(ctx, EVENT_ALL);
1287}
1288
1289/*
1290 * Called with IRQs disabled
1291 */
1292static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1293 enum event_type_t event_type)
1294{
1295 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1251} 1296}
1252 1297
1253static void 1298static void
1254__perf_event_sched_in(struct perf_event_context *ctx, 1299ctx_pinned_sched_in(struct perf_event_context *ctx,
1255 struct perf_cpu_context *cpuctx, int cpu) 1300 struct perf_cpu_context *cpuctx)
1256{ 1301{
1257 struct perf_event *event; 1302 struct perf_event *event;
1258 int can_add_hw = 1;
1259
1260 raw_spin_lock(&ctx->lock);
1261 ctx->is_active = 1;
1262 if (likely(!ctx->nr_events))
1263 goto out;
1264
1265 ctx->timestamp = perf_clock();
1266 1303
1267 perf_disable(); 1304 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1268 1305 if (event->state <= PERF_EVENT_STATE_OFF)
1269 /*
1270 * First go through the list and put on any pinned groups
1271 * in order to give them the best chance of going on.
1272 */
1273 list_for_each_entry(event, &ctx->group_list, group_entry) {
1274 if (event->state <= PERF_EVENT_STATE_OFF ||
1275 !event->attr.pinned)
1276 continue; 1306 continue;
1277 if (event->cpu != -1 && event->cpu != cpu) 1307 if (event->cpu != -1 && event->cpu != smp_processor_id())
1278 continue; 1308 continue;
1279 1309
1280 if (group_can_go_on(event, cpuctx, 1)) 1310 if (group_can_go_on(event, cpuctx, 1))
1281 group_sched_in(event, cpuctx, ctx, cpu); 1311 group_sched_in(event, cpuctx, ctx);
1282 1312
1283 /* 1313 /*
1284 * If this pinned group hasn't been scheduled, 1314 * If this pinned group hasn't been scheduled,
@@ -1289,32 +1319,83 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1289 event->state = PERF_EVENT_STATE_ERROR; 1319 event->state = PERF_EVENT_STATE_ERROR;
1290 } 1320 }
1291 } 1321 }
1322}
1292 1323
1293 list_for_each_entry(event, &ctx->group_list, group_entry) { 1324static void
1294 /* 1325ctx_flexible_sched_in(struct perf_event_context *ctx,
1295 * Ignore events in OFF or ERROR state, and 1326 struct perf_cpu_context *cpuctx)
1296 * ignore pinned events since we did them already. 1327{
1297 */ 1328 struct perf_event *event;
1298 if (event->state <= PERF_EVENT_STATE_OFF || 1329 int can_add_hw = 1;
1299 event->attr.pinned)
1300 continue;
1301 1330
1331 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1332 /* Ignore events in OFF or ERROR state */
1333 if (event->state <= PERF_EVENT_STATE_OFF)
1334 continue;
1302 /* 1335 /*
1303 * Listen to the 'cpu' scheduling filter constraint 1336 * Listen to the 'cpu' scheduling filter constraint
1304 * of events: 1337 * of events:
1305 */ 1338 */
1306 if (event->cpu != -1 && event->cpu != cpu) 1339 if (event->cpu != -1 && event->cpu != smp_processor_id())
1307 continue; 1340 continue;
1308 1341
1309 if (group_can_go_on(event, cpuctx, can_add_hw)) 1342 if (group_can_go_on(event, cpuctx, can_add_hw))
1310 if (group_sched_in(event, cpuctx, ctx, cpu)) 1343 if (group_sched_in(event, cpuctx, ctx))
1311 can_add_hw = 0; 1344 can_add_hw = 0;
1312 } 1345 }
1346}
1347
1348static void
1349ctx_sched_in(struct perf_event_context *ctx,
1350 struct perf_cpu_context *cpuctx,
1351 enum event_type_t event_type)
1352{
1353 raw_spin_lock(&ctx->lock);
1354 ctx->is_active = 1;
1355 if (likely(!ctx->nr_events))
1356 goto out;
1357
1358 ctx->timestamp = perf_clock();
1359
1360 perf_disable();
1361
1362 /*
1363 * First go through the list and put on any pinned groups
1364 * in order to give them the best chance of going on.
1365 */
1366 if (event_type & EVENT_PINNED)
1367 ctx_pinned_sched_in(ctx, cpuctx);
1368
1369 /* Then walk through the lower prio flexible groups */
1370 if (event_type & EVENT_FLEXIBLE)
1371 ctx_flexible_sched_in(ctx, cpuctx);
1372
1313 perf_enable(); 1373 perf_enable();
1314 out: 1374 out:
1315 raw_spin_unlock(&ctx->lock); 1375 raw_spin_unlock(&ctx->lock);
1316} 1376}
1317 1377
1378static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1379 enum event_type_t event_type)
1380{
1381 struct perf_event_context *ctx = &cpuctx->ctx;
1382
1383 ctx_sched_in(ctx, cpuctx, event_type);
1384}
1385
1386static void task_ctx_sched_in(struct task_struct *task,
1387 enum event_type_t event_type)
1388{
1389 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1390 struct perf_event_context *ctx = task->perf_event_ctxp;
1391
1392 if (likely(!ctx))
1393 return;
1394 if (cpuctx->task_ctx == ctx)
1395 return;
1396 ctx_sched_in(ctx, cpuctx, event_type);
1397 cpuctx->task_ctx = ctx;
1398}
1318/* 1399/*
1319 * Called from scheduler to add the events of the current task 1400 * Called from scheduler to add the events of the current task
1320 * with interrupts disabled. 1401 * with interrupts disabled.
@@ -1326,38 +1407,135 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1326 * accessing the event control register. If a NMI hits, then it will 1407 * accessing the event control register. If a NMI hits, then it will
1327 * keep the event running. 1408 * keep the event running.
1328 */ 1409 */
1329void perf_event_task_sched_in(struct task_struct *task, int cpu) 1410void perf_event_task_sched_in(struct task_struct *task)
1330{ 1411{
1331 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1412 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1332 struct perf_event_context *ctx = task->perf_event_ctxp; 1413 struct perf_event_context *ctx = task->perf_event_ctxp;
1333 1414
1334 if (likely(!ctx)) 1415 if (likely(!ctx))
1335 return; 1416 return;
1417
1336 if (cpuctx->task_ctx == ctx) 1418 if (cpuctx->task_ctx == ctx)
1337 return; 1419 return;
1338 __perf_event_sched_in(ctx, cpuctx, cpu); 1420
1421 perf_disable();
1422
1423 /*
1424 * We want to keep the following priority order:
1425 * cpu pinned (that don't need to move), task pinned,
1426 * cpu flexible, task flexible.
1427 */
1428 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1429
1430 ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1431 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1432 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1433
1339 cpuctx->task_ctx = ctx; 1434 cpuctx->task_ctx = ctx;
1435
1436 perf_enable();
1340} 1437}
1341 1438
1342static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) 1439#define MAX_INTERRUPTS (~0ULL)
1440
1441static void perf_log_throttle(struct perf_event *event, int enable);
1442
1443static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1343{ 1444{
1344 struct perf_event_context *ctx = &cpuctx->ctx; 1445 u64 frequency = event->attr.sample_freq;
1446 u64 sec = NSEC_PER_SEC;
1447 u64 divisor, dividend;
1345 1448
1346 __perf_event_sched_in(ctx, cpuctx, cpu); 1449 int count_fls, nsec_fls, frequency_fls, sec_fls;
1450
1451 count_fls = fls64(count);
1452 nsec_fls = fls64(nsec);
1453 frequency_fls = fls64(frequency);
1454 sec_fls = 30;
1455
1456 /*
1457 * We got @count in @nsec, with a target of sample_freq HZ
1458 * the target period becomes:
1459 *
1460 * @count * 10^9
1461 * period = -------------------
1462 * @nsec * sample_freq
1463 *
1464 */
1465
1466 /*
1467 * Reduce accuracy by one bit such that @a and @b converge
1468 * to a similar magnitude.
1469 */
1470#define REDUCE_FLS(a, b) \
1471do { \
1472 if (a##_fls > b##_fls) { \
1473 a >>= 1; \
1474 a##_fls--; \
1475 } else { \
1476 b >>= 1; \
1477 b##_fls--; \
1478 } \
1479} while (0)
1480
1481 /*
1482 * Reduce accuracy until either term fits in a u64, then proceed with
1483 * the other, so that finally we can do a u64/u64 division.
1484 */
1485 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1486 REDUCE_FLS(nsec, frequency);
1487 REDUCE_FLS(sec, count);
1488 }
1489
1490 if (count_fls + sec_fls > 64) {
1491 divisor = nsec * frequency;
1492
1493 while (count_fls + sec_fls > 64) {
1494 REDUCE_FLS(count, sec);
1495 divisor >>= 1;
1496 }
1497
1498 dividend = count * sec;
1499 } else {
1500 dividend = count * sec;
1501
1502 while (nsec_fls + frequency_fls > 64) {
1503 REDUCE_FLS(nsec, frequency);
1504 dividend >>= 1;
1505 }
1506
1507 divisor = nsec * frequency;
1508 }
1509
1510 if (!divisor)
1511 return dividend;
1512
1513 return div64_u64(dividend, divisor);
1347} 1514}
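
The new perf_calculate_period() above derives the next sample period from @count events observed over @nsec nanoseconds, shedding low-order bits with REDUCE_FLS() so the intermediate products never need 128-bit arithmetic. A rough user-space sketch of the same scaling idea follows; the function and helper names are illustrative, not kernel symbols, and the bit-shedding loop is simplified relative to the patch.

#include <stdio.h>
#include <stdint.h>

/* position of the highest set bit, 0 for x == 0 (stand-in for the kernel's fls64) */
static int fls64_(uint64_t x)
{
	int bits = 0;

	while (x) {
		bits++;
		x >>= 1;
	}
	return bits;
}

/* period = (count * NSEC_PER_SEC) / (nsec * freq), kept within 64-bit products */
static uint64_t calc_period(uint64_t count, uint64_t nsec, uint64_t freq)
{
	uint64_t sec = 1000000000ULL;		/* NSEC_PER_SEC */
	uint64_t dividend, divisor;

	/*
	 * Halve whichever factor dominates until both products fit in
	 * 64 bits; the patch does this job more precisely with REDUCE_FLS().
	 */
	while (fls64_(count) + fls64_(sec) > 64 || fls64_(nsec) + fls64_(freq) > 64) {
		if (fls64_(count) + fls64_(sec) > 64) {
			if (count > sec)
				count >>= 1;
			else
				sec >>= 1;
		} else {
			if (nsec > freq)
				nsec >>= 1;
			else
				freq >>= 1;
		}
	}

	dividend = count * sec;
	divisor = nsec * freq;

	return divisor ? dividend / divisor : dividend;
}

int main(void)
{
	/* 10000 events in 4 ms with a 1000 Hz target -> period of 2500 events */
	printf("%llu\n", (unsigned long long)calc_period(10000, 4000000, 1000));
	return 0;
}
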
1348 1515
1349#define MAX_INTERRUPTS (~0ULL) 1516static void perf_event_stop(struct perf_event *event)
1517{
1518 if (!event->pmu->stop)
1519 return event->pmu->disable(event);
1350 1520
1351static void perf_log_throttle(struct perf_event *event, int enable); 1521 return event->pmu->stop(event);
1522}
1523
1524static int perf_event_start(struct perf_event *event)
1525{
1526 if (!event->pmu->start)
1527 return event->pmu->enable(event);
1528
1529 return event->pmu->start(event);
1530}
1352 1531
1353static void perf_adjust_period(struct perf_event *event, u64 events) 1532static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1354{ 1533{
1355 struct hw_perf_event *hwc = &event->hw; 1534 struct hw_perf_event *hwc = &event->hw;
1356 u64 period, sample_period; 1535 s64 period, sample_period;
1357 s64 delta; 1536 s64 delta;
1358 1537
1359 events *= hwc->sample_period; 1538 period = perf_calculate_period(event, nsec, count);
1360 period = div64_u64(events, event->attr.sample_freq);
1361 1539
1362 delta = (s64)(period - hwc->sample_period); 1540 delta = (s64)(period - hwc->sample_period);
1363 delta = (delta + 7) / 8; /* low pass filter */ 1541 delta = (delta + 7) / 8; /* low pass filter */
@@ -1368,13 +1546,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
1368 sample_period = 1; 1546 sample_period = 1;
1369 1547
1370 hwc->sample_period = sample_period; 1548 hwc->sample_period = sample_period;
1549
1550 if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1551 perf_disable();
1552 perf_event_stop(event);
1553 atomic64_set(&hwc->period_left, 0);
1554 perf_event_start(event);
1555 perf_enable();
1556 }
1371} 1557}
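
The (delta + 7) / 8 step above is a 1/8-gain low-pass filter, so the period drifts toward the freshly computed target instead of jumping to it, and the period_left > 8*sample_period branch restarts the counter when the old period is badly off. A tiny stand-alone illustration of the filter's convergence, with made-up values:

#include <stdio.h>

int main(void)
{
	long long sample_period = 1000000;	/* current period */
	long long target = 2000000;		/* period suggested by the calculation */
	int tick;

	for (tick = 1; tick <= 5; tick++) {
		long long delta = (target - sample_period + 7) / 8;

		sample_period += delta;		/* move 1/8th of the way per tick */
		printf("tick %d: period %lld\n", tick, sample_period);
	}
	return 0;
}
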
1372 1558
1373static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1559static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1374{ 1560{
1375 struct perf_event *event; 1561 struct perf_event *event;
1376 struct hw_perf_event *hwc; 1562 struct hw_perf_event *hwc;
1377 u64 interrupts, freq; 1563 u64 interrupts, now;
1564 s64 delta;
1378 1565
1379 raw_spin_lock(&ctx->lock); 1566 raw_spin_lock(&ctx->lock);
1380 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 1567 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
@@ -1394,45 +1581,23 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1394 */ 1581 */
1395 if (interrupts == MAX_INTERRUPTS) { 1582 if (interrupts == MAX_INTERRUPTS) {
1396 perf_log_throttle(event, 1); 1583 perf_log_throttle(event, 1);
1584 perf_disable();
1397 event->pmu->unthrottle(event); 1585 event->pmu->unthrottle(event);
1398 interrupts = 2*sysctl_perf_event_sample_rate/HZ; 1586 perf_enable();
1399 } 1587 }
1400 1588
1401 if (!event->attr.freq || !event->attr.sample_freq) 1589 if (!event->attr.freq || !event->attr.sample_freq)
1402 continue; 1590 continue;
1403 1591
1404 /* 1592 perf_disable();
1405 * if the specified freq < HZ then we need to skip ticks 1593 event->pmu->read(event);
1406 */ 1594 now = atomic64_read(&event->count);
1407 if (event->attr.sample_freq < HZ) { 1595 delta = now - hwc->freq_count_stamp;
1408 freq = event->attr.sample_freq; 1596 hwc->freq_count_stamp = now;
1409
1410 hwc->freq_count += freq;
1411 hwc->freq_interrupts += interrupts;
1412
1413 if (hwc->freq_count < HZ)
1414 continue;
1415
1416 interrupts = hwc->freq_interrupts;
1417 hwc->freq_interrupts = 0;
1418 hwc->freq_count -= HZ;
1419 } else
1420 freq = HZ;
1421
1422 perf_adjust_period(event, freq * interrupts);
1423 1597
1424 /* 1598 if (delta > 0)
1425 * In order to avoid being stalled by an (accidental) huge 1599 perf_adjust_period(event, TICK_NSEC, delta);
1426 * sample period, force reset the sample period if we didn't 1600 perf_enable();
1427 * get any events in this freq period.
1428 */
1429 if (!interrupts) {
1430 perf_disable();
1431 event->pmu->disable(event);
1432 atomic64_set(&hwc->period_left, 0);
1433 event->pmu->enable(event);
1434 perf_enable();
1435 }
1436 } 1601 }
1437 raw_spin_unlock(&ctx->lock); 1602 raw_spin_unlock(&ctx->lock);
1438} 1603}
@@ -1442,51 +1607,67 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1442 */ 1607 */
1443static void rotate_ctx(struct perf_event_context *ctx) 1608static void rotate_ctx(struct perf_event_context *ctx)
1444{ 1609{
1445 struct perf_event *event;
1446
1447 if (!ctx->nr_events)
1448 return;
1449
1450 raw_spin_lock(&ctx->lock); 1610 raw_spin_lock(&ctx->lock);
1451 /* 1611
1452 * Rotate the first entry last (works just fine for group events too): 1612 /* Rotate the first entry last of non-pinned groups */
1453 */ 1613 list_rotate_left(&ctx->flexible_groups);
1454 perf_disable();
1455 list_for_each_entry(event, &ctx->group_list, group_entry) {
1456 list_move_tail(&event->group_entry, &ctx->group_list);
1457 break;
1458 }
1459 perf_enable();
1460 1614
1461 raw_spin_unlock(&ctx->lock); 1615 raw_spin_unlock(&ctx->lock);
1462} 1616}
1463 1617
1464void perf_event_task_tick(struct task_struct *curr, int cpu) 1618void perf_event_task_tick(struct task_struct *curr)
1465{ 1619{
1466 struct perf_cpu_context *cpuctx; 1620 struct perf_cpu_context *cpuctx;
1467 struct perf_event_context *ctx; 1621 struct perf_event_context *ctx;
1622 int rotate = 0;
1468 1623
1469 if (!atomic_read(&nr_events)) 1624 if (!atomic_read(&nr_events))
1470 return; 1625 return;
1471 1626
1472 cpuctx = &per_cpu(perf_cpu_context, cpu); 1627 cpuctx = &__get_cpu_var(perf_cpu_context);
1628 if (cpuctx->ctx.nr_events &&
1629 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1630 rotate = 1;
1631
1473 ctx = curr->perf_event_ctxp; 1632 ctx = curr->perf_event_ctxp;
1633 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
1634 rotate = 1;
1474 1635
1475 perf_ctx_adjust_freq(&cpuctx->ctx); 1636 perf_ctx_adjust_freq(&cpuctx->ctx);
1476 if (ctx) 1637 if (ctx)
1477 perf_ctx_adjust_freq(ctx); 1638 perf_ctx_adjust_freq(ctx);
1478 1639
1479 perf_event_cpu_sched_out(cpuctx); 1640 if (!rotate)
1641 return;
1642
1643 perf_disable();
1644 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1480 if (ctx) 1645 if (ctx)
1481 __perf_event_task_sched_out(ctx); 1646 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1482 1647
1483 rotate_ctx(&cpuctx->ctx); 1648 rotate_ctx(&cpuctx->ctx);
1484 if (ctx) 1649 if (ctx)
1485 rotate_ctx(ctx); 1650 rotate_ctx(ctx);
1486 1651
1487 perf_event_cpu_sched_in(cpuctx, cpu); 1652 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1488 if (ctx) 1653 if (ctx)
1489 perf_event_task_sched_in(curr, cpu); 1654 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1655 perf_enable();
1656}
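
rotate_ctx() now just calls list_rotate_left() on the flexible group list, so a group that missed the hardware this tick moves toward the front for the next one, while pinned groups are never rotated. A minimal stand-in showing the effect of the rotation, using an array instead of a kernel list and invented group names:

#include <stdio.h>

/* move the first entry to the tail, like list_rotate_left() on flexible_groups */
static void rotate_left(const char **groups, int n)
{
	const char *first = groups[0];
	int i;

	for (i = 1; i < n; i++)
		groups[i - 1] = groups[i];
	groups[n - 1] = first;
}

int main(void)
{
	const char *flexible[] = { "groupA", "groupB", "groupC" };
	int tick;

	for (tick = 1; tick <= 3; tick++) {
		rotate_left(flexible, 3);
		printf("tick %d: %s %s %s\n", tick,
		       flexible[0], flexible[1], flexible[2]);
	}
	return 0;
}
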
1657
1658static int event_enable_on_exec(struct perf_event *event,
1659 struct perf_event_context *ctx)
1660{
1661 if (!event->attr.enable_on_exec)
1662 return 0;
1663
1664 event->attr.enable_on_exec = 0;
1665 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1666 return 0;
1667
1668 __perf_event_mark_enabled(event, ctx);
1669
1670 return 1;
1490} 1671}
1491 1672
1492/* 1673/*
@@ -1499,6 +1680,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1499 struct perf_event *event; 1680 struct perf_event *event;
1500 unsigned long flags; 1681 unsigned long flags;
1501 int enabled = 0; 1682 int enabled = 0;
1683 int ret;
1502 1684
1503 local_irq_save(flags); 1685 local_irq_save(flags);
1504 ctx = task->perf_event_ctxp; 1686 ctx = task->perf_event_ctxp;
@@ -1509,14 +1691,16 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1509 1691
1510 raw_spin_lock(&ctx->lock); 1692 raw_spin_lock(&ctx->lock);
1511 1693
1512 list_for_each_entry(event, &ctx->group_list, group_entry) { 1694 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1513 if (!event->attr.enable_on_exec) 1695 ret = event_enable_on_exec(event, ctx);
1514 continue; 1696 if (ret)
1515 event->attr.enable_on_exec = 0; 1697 enabled = 1;
1516 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1698 }
1517 continue; 1699
1518 __perf_event_mark_enabled(event, ctx); 1700 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1519 enabled = 1; 1701 ret = event_enable_on_exec(event, ctx);
1702 if (ret)
1703 enabled = 1;
1520 } 1704 }
1521 1705
1522 /* 1706 /*
@@ -1527,7 +1711,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1527 1711
1528 raw_spin_unlock(&ctx->lock); 1712 raw_spin_unlock(&ctx->lock);
1529 1713
1530 perf_event_task_sched_in(task, smp_processor_id()); 1714 perf_event_task_sched_in(task);
1531 out: 1715 out:
1532 local_irq_restore(flags); 1716 local_irq_restore(flags);
1533} 1717}
@@ -1590,7 +1774,8 @@ __perf_event_init_context(struct perf_event_context *ctx,
1590{ 1774{
1591 raw_spin_lock_init(&ctx->lock); 1775 raw_spin_lock_init(&ctx->lock);
1592 mutex_init(&ctx->mutex); 1776 mutex_init(&ctx->mutex);
1593 INIT_LIST_HEAD(&ctx->group_list); 1777 INIT_LIST_HEAD(&ctx->pinned_groups);
1778 INIT_LIST_HEAD(&ctx->flexible_groups);
1594 INIT_LIST_HEAD(&ctx->event_list); 1779 INIT_LIST_HEAD(&ctx->event_list);
1595 atomic_set(&ctx->refcount, 1); 1780 atomic_set(&ctx->refcount, 1);
1596 ctx->task = task; 1781 ctx->task = task;
@@ -1698,6 +1883,7 @@ static void free_event_rcu(struct rcu_head *head)
1698} 1883}
1699 1884
1700static void perf_pending_sync(struct perf_event *event); 1885static void perf_pending_sync(struct perf_event *event);
1886static void perf_mmap_data_put(struct perf_mmap_data *data);
1701 1887
1702static void free_event(struct perf_event *event) 1888static void free_event(struct perf_event *event)
1703{ 1889{
@@ -1713,9 +1899,9 @@ static void free_event(struct perf_event *event)
1713 atomic_dec(&nr_task_events); 1899 atomic_dec(&nr_task_events);
1714 } 1900 }
1715 1901
1716 if (event->output) { 1902 if (event->data) {
1717 fput(event->output->filp); 1903 perf_mmap_data_put(event->data);
1718 event->output = NULL; 1904 event->data = NULL;
1719 } 1905 }
1720 1906
1721 if (event->destroy) 1907 if (event->destroy)
@@ -1729,9 +1915,30 @@ int perf_event_release_kernel(struct perf_event *event)
1729{ 1915{
1730 struct perf_event_context *ctx = event->ctx; 1916 struct perf_event_context *ctx = event->ctx;
1731 1917
1918 /*
1919 * Remove from the PMU, can't get re-enabled since we got
1920 * here because the last ref went.
1921 */
1922 perf_event_disable(event);
1923
1732 WARN_ON_ONCE(ctx->parent_ctx); 1924 WARN_ON_ONCE(ctx->parent_ctx);
1733 mutex_lock(&ctx->mutex); 1925 /*
1734 perf_event_remove_from_context(event); 1926 * There are two ways this annotation is useful:
1927 *
1928 * 1) there is a lock recursion from perf_event_exit_task
1929 * see the comment there.
1930 *
1931 * 2) there is a lock-inversion with mmap_sem through
1932 * perf_event_read_group(), which takes faults while
1933 * holding ctx->mutex, however this is called after
1934 * the last filedesc died, so there is no possibility
1935 * to trigger the AB-BA case.
1936 */
1937 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
1938 raw_spin_lock_irq(&ctx->lock);
1939 perf_group_detach(event);
1940 list_del_event(event, ctx);
1941 raw_spin_unlock_irq(&ctx->lock);
1735 mutex_unlock(&ctx->mutex); 1942 mutex_unlock(&ctx->mutex);
1736 1943
1737 mutex_lock(&event->owner->perf_event_mutex); 1944 mutex_lock(&event->owner->perf_event_mutex);
@@ -2011,7 +2218,27 @@ unlock:
2011 return ret; 2218 return ret;
2012} 2219}
2013 2220
2014static int perf_event_set_output(struct perf_event *event, int output_fd); 2221static const struct file_operations perf_fops;
2222
2223static struct perf_event *perf_fget_light(int fd, int *fput_needed)
2224{
2225 struct file *file;
2226
2227 file = fget_light(fd, fput_needed);
2228 if (!file)
2229 return ERR_PTR(-EBADF);
2230
2231 if (file->f_op != &perf_fops) {
2232 fput_light(file, *fput_needed);
2233 *fput_needed = 0;
2234 return ERR_PTR(-EBADF);
2235 }
2236
2237 return file->private_data;
2238}
2239
2240static int perf_event_set_output(struct perf_event *event,
2241 struct perf_event *output_event);
2015static int perf_event_set_filter(struct perf_event *event, void __user *arg); 2242static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2016 2243
2017static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 2244static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -2038,7 +2265,23 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2038 return perf_event_period(event, (u64 __user *)arg); 2265 return perf_event_period(event, (u64 __user *)arg);
2039 2266
2040 case PERF_EVENT_IOC_SET_OUTPUT: 2267 case PERF_EVENT_IOC_SET_OUTPUT:
2041 return perf_event_set_output(event, arg); 2268 {
2269 struct perf_event *output_event = NULL;
2270 int fput_needed = 0;
2271 int ret;
2272
2273 if (arg != -1) {
2274 output_event = perf_fget_light(arg, &fput_needed);
2275 if (IS_ERR(output_event))
2276 return PTR_ERR(output_event);
2277 }
2278
2279 ret = perf_event_set_output(event, output_event);
2280 if (output_event)
2281 fput_light(output_event->filp, fput_needed);
2282
2283 return ret;
2284 }
2042 2285
2043 case PERF_EVENT_IOC_SET_FILTER: 2286 case PERF_EVENT_IOC_SET_FILTER:
2044 return perf_event_set_filter(event, (void __user *)arg); 2287 return perf_event_set_filter(event, (void __user *)arg);
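
With this change PERF_EVENT_IOC_SET_OUTPUT takes another perf event fd (or -1 to undo the redirect), and perf_fget_light() verifies the fd really is a perf event before the buffers are shared. A rough user-space usage sketch, assuming 4 KiB pages and eliding all error handling; the sketch mmap()s the redirect target first so it owns a buffer:

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int perf_open(struct perf_event_attr *attr)
{
	/* pid = 0 (this task), cpu = -1, group_fd = -1, flags = 0 */
	return syscall(__NR_perf_event_open, attr, 0, -1, -1, 0);
}

int main(void)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_SOFTWARE,
		.size		= sizeof(attr),
		.config		= PERF_COUNT_SW_CONTEXT_SWITCHES,
		.sample_period	= 1,
		.sample_type	= PERF_SAMPLE_TID | PERF_SAMPLE_TIME,
	};
	int leader = perf_open(&attr);
	int other = perf_open(&attr);

	/* give the redirect target a buffer: 1 control page + 4 data pages */
	mmap(NULL, 5 * 4096, PROT_READ | PROT_WRITE, MAP_SHARED, leader, 0);

	/* route 'other's samples into 'leader's buffer; -1 would undo the redirect */
	ioctl(other, PERF_EVENT_IOC_SET_OUTPUT, leader);

	return 0;
}
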
@@ -2133,11 +2376,6 @@ unlock:
2133 rcu_read_unlock(); 2376 rcu_read_unlock();
2134} 2377}
2135 2378
2136static unsigned long perf_data_size(struct perf_mmap_data *data)
2137{
2138 return data->nr_pages << (PAGE_SHIFT + data->data_order);
2139}
2140
2141#ifndef CONFIG_PERF_USE_VMALLOC 2379#ifndef CONFIG_PERF_USE_VMALLOC
2142 2380
2143/* 2381/*
@@ -2156,6 +2394,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2156 return virt_to_page(data->data_pages[pgoff - 1]); 2394 return virt_to_page(data->data_pages[pgoff - 1]);
2157} 2395}
2158 2396
2397static void *perf_mmap_alloc_page(int cpu)
2398{
2399 struct page *page;
2400 int node;
2401
2402 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
2403 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
2404 if (!page)
2405 return NULL;
2406
2407 return page_address(page);
2408}
2409
2159static struct perf_mmap_data * 2410static struct perf_mmap_data *
2160perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2411perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2161{ 2412{
@@ -2163,8 +2414,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2163 unsigned long size; 2414 unsigned long size;
2164 int i; 2415 int i;
2165 2416
2166 WARN_ON(atomic_read(&event->mmap_count));
2167
2168 size = sizeof(struct perf_mmap_data); 2417 size = sizeof(struct perf_mmap_data);
2169 size += nr_pages * sizeof(void *); 2418 size += nr_pages * sizeof(void *);
2170 2419
@@ -2172,17 +2421,16 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2172 if (!data) 2421 if (!data)
2173 goto fail; 2422 goto fail;
2174 2423
2175 data->user_page = (void *)get_zeroed_page(GFP_KERNEL); 2424 data->user_page = perf_mmap_alloc_page(event->cpu);
2176 if (!data->user_page) 2425 if (!data->user_page)
2177 goto fail_user_page; 2426 goto fail_user_page;
2178 2427
2179 for (i = 0; i < nr_pages; i++) { 2428 for (i = 0; i < nr_pages; i++) {
2180 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); 2429 data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
2181 if (!data->data_pages[i]) 2430 if (!data->data_pages[i])
2182 goto fail_data_pages; 2431 goto fail_data_pages;
2183 } 2432 }
2184 2433
2185 data->data_order = 0;
2186 data->nr_pages = nr_pages; 2434 data->nr_pages = nr_pages;
2187 2435
2188 return data; 2436 return data;
@@ -2218,6 +2466,11 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2218 kfree(data); 2466 kfree(data);
2219} 2467}
2220 2468
2469static inline int page_order(struct perf_mmap_data *data)
2470{
2471 return 0;
2472}
2473
2221#else 2474#else
2222 2475
2223/* 2476/*
@@ -2226,10 +2479,15 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2226 * Required for architectures that have d-cache aliasing issues. 2479 * Required for architectures that have d-cache aliasing issues.
2227 */ 2480 */
2228 2481
2482static inline int page_order(struct perf_mmap_data *data)
2483{
2484 return data->page_order;
2485}
2486
2229static struct page * 2487static struct page *
2230perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2488perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2231{ 2489{
2232 if (pgoff > (1UL << data->data_order)) 2490 if (pgoff > (1UL << page_order(data)))
2233 return NULL; 2491 return NULL;
2234 2492
2235 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); 2493 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
@@ -2249,7 +2507,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
2249 int i, nr; 2507 int i, nr;
2250 2508
2251 data = container_of(work, struct perf_mmap_data, work); 2509 data = container_of(work, struct perf_mmap_data, work);
2252 nr = 1 << data->data_order; 2510 nr = 1 << page_order(data);
2253 2511
2254 base = data->user_page; 2512 base = data->user_page;
2255 for (i = 0; i < nr + 1; i++) 2513 for (i = 0; i < nr + 1; i++)
@@ -2271,8 +2529,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2271 unsigned long size; 2529 unsigned long size;
2272 void *all_buf; 2530 void *all_buf;
2273 2531
2274 WARN_ON(atomic_read(&event->mmap_count));
2275
2276 size = sizeof(struct perf_mmap_data); 2532 size = sizeof(struct perf_mmap_data);
2277 size += sizeof(void *); 2533 size += sizeof(void *);
2278 2534
@@ -2288,7 +2544,7 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2288 2544
2289 data->user_page = all_buf; 2545 data->user_page = all_buf;
2290 data->data_pages[0] = all_buf + PAGE_SIZE; 2546 data->data_pages[0] = all_buf + PAGE_SIZE;
2291 data->data_order = ilog2(nr_pages); 2547 data->page_order = ilog2(nr_pages);
2292 data->nr_pages = 1; 2548 data->nr_pages = 1;
2293 2549
2294 return data; 2550 return data;
@@ -2302,6 +2558,11 @@ fail:
2302 2558
2303#endif 2559#endif
2304 2560
2561static unsigned long perf_data_size(struct perf_mmap_data *data)
2562{
2563 return data->nr_pages << (PAGE_SHIFT + page_order(data));
2564}
2565
2305static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2566static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2306{ 2567{
2307 struct perf_event *event = vma->vm_file->private_data; 2568 struct perf_event *event = vma->vm_file->private_data;
@@ -2342,8 +2603,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2342{ 2603{
2343 long max_size = perf_data_size(data); 2604 long max_size = perf_data_size(data);
2344 2605
2345 atomic_set(&data->lock, -1);
2346
2347 if (event->attr.watermark) { 2606 if (event->attr.watermark) {
2348 data->watermark = min_t(long, max_size, 2607 data->watermark = min_t(long, max_size,
2349 event->attr.wakeup_watermark); 2608 event->attr.wakeup_watermark);
@@ -2352,7 +2611,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2352 if (!data->watermark) 2611 if (!data->watermark)
2353 data->watermark = max_size / 2; 2612 data->watermark = max_size / 2;
2354 2613
2355 2614 atomic_set(&data->refcount, 1);
2356 rcu_assign_pointer(event->data, data); 2615 rcu_assign_pointer(event->data, data);
2357} 2616}
2358 2617
@@ -2364,13 +2623,26 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2364 perf_mmap_data_free(data); 2623 perf_mmap_data_free(data);
2365} 2624}
2366 2625
2367static void perf_mmap_data_release(struct perf_event *event) 2626static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event)
2368{ 2627{
2369 struct perf_mmap_data *data = event->data; 2628 struct perf_mmap_data *data;
2629
2630 rcu_read_lock();
2631 data = rcu_dereference(event->data);
2632 if (data) {
2633 if (!atomic_inc_not_zero(&data->refcount))
2634 data = NULL;
2635 }
2636 rcu_read_unlock();
2637
2638 return data;
2639}
2370 2640
2371 WARN_ON(atomic_read(&event->mmap_count)); 2641static void perf_mmap_data_put(struct perf_mmap_data *data)
2642{
2643 if (!atomic_dec_and_test(&data->refcount))
2644 return;
2372 2645
2373 rcu_assign_pointer(event->data, NULL);
2374 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); 2646 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2375} 2647}
2376 2648
@@ -2385,15 +2657,18 @@ static void perf_mmap_close(struct vm_area_struct *vma)
2385{ 2657{
2386 struct perf_event *event = vma->vm_file->private_data; 2658 struct perf_event *event = vma->vm_file->private_data;
2387 2659
2388 WARN_ON_ONCE(event->ctx->parent_ctx);
2389 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 2660 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2390 unsigned long size = perf_data_size(event->data); 2661 unsigned long size = perf_data_size(event->data);
2391 struct user_struct *user = current_user(); 2662 struct user_struct *user = event->mmap_user;
2663 struct perf_mmap_data *data = event->data;
2392 2664
2393 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 2665 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2394 vma->vm_mm->locked_vm -= event->data->nr_locked; 2666 vma->vm_mm->locked_vm -= event->mmap_locked;
2395 perf_mmap_data_release(event); 2667 rcu_assign_pointer(event->data, NULL);
2396 mutex_unlock(&event->mmap_mutex); 2668 mutex_unlock(&event->mmap_mutex);
2669
2670 perf_mmap_data_put(data);
2671 free_uid(user);
2397 } 2672 }
2398} 2673}
2399 2674
@@ -2416,6 +2691,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2416 long user_extra, extra; 2691 long user_extra, extra;
2417 int ret = 0; 2692 int ret = 0;
2418 2693
2694 /*
2695 * Don't allow mmap() of inherited per-task counters. This would
2696 * create a performance issue due to all children writing to the
2697 * same buffer.
2698 */
2699 if (event->cpu == -1 && event->attr.inherit)
2700 return -EINVAL;
2701
2419 if (!(vma->vm_flags & VM_SHARED)) 2702 if (!(vma->vm_flags & VM_SHARED))
2420 return -EINVAL; 2703 return -EINVAL;
2421 2704
@@ -2437,13 +2720,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2437 2720
2438 WARN_ON_ONCE(event->ctx->parent_ctx); 2721 WARN_ON_ONCE(event->ctx->parent_ctx);
2439 mutex_lock(&event->mmap_mutex); 2722 mutex_lock(&event->mmap_mutex);
2440 if (event->output) { 2723 if (event->data) {
2441 ret = -EINVAL; 2724 if (event->data->nr_pages == nr_pages)
2442 goto unlock; 2725 atomic_inc(&event->data->refcount);
2443 } 2726 else
2444
2445 if (atomic_inc_not_zero(&event->mmap_count)) {
2446 if (nr_pages != event->data->nr_pages)
2447 ret = -EINVAL; 2727 ret = -EINVAL;
2448 goto unlock; 2728 goto unlock;
2449 } 2729 }
@@ -2462,7 +2742,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2462 if (user_locked > user_lock_limit) 2742 if (user_locked > user_lock_limit)
2463 extra = user_locked - user_lock_limit; 2743 extra = user_locked - user_lock_limit;
2464 2744
2465 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 2745 lock_limit = rlimit(RLIMIT_MEMLOCK);
2466 lock_limit >>= PAGE_SHIFT; 2746 lock_limit >>= PAGE_SHIFT;
2467 locked = vma->vm_mm->locked_vm + extra; 2747 locked = vma->vm_mm->locked_vm + extra;
2468 2748
@@ -2475,21 +2755,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2475 WARN_ON(event->data); 2755 WARN_ON(event->data);
2476 2756
2477 data = perf_mmap_data_alloc(event, nr_pages); 2757 data = perf_mmap_data_alloc(event, nr_pages);
2478 ret = -ENOMEM; 2758 if (!data) {
2479 if (!data) 2759 ret = -ENOMEM;
2480 goto unlock; 2760 goto unlock;
2761 }
2481 2762
2482 ret = 0;
2483 perf_mmap_data_init(event, data); 2763 perf_mmap_data_init(event, data);
2484
2485 atomic_set(&event->mmap_count, 1);
2486 atomic_long_add(user_extra, &user->locked_vm);
2487 vma->vm_mm->locked_vm += extra;
2488 event->data->nr_locked = extra;
2489 if (vma->vm_flags & VM_WRITE) 2764 if (vma->vm_flags & VM_WRITE)
2490 event->data->writable = 1; 2765 event->data->writable = 1;
2491 2766
2767 atomic_long_add(user_extra, &user->locked_vm);
2768 event->mmap_locked = extra;
2769 event->mmap_user = get_current_user();
2770 vma->vm_mm->locked_vm += event->mmap_locked;
2771
2492unlock: 2772unlock:
2773 if (!ret)
2774 atomic_inc(&event->mmap_count);
2493 mutex_unlock(&event->mmap_mutex); 2775 mutex_unlock(&event->mmap_mutex);
2494 2776
2495 vma->vm_flags |= VM_RESERVED; 2777 vma->vm_flags |= VM_RESERVED;
@@ -2515,6 +2797,7 @@ static int perf_fasync(int fd, struct file *filp, int on)
2515} 2797}
2516 2798
2517static const struct file_operations perf_fops = { 2799static const struct file_operations perf_fops = {
2800 .llseek = no_llseek,
2518 .release = perf_release, 2801 .release = perf_release,
2519 .read = perf_read, 2802 .read = perf_read,
2520 .poll = perf_poll, 2803 .poll = perf_poll,
@@ -2658,6 +2941,33 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2658 return NULL; 2941 return NULL;
2659} 2942}
2660 2943
2944__weak
2945void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2946{
2947}
2948
2949
2950/*
2951 * We assume there is only KVM supporting the callbacks.
2952 * Later on, we might change it to a list if there is
2953 * another virtualization implementation supporting the callbacks.
2954 */
2955struct perf_guest_info_callbacks *perf_guest_cbs;
2956
2957int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2958{
2959 perf_guest_cbs = cbs;
2960 return 0;
2961}
2962EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
2963
2964int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2965{
2966 perf_guest_cbs = NULL;
2967 return 0;
2968}
2969EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
2970
2661/* 2971/*
2662 * Output 2972 * Output
2663 */ 2973 */
@@ -2693,127 +3003,87 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2693} 3003}
2694 3004
2695/* 3005/*
2696 * Curious locking construct.
2697 *
2698 * We need to ensure a later event_id doesn't publish a head when a former 3006 * We need to ensure a later event_id doesn't publish a head when a former
2699 * event_id isn't done writing. However since we need to deal with NMIs we 3007 * event isn't done writing. However since we need to deal with NMIs we
2700 * cannot fully serialize things. 3008 * cannot fully serialize things.
2701 * 3009 *
2702 * What we do is serialize between CPUs so we only have to deal with NMI
2703 * nesting on a single CPU.
2704 *
2705 * We only publish the head (and generate a wakeup) when the outer-most 3010 * We only publish the head (and generate a wakeup) when the outer-most
2706 * event_id completes. 3011 * event completes.
2707 */ 3012 */
2708static void perf_output_lock(struct perf_output_handle *handle) 3013static void perf_output_get_handle(struct perf_output_handle *handle)
2709{ 3014{
2710 struct perf_mmap_data *data = handle->data; 3015 struct perf_mmap_data *data = handle->data;
2711 int cur, cpu = get_cpu();
2712
2713 handle->locked = 0;
2714 3016
2715 for (;;) { 3017 preempt_disable();
2716 cur = atomic_cmpxchg(&data->lock, -1, cpu); 3018 local_inc(&data->nest);
2717 if (cur == -1) { 3019 handle->wakeup = local_read(&data->wakeup);
2718 handle->locked = 1;
2719 break;
2720 }
2721 if (cur == cpu)
2722 break;
2723
2724 cpu_relax();
2725 }
2726} 3020}
2727 3021
2728static void perf_output_unlock(struct perf_output_handle *handle) 3022static void perf_output_put_handle(struct perf_output_handle *handle)
2729{ 3023{
2730 struct perf_mmap_data *data = handle->data; 3024 struct perf_mmap_data *data = handle->data;
2731 unsigned long head; 3025 unsigned long head;
2732 int cpu;
2733
2734 data->done_head = data->head;
2735
2736 if (!handle->locked)
2737 goto out;
2738 3026
2739again: 3027again:
2740 /* 3028 head = local_read(&data->head);
2741 * The xchg implies a full barrier that ensures all writes are done
2742 * before we publish the new head, matched by a rmb() in userspace when
2743 * reading this position.
2744 */
2745 while ((head = atomic_long_xchg(&data->done_head, 0)))
2746 data->user_page->data_head = head;
2747 3029
2748 /* 3030 /*
2749 * NMI can happen here, which means we can miss a done_head update. 3031 * IRQ/NMI can happen here, which means we can miss a head update.
2750 */ 3032 */
2751 3033
2752 cpu = atomic_xchg(&data->lock, -1); 3034 if (!local_dec_and_test(&data->nest))
2753 WARN_ON_ONCE(cpu != smp_processor_id()); 3035 goto out;
2754 3036
2755 /* 3037 /*
2756 * Therefore we have to validate we did not indeed do so. 3038 * Publish the known good head. Rely on the full barrier implied
3039 * by atomic_dec_and_test() order the data->head read and this
3040 * write.
2757 */ 3041 */
2758 if (unlikely(atomic_long_read(&data->done_head))) { 3042 data->user_page->data_head = head;
2759 /*
2760 * Since we had it locked, we can lock it again.
2761 */
2762 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2763 cpu_relax();
2764 3043
3044 /*
3045 * Now check if we missed an update, rely on the (compiler)
3046 * barrier in atomic_dec_and_test() to re-read data->head.
3047 */
3048 if (unlikely(head != local_read(&data->head))) {
3049 local_inc(&data->nest);
2765 goto again; 3050 goto again;
2766 } 3051 }
2767 3052
2768 if (atomic_xchg(&data->wakeup, 0)) 3053 if (handle->wakeup != local_read(&data->wakeup))
2769 perf_output_wakeup(handle); 3054 perf_output_wakeup(handle);
2770out: 3055
2771 put_cpu(); 3056 out:
3057 preempt_enable();
2772} 3058}
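
perf_output_get_handle()/perf_output_put_handle() above replace the old spinning cmpxchg lock with a per-buffer nesting counter: an interrupting writer only bumps the counter, the outermost writer publishes data_head, and head is re-checked in case a nested writer slipped in between the read and the publish. A single-threaded sketch of that shape, where plain ints stand in for local_t and a recursive call plays the part of the NMI:

#include <stdio.h>

static unsigned long head, data_head;	/* write cursor vs. head published to user space */
static int nest;

static void output_get(void)
{
	nest++;				/* local_inc(&data->nest) */
}

static void output_put(void)
{
	unsigned long seen;

again:
	seen = head;

	if (--nest)			/* nested writer: let the outermost one publish */
		return;

	data_head = seen;		/* publish the known-good head */

	/* a nested writer may have advanced head between the read and the publish */
	if (seen != head) {
		nest++;
		goto again;
	}
}

/* simulate one record write; optionally take a fake "NMI" mid-write */
static void write_record(unsigned long bytes, int take_nmi)
{
	output_get();
	head += bytes;
	if (take_nmi)
		write_record(16, 0);
	output_put();
}

int main(void)
{
	write_record(64, 1);
	printf("head=%lu published=%lu\n", head, data_head);	/* both 80 */
	return 0;
}
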
2773 3059
2774void perf_output_copy(struct perf_output_handle *handle, 3060__always_inline void perf_output_copy(struct perf_output_handle *handle,
2775 const void *buf, unsigned int len) 3061 const void *buf, unsigned int len)
2776{ 3062{
2777 unsigned int pages_mask;
2778 unsigned long offset;
2779 unsigned int size;
2780 void **pages;
2781
2782 offset = handle->offset;
2783 pages_mask = handle->data->nr_pages - 1;
2784 pages = handle->data->data_pages;
2785
2786 do { 3063 do {
2787 unsigned long page_offset; 3064 unsigned long size = min_t(unsigned long, handle->size, len);
2788 unsigned long page_size;
2789 int nr;
2790 3065
2791 nr = (offset >> PAGE_SHIFT) & pages_mask; 3066 memcpy(handle->addr, buf, size);
2792 page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
2793 page_offset = offset & (page_size - 1);
2794 size = min_t(unsigned int, page_size - page_offset, len);
2795 3067
2796 memcpy(pages[nr] + page_offset, buf, size); 3068 len -= size;
3069 handle->addr += size;
3070 buf += size;
3071 handle->size -= size;
3072 if (!handle->size) {
3073 struct perf_mmap_data *data = handle->data;
2797 3074
2798 len -= size; 3075 handle->page++;
2799 buf += size; 3076 handle->page &= data->nr_pages - 1;
2800 offset += size; 3077 handle->addr = data->data_pages[handle->page];
3078 handle->size = PAGE_SIZE << page_order(data);
3079 }
2801 } while (len); 3080 } while (len);
2802
2803 handle->offset = offset;
2804
2805 /*
2806 * Check we didn't copy past our reservation window, taking the
2807 * possible unsigned int wrap into account.
2808 */
2809 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2810} 3081}
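
The rewritten perf_output_copy() keeps the current page index, address and remaining room in the handle and masks the page index with nr_pages - 1, which works because the page count is a power of two. A stand-alone ring copy with tiny fake pages to make the wrap-around bookkeeping visible; the sizes are purely illustrative:

#include <stdio.h>
#include <string.h>

#define NR_PAGES	4	/* must be a power of two, as in perf */
#define PAGE_SZ		8	/* tiny "pages" so the wrap is visible */

static char pages[NR_PAGES][PAGE_SZ];

static void ring_copy(unsigned long head, const char *buf, size_t len)
{
	while (len) {
		unsigned long page = (head / PAGE_SZ) & (NR_PAGES - 1);
		unsigned long off = head % PAGE_SZ;
		size_t chunk = PAGE_SZ - off;	/* room left in this page */

		if (chunk > len)
			chunk = len;
		memcpy(&pages[page][off], buf, chunk);
		head += chunk;
		buf += chunk;
		len -= chunk;
	}
}

int main(void)
{
	const char *msg = "hello, ring buffer";
	int i;

	memset(pages, '.', sizeof(pages));
	ring_copy(5, msg, strlen(msg));
	for (i = 0; i < NR_PAGES; i++)
		printf("page %d: %.8s\n", i, pages[i]);
	return 0;
}
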
2811 3082
2812int perf_output_begin(struct perf_output_handle *handle, 3083int perf_output_begin(struct perf_output_handle *handle,
2813 struct perf_event *event, unsigned int size, 3084 struct perf_event *event, unsigned int size,
2814 int nmi, int sample) 3085 int nmi, int sample)
2815{ 3086{
2816 struct perf_event *output_event;
2817 struct perf_mmap_data *data; 3087 struct perf_mmap_data *data;
2818 unsigned long tail, offset, head; 3088 unsigned long tail, offset, head;
2819 int have_lost; 3089 int have_lost;
@@ -2830,10 +3100,6 @@ int perf_output_begin(struct perf_output_handle *handle,
2830 if (event->parent) 3100 if (event->parent)
2831 event = event->parent; 3101 event = event->parent;
2832 3102
2833 output_event = rcu_dereference(event->output);
2834 if (output_event)
2835 event = output_event;
2836
2837 data = rcu_dereference(event->data); 3103 data = rcu_dereference(event->data);
2838 if (!data) 3104 if (!data)
2839 goto out; 3105 goto out;
@@ -2844,13 +3110,13 @@ int perf_output_begin(struct perf_output_handle *handle,
2844 handle->sample = sample; 3110 handle->sample = sample;
2845 3111
2846 if (!data->nr_pages) 3112 if (!data->nr_pages)
2847 goto fail; 3113 goto out;
2848 3114
2849 have_lost = atomic_read(&data->lost); 3115 have_lost = local_read(&data->lost);
2850 if (have_lost) 3116 if (have_lost)
2851 size += sizeof(lost_event); 3117 size += sizeof(lost_event);
2852 3118
2853 perf_output_lock(handle); 3119 perf_output_get_handle(handle);
2854 3120
2855 do { 3121 do {
2856 /* 3122 /*
@@ -2860,24 +3126,28 @@ int perf_output_begin(struct perf_output_handle *handle,
2860 */ 3126 */
2861 tail = ACCESS_ONCE(data->user_page->data_tail); 3127 tail = ACCESS_ONCE(data->user_page->data_tail);
2862 smp_rmb(); 3128 smp_rmb();
2863 offset = head = atomic_long_read(&data->head); 3129 offset = head = local_read(&data->head);
2864 head += size; 3130 head += size;
2865 if (unlikely(!perf_output_space(data, tail, offset, head))) 3131 if (unlikely(!perf_output_space(data, tail, offset, head)))
2866 goto fail; 3132 goto fail;
2867 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); 3133 } while (local_cmpxchg(&data->head, offset, head) != offset);
2868 3134
2869 handle->offset = offset; 3135 if (head - local_read(&data->wakeup) > data->watermark)
2870 handle->head = head; 3136 local_add(data->watermark, &data->wakeup);
2871 3137
2872 if (head - tail > data->watermark) 3138 handle->page = offset >> (PAGE_SHIFT + page_order(data));
2873 atomic_set(&data->wakeup, 1); 3139 handle->page &= data->nr_pages - 1;
3140 handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1);
3141 handle->addr = data->data_pages[handle->page];
3142 handle->addr += handle->size;
3143 handle->size = (PAGE_SIZE << page_order(data)) - handle->size;
2874 3144
2875 if (have_lost) { 3145 if (have_lost) {
2876 lost_event.header.type = PERF_RECORD_LOST; 3146 lost_event.header.type = PERF_RECORD_LOST;
2877 lost_event.header.misc = 0; 3147 lost_event.header.misc = 0;
2878 lost_event.header.size = sizeof(lost_event); 3148 lost_event.header.size = sizeof(lost_event);
2879 lost_event.id = event->id; 3149 lost_event.id = event->id;
2880 lost_event.lost = atomic_xchg(&data->lost, 0); 3150 lost_event.lost = local_xchg(&data->lost, 0);
2881 3151
2882 perf_output_put(handle, lost_event); 3152 perf_output_put(handle, lost_event);
2883 } 3153 }
@@ -2885,8 +3155,8 @@ int perf_output_begin(struct perf_output_handle *handle,
2885 return 0; 3155 return 0;
2886 3156
2887fail: 3157fail:
2888 atomic_inc(&data->lost); 3158 local_inc(&data->lost);
2889 perf_output_unlock(handle); 3159 perf_output_put_handle(handle);
2890out: 3160out:
2891 rcu_read_unlock(); 3161 rcu_read_unlock();
2892 3162
@@ -2901,14 +3171,14 @@ void perf_output_end(struct perf_output_handle *handle)
2901 int wakeup_events = event->attr.wakeup_events; 3171 int wakeup_events = event->attr.wakeup_events;
2902 3172
2903 if (handle->sample && wakeup_events) { 3173 if (handle->sample && wakeup_events) {
2904 int events = atomic_inc_return(&data->events); 3174 int events = local_inc_return(&data->events);
2905 if (events >= wakeup_events) { 3175 if (events >= wakeup_events) {
2906 atomic_sub(wakeup_events, &data->events); 3176 local_sub(wakeup_events, &data->events);
2907 atomic_set(&data->wakeup, 1); 3177 local_inc(&data->wakeup);
2908 } 3178 }
2909 } 3179 }
2910 3180
2911 perf_output_unlock(handle); 3181 perf_output_put_handle(handle);
2912 rcu_read_unlock(); 3182 rcu_read_unlock();
2913} 3183}
2914 3184
@@ -3243,9 +3513,8 @@ static void perf_event_task_output(struct perf_event *event,
3243 struct perf_task_event *task_event) 3513 struct perf_task_event *task_event)
3244{ 3514{
3245 struct perf_output_handle handle; 3515 struct perf_output_handle handle;
3246 int size;
3247 struct task_struct *task = task_event->task; 3516 struct task_struct *task = task_event->task;
3248 int ret; 3517 int size, ret;
3249 3518
3250 size = task_event->event_id.header.size; 3519 size = task_event->event_id.header.size;
3251 ret = perf_output_begin(&handle, event, size, 0, 0); 3520 ret = perf_output_begin(&handle, event, size, 0, 0);
@@ -3259,8 +3528,6 @@ static void perf_event_task_output(struct perf_event *event,
3259 task_event->event_id.tid = perf_event_tid(event, task); 3528 task_event->event_id.tid = perf_event_tid(event, task);
3260 task_event->event_id.ptid = perf_event_tid(event, current); 3529 task_event->event_id.ptid = perf_event_tid(event, current);
3261 3530
3262 task_event->event_id.time = perf_clock();
3263
3264 perf_output_put(&handle, task_event->event_id); 3531 perf_output_put(&handle, task_event->event_id);
3265 3532
3266 perf_output_end(&handle); 3533 perf_output_end(&handle);
@@ -3268,7 +3535,7 @@ static void perf_event_task_output(struct perf_event *event,
3268 3535
3269static int perf_event_task_match(struct perf_event *event) 3536static int perf_event_task_match(struct perf_event *event)
3270{ 3537{
3271 if (event->state != PERF_EVENT_STATE_ACTIVE) 3538 if (event->state < PERF_EVENT_STATE_INACTIVE)
3272 return 0; 3539 return 0;
3273 3540
3274 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3541 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -3300,7 +3567,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3300 cpuctx = &get_cpu_var(perf_cpu_context); 3567 cpuctx = &get_cpu_var(perf_cpu_context);
3301 perf_event_task_ctx(&cpuctx->ctx, task_event); 3568 perf_event_task_ctx(&cpuctx->ctx, task_event);
3302 if (!ctx) 3569 if (!ctx)
3303 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3570 ctx = rcu_dereference(current->perf_event_ctxp);
3304 if (ctx) 3571 if (ctx)
3305 perf_event_task_ctx(ctx, task_event); 3572 perf_event_task_ctx(ctx, task_event);
3306 put_cpu_var(perf_cpu_context); 3573 put_cpu_var(perf_cpu_context);
@@ -3331,6 +3598,7 @@ static void perf_event_task(struct task_struct *task,
3331 /* .ppid */ 3598 /* .ppid */
3332 /* .tid */ 3599 /* .tid */
3333 /* .ptid */ 3600 /* .ptid */
3601 .time = perf_clock(),
3334 }, 3602 },
3335 }; 3603 };
3336 3604
@@ -3380,7 +3648,7 @@ static void perf_event_comm_output(struct perf_event *event,
3380 3648
3381static int perf_event_comm_match(struct perf_event *event) 3649static int perf_event_comm_match(struct perf_event *event)
3382{ 3650{
3383 if (event->state != PERF_EVENT_STATE_ACTIVE) 3651 if (event->state < PERF_EVENT_STATE_INACTIVE)
3384 return 0; 3652 return 0;
3385 3653
3386 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3654 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -3500,7 +3768,7 @@ static void perf_event_mmap_output(struct perf_event *event,
3500static int perf_event_mmap_match(struct perf_event *event, 3768static int perf_event_mmap_match(struct perf_event *event,
3501 struct perf_mmap_event *mmap_event) 3769 struct perf_mmap_event *mmap_event)
3502{ 3770{
3503 if (event->state != PERF_EVENT_STATE_ACTIVE) 3771 if (event->state < PERF_EVENT_STATE_INACTIVE)
3504 return 0; 3772 return 0;
3505 3773
3506 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3774 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -3602,14 +3870,14 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3602 .event_id = { 3870 .event_id = {
3603 .header = { 3871 .header = {
3604 .type = PERF_RECORD_MMAP, 3872 .type = PERF_RECORD_MMAP,
3605 .misc = 0, 3873 .misc = PERF_RECORD_MISC_USER,
3606 /* .size */ 3874 /* .size */
3607 }, 3875 },
3608 /* .pid */ 3876 /* .pid */
3609 /* .tid */ 3877 /* .tid */
3610 .start = vma->vm_start, 3878 .start = vma->vm_start,
3611 .len = vma->vm_end - vma->vm_start, 3879 .len = vma->vm_end - vma->vm_start,
3612 .pgoff = vma->vm_pgoff, 3880 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
3613 }, 3881 },
3614 }; 3882 };
3615 3883
@@ -3689,12 +3957,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3689 3957
3690 if (event->attr.freq) { 3958 if (event->attr.freq) {
3691 u64 now = perf_clock(); 3959 u64 now = perf_clock();
3692 s64 delta = now - hwc->freq_stamp; 3960 s64 delta = now - hwc->freq_time_stamp;
3693 3961
3694 hwc->freq_stamp = now; 3962 hwc->freq_time_stamp = now;
3695 3963
3696 if (delta > 0 && delta < TICK_NSEC) 3964 if (delta > 0 && delta < 2*TICK_NSEC)
3697 perf_adjust_period(event, NSEC_PER_SEC / (int)delta); 3965 perf_adjust_period(event, delta, hwc->last_period);
3698 } 3966 }
3699 3967
3700 /* 3968 /*
@@ -3790,13 +4058,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3790 } 4058 }
3791} 4059}
3792 4060
3793static void perf_swevent_unthrottle(struct perf_event *event)
3794{
3795 /*
3796 * Nothing to do, we already reset hwc->interrupts.
3797 */
3798}
3799
3800static void perf_swevent_add(struct perf_event *event, u64 nr, 4061static void perf_swevent_add(struct perf_event *event, u64 nr,
3801 int nmi, struct perf_sample_data *data, 4062 int nmi, struct perf_sample_data *data,
3802 struct pt_regs *regs) 4063 struct pt_regs *regs)
@@ -3820,39 +4081,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3820 perf_swevent_overflow(event, 0, nmi, data, regs); 4081 perf_swevent_overflow(event, 0, nmi, data, regs);
3821} 4082}
3822 4083
3823static int perf_swevent_is_counting(struct perf_event *event)
3824{
3825 /*
3826 * The event is active, we're good!
3827 */
3828 if (event->state == PERF_EVENT_STATE_ACTIVE)
3829 return 1;
3830
3831 /*
3832 * The event is off/error, not counting.
3833 */
3834 if (event->state != PERF_EVENT_STATE_INACTIVE)
3835 return 0;
3836
3837 /*
3838 * The event is inactive, if the context is active
3839 * we're part of a group that didn't make it on the 'pmu',
3840 * not counting.
3841 */
3842 if (event->ctx->is_active)
3843 return 0;
3844
3845 /*
3846 * We're inactive and the context is too, this means the
3847 * task is scheduled out, we're counting events that happen
3848 * to us, like migration events.
3849 */
3850 return 1;
3851}
3852
3853static int perf_tp_event_match(struct perf_event *event,
3854 struct perf_sample_data *data);
3855
3856static int perf_exclude_event(struct perf_event *event, 4084static int perf_exclude_event(struct perf_event *event,
3857 struct pt_regs *regs) 4085 struct pt_regs *regs)
3858{ 4086{
@@ -3873,12 +4101,6 @@ static int perf_swevent_match(struct perf_event *event,
3873 struct perf_sample_data *data, 4101 struct perf_sample_data *data,
3874 struct pt_regs *regs) 4102 struct pt_regs *regs)
3875{ 4103{
3876 if (event->cpu != -1 && event->cpu != smp_processor_id())
3877 return 0;
3878
3879 if (!perf_swevent_is_counting(event))
3880 return 0;
3881
3882 if (event->attr.type != type) 4104 if (event->attr.type != type)
3883 return 0; 4105 return 0;
3884 4106
@@ -3888,30 +4110,88 @@ static int perf_swevent_match(struct perf_event *event,
3888 if (perf_exclude_event(event, regs)) 4110 if (perf_exclude_event(event, regs))
3889 return 0; 4111 return 0;
3890 4112
3891 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3892 !perf_tp_event_match(event, data))
3893 return 0;
3894
3895 return 1; 4113 return 1;
3896} 4114}
3897 4115
3898static void perf_swevent_ctx_event(struct perf_event_context *ctx, 4116static inline u64 swevent_hash(u64 type, u32 event_id)
3899 enum perf_type_id type, 4117{
3900 u32 event_id, u64 nr, int nmi, 4118 u64 val = event_id | (type << 32);
3901 struct perf_sample_data *data, 4119
3902 struct pt_regs *regs) 4120 return hash_64(val, SWEVENT_HLIST_BITS);
4121}
4122
4123static inline struct hlist_head *
4124__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
3903{ 4125{
4126 u64 hash = swevent_hash(type, event_id);
4127
4128 return &hlist->heads[hash];
4129}
4130
4131/* For the read side: events when they trigger */
4132static inline struct hlist_head *
4133find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4134{
4135 struct swevent_hlist *hlist;
4136
4137 hlist = rcu_dereference(ctx->swevent_hlist);
4138 if (!hlist)
4139 return NULL;
4140
4141 return __find_swevent_head(hlist, type, event_id);
4142}
4143
4144/* For the event head insertion and removal in the hlist */
4145static inline struct hlist_head *
4146find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4147{
4148 struct swevent_hlist *hlist;
4149 u32 event_id = event->attr.config;
4150 u64 type = event->attr.type;
4151
4152 /*
4153 * Event scheduling is always serialized against hlist allocation
4154 * and release. Which makes the protected version suitable here.
4155 * The context lock guarantees that.
4156 */
4157 hlist = rcu_dereference_protected(ctx->swevent_hlist,
4158 lockdep_is_held(&event->ctx->lock));
4159 if (!hlist)
4160 return NULL;
4161
4162 return __find_swevent_head(hlist, type, event_id);
4163}
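
Software events are now dispatched through a per-cpu hash table of list heads keyed by (type, config), so a triggering event only walks its bucket instead of every event in the context. A small sketch of the bucketing; the multiplier here is a generic golden-ratio constant, not necessarily the one behind the kernel's hash_64():

#include <stdio.h>
#include <stdint.h>

#define SWEVENT_HLIST_BITS	8	/* table size used by the patch */

static unsigned int swevent_bucket(uint64_t type, uint32_t event_id)
{
	uint64_t val = event_id | (type << 32);	/* same key as swevent_hash() */

	/* multiplicative hash, keep the top SWEVENT_HLIST_BITS bits */
	return (unsigned int)((val * 0x9E3779B97F4A7C15ULL) >> (64 - SWEVENT_HLIST_BITS));
}

int main(void)
{
	/* PERF_TYPE_SOFTWARE = 1, PERF_COUNT_SW_PAGE_FAULTS = 2 */
	printf("bucket = %u\n", swevent_bucket(1, 2));
	return 0;
}
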
4164
4165static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4166 u64 nr, int nmi,
4167 struct perf_sample_data *data,
4168 struct pt_regs *regs)
4169{
4170 struct perf_cpu_context *cpuctx;
3904 struct perf_event *event; 4171 struct perf_event *event;
4172 struct hlist_node *node;
4173 struct hlist_head *head;
3905 4174
3906 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4175 cpuctx = &__get_cpu_var(perf_cpu_context);
4176
4177 rcu_read_lock();
4178
4179 head = find_swevent_head_rcu(cpuctx, type, event_id);
4180
4181 if (!head)
4182 goto end;
4183
4184 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
3907 if (perf_swevent_match(event, type, event_id, data, regs)) 4185 if (perf_swevent_match(event, type, event_id, data, regs))
3908 perf_swevent_add(event, nr, nmi, data, regs); 4186 perf_swevent_add(event, nr, nmi, data, regs);
3909 } 4187 }
4188end:
4189 rcu_read_unlock();
3910} 4190}
3911 4191
3912int perf_swevent_get_recursion_context(void) 4192int perf_swevent_get_recursion_context(void)
3913{ 4193{
3914 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 4194 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3915 int rctx; 4195 int rctx;
3916 4196
3917 if (in_nmi()) 4197 if (in_nmi())
@@ -3923,10 +4203,8 @@ int perf_swevent_get_recursion_context(void)
3923 else 4203 else
3924 rctx = 0; 4204 rctx = 0;
3925 4205
3926 if (cpuctx->recursion[rctx]) { 4206 if (cpuctx->recursion[rctx])
3927 put_cpu_var(perf_cpu_context);
3928 return -1; 4207 return -1;
3929 }
3930 4208
3931 cpuctx->recursion[rctx]++; 4209 cpuctx->recursion[rctx]++;
3932 barrier(); 4210 barrier();
@@ -3940,31 +4218,9 @@ void perf_swevent_put_recursion_context(int rctx)
3940 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4218 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3941 barrier(); 4219 barrier();
3942 cpuctx->recursion[rctx]--; 4220 cpuctx->recursion[rctx]--;
3943 put_cpu_var(perf_cpu_context);
3944} 4221}
3945EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); 4222EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
3946 4223
3947static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3948 u64 nr, int nmi,
3949 struct perf_sample_data *data,
3950 struct pt_regs *regs)
3951{
3952 struct perf_cpu_context *cpuctx;
3953 struct perf_event_context *ctx;
3954
3955 cpuctx = &__get_cpu_var(perf_cpu_context);
3956 rcu_read_lock();
3957 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3958 nr, nmi, data, regs);
3959 /*
3960 * doesn't really matter which of the child contexts the
3961 * events ends up in.
3962 */
3963 ctx = rcu_dereference(current->perf_event_ctxp);
3964 if (ctx)
3965 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3966 rcu_read_unlock();
3967}
3968 4224
3969void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4225void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3970 struct pt_regs *regs, u64 addr) 4226 struct pt_regs *regs, u64 addr)
@@ -3972,16 +4228,17 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3972 struct perf_sample_data data; 4228 struct perf_sample_data data;
3973 int rctx; 4229 int rctx;
3974 4230
4231 preempt_disable_notrace();
3975 rctx = perf_swevent_get_recursion_context(); 4232 rctx = perf_swevent_get_recursion_context();
3976 if (rctx < 0) 4233 if (rctx < 0)
3977 return; 4234 return;
3978 4235
3979 data.addr = addr; 4236 perf_sample_data_init(&data, addr);
3980 data.raw = NULL;
3981 4237
3982 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4238 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
3983 4239
3984 perf_swevent_put_recursion_context(rctx); 4240 perf_swevent_put_recursion_context(rctx);
4241 preempt_enable_notrace();
3985} 4242}
3986 4243
3987static void perf_swevent_read(struct perf_event *event) 4244static void perf_swevent_read(struct perf_event *event)
@@ -3991,23 +4248,46 @@ static void perf_swevent_read(struct perf_event *event)
3991static int perf_swevent_enable(struct perf_event *event) 4248static int perf_swevent_enable(struct perf_event *event)
3992{ 4249{
3993 struct hw_perf_event *hwc = &event->hw; 4250 struct hw_perf_event *hwc = &event->hw;
4251 struct perf_cpu_context *cpuctx;
4252 struct hlist_head *head;
4253
4254 cpuctx = &__get_cpu_var(perf_cpu_context);
3994 4255
3995 if (hwc->sample_period) { 4256 if (hwc->sample_period) {
3996 hwc->last_period = hwc->sample_period; 4257 hwc->last_period = hwc->sample_period;
3997 perf_swevent_set_period(event); 4258 perf_swevent_set_period(event);
3998 } 4259 }
4260
4261 head = find_swevent_head(cpuctx, event);
4262 if (WARN_ON_ONCE(!head))
4263 return -EINVAL;
4264
4265 hlist_add_head_rcu(&event->hlist_entry, head);
4266
3999 return 0; 4267 return 0;
4000} 4268}
4001 4269
4002static void perf_swevent_disable(struct perf_event *event) 4270static void perf_swevent_disable(struct perf_event *event)
4003{ 4271{
4272 hlist_del_rcu(&event->hlist_entry);
4273}
4274
4275static void perf_swevent_void(struct perf_event *event)
4276{
4277}
4278
4279static int perf_swevent_int(struct perf_event *event)
4280{
4281 return 0;
4004} 4282}
4005 4283
4006static const struct pmu perf_ops_generic = { 4284static const struct pmu perf_ops_generic = {
4007 .enable = perf_swevent_enable, 4285 .enable = perf_swevent_enable,
4008 .disable = perf_swevent_disable, 4286 .disable = perf_swevent_disable,
4287 .start = perf_swevent_int,
4288 .stop = perf_swevent_void,
4009 .read = perf_swevent_read, 4289 .read = perf_swevent_read,
4010 .unthrottle = perf_swevent_unthrottle, 4290 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
4011}; 4291};
4012 4292
4013/* 4293/*
@@ -4022,22 +4302,14 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4022 struct perf_event *event; 4302 struct perf_event *event;
4023 u64 period; 4303 u64 period;
4024 4304
4025 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 4305 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4026 event->pmu->read(event); 4306 event->pmu->read(event);
4027 4307
4028 data.addr = 0; 4308 perf_sample_data_init(&data, 0);
4029 data.raw = NULL;
4030 data.period = event->hw.last_period; 4309 data.period = event->hw.last_period;
4031 regs = get_irq_regs(); 4310 regs = get_irq_regs();
4032 /*
4033 * In case we exclude kernel IPs or are somehow not in interrupt
4034 * context, provide the next best thing, the user IP.
4035 */
4036 if ((event->attr.exclude_kernel || !regs) &&
4037 !event->attr.exclude_user)
4038 regs = task_pt_regs(current);
4039 4311
4040 if (regs) { 4312 if (regs && !perf_exclude_event(event, regs)) {
4041 if (!(event->attr.exclude_idle && current->pid == 0)) 4313 if (!(event->attr.exclude_idle && current->pid == 0))
4042 if (perf_event_overflow(event, 0, &data, regs)) 4314 if (perf_event_overflow(event, 0, &data, regs))
4043 ret = HRTIMER_NORESTART; 4315 ret = HRTIMER_NORESTART;
@@ -4185,33 +4457,124 @@ static const struct pmu perf_ops_task_clock = {
4185 .read = task_clock_perf_event_read, 4457 .read = task_clock_perf_event_read,
4186}; 4458};
4187 4459
4188#ifdef CONFIG_EVENT_PROFILE 4460/* Deref the hlist from the update side */
4461static inline struct swevent_hlist *
4462swevent_hlist_deref(struct perf_cpu_context *cpuctx)
4463{
4464 return rcu_dereference_protected(cpuctx->swevent_hlist,
4465 lockdep_is_held(&cpuctx->hlist_mutex));
4466}
4189 4467
4190void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4468static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4191 int entry_size)
4192{ 4469{
4193 struct perf_raw_record raw = { 4470 struct swevent_hlist *hlist;
4194 .size = entry_size,
4195 .data = record,
4196 };
4197 4471
4198 struct perf_sample_data data = { 4472 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
4199 .addr = addr, 4473 kfree(hlist);
4200 .raw = &raw, 4474}
4201 };
4202 4475
4203 struct pt_regs *regs = get_irq_regs(); 4476static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
4477{
4478 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
4204 4479
4205 if (!regs) 4480 if (!hlist)
4206 regs = task_pt_regs(current); 4481 return;
4207 4482
4208 /* Trace events already protected against recursion */ 4483 rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
4209 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4484 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4210 &data, regs);
4211} 4485}
4212EXPORT_SYMBOL_GPL(perf_tp_event);
4213 4486
4214static int perf_tp_event_match(struct perf_event *event, 4487static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4488{
4489 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4490
4491 mutex_lock(&cpuctx->hlist_mutex);
4492
4493 if (!--cpuctx->hlist_refcount)
4494 swevent_hlist_release(cpuctx);
4495
4496 mutex_unlock(&cpuctx->hlist_mutex);
4497}
4498
4499static void swevent_hlist_put(struct perf_event *event)
4500{
4501 int cpu;
4502
4503 if (event->cpu != -1) {
4504 swevent_hlist_put_cpu(event, event->cpu);
4505 return;
4506 }
4507
4508 for_each_possible_cpu(cpu)
4509 swevent_hlist_put_cpu(event, cpu);
4510}
4511
4512static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4513{
4514 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4515 int err = 0;
4516
4517 mutex_lock(&cpuctx->hlist_mutex);
4518
4519 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
4520 struct swevent_hlist *hlist;
4521
4522 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
4523 if (!hlist) {
4524 err = -ENOMEM;
4525 goto exit;
4526 }
4527 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
4528 }
4529 cpuctx->hlist_refcount++;
4530 exit:
4531 mutex_unlock(&cpuctx->hlist_mutex);
4532
4533 return err;
4534}
4535
4536static int swevent_hlist_get(struct perf_event *event)
4537{
4538 int err;
4539 int cpu, failed_cpu;
4540
4541 if (event->cpu != -1)
4542 return swevent_hlist_get_cpu(event, event->cpu);
4543
4544 get_online_cpus();
4545 for_each_possible_cpu(cpu) {
4546 err = swevent_hlist_get_cpu(event, cpu);
4547 if (err) {
4548 failed_cpu = cpu;
4549 goto fail;
4550 }
4551 }
4552 put_online_cpus();
4553
4554 return 0;
4555 fail:
4556 for_each_possible_cpu(cpu) {
4557 if (cpu == failed_cpu)
4558 break;
4559 swevent_hlist_put_cpu(event, cpu);
4560 }
4561
4562 put_online_cpus();
4563 return err;
4564}
4565
4566#ifdef CONFIG_EVENT_TRACING
4567
4568static const struct pmu perf_ops_tracepoint = {
4569 .enable = perf_trace_enable,
4570 .disable = perf_trace_disable,
4571 .start = perf_swevent_int,
4572 .stop = perf_swevent_void,
4573 .read = perf_swevent_read,
4574 .unthrottle = perf_swevent_void,
4575};
4576
4577static int perf_tp_filter_match(struct perf_event *event,
4215 struct perf_sample_data *data) 4578 struct perf_sample_data *data)
4216{ 4579{
4217 void *record = data->raw->data; 4580 void *record = data->raw->data;
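
The swevent_hlist helpers earlier in this hunk follow a common RCU update-side idiom: the writer, who holds hlist_mutex, dereferences the pointer with rcu_dereference_protected() plus lockdep_is_held() (so lockdep can check the claim), publishes a replacement with rcu_assign_pointer(), and frees the old object only after a grace period via call_rcu(). A minimal, generic sketch of the same pattern follows; struct foo and the foo_* names are made up for illustration, only the RCU, lockdep and allocator primitives are real.

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int		value;
	struct rcu_head	rcu_head;
};

static struct foo *global_foo;		/* written under foo_mutex, read under RCU */
static DEFINE_MUTEX(foo_mutex);

/* Update-side dereference: legal because foo_mutex is held. */
static struct foo *foo_deref(void)
{
	return rcu_dereference_protected(global_foo,
					 lockdep_is_held(&foo_mutex));
}

static void foo_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct foo, rcu_head));
}

/* Publish a new object and free the old one after a grace period. */
static int foo_replace(int value)
{
	struct foo *new, *old;

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;
	new->value = value;

	mutex_lock(&foo_mutex);
	old = foo_deref();
	rcu_assign_pointer(global_foo, new);
	mutex_unlock(&foo_mutex);

	if (old)
		call_rcu(&old->rcu_head, foo_free_rcu);
	return 0;
}

/* Readers take no lock; they use the plain RCU read-side primitives. */
static int foo_read_value(void)
{
	struct foo *f;
	int val = -1;

	rcu_read_lock();
	f = rcu_dereference(global_foo);
	if (f)
		val = f->value;
	rcu_read_unlock();
	return val;
}
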
@@ -4221,13 +4584,55 @@ static int perf_tp_event_match(struct perf_event *event,
4221 return 0; 4584 return 0;
4222} 4585}
4223 4586
4587static int perf_tp_event_match(struct perf_event *event,
4588 struct perf_sample_data *data,
4589 struct pt_regs *regs)
4590{
4591 /*
4592 * All tracepoints are from kernel-space.
4593 */
4594 if (event->attr.exclude_kernel)
4595 return 0;
4596
4597 if (!perf_tp_filter_match(event, data))
4598 return 0;
4599
4600 return 1;
4601}
4602
4603void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4604 struct pt_regs *regs, struct hlist_head *head)
4605{
4606 struct perf_sample_data data;
4607 struct perf_event *event;
4608 struct hlist_node *node;
4609
4610 struct perf_raw_record raw = {
4611 .size = entry_size,
4612 .data = record,
4613 };
4614
4615 perf_sample_data_init(&data, addr);
4616 data.raw = &raw;
4617
4618 rcu_read_lock();
4619 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4620 if (perf_tp_event_match(event, &data, regs))
4621 perf_swevent_add(event, count, 1, &data, regs);
4622 }
4623 rcu_read_unlock();
4624}
4625EXPORT_SYMBOL_GPL(perf_tp_event);
4626
4224static void tp_perf_event_destroy(struct perf_event *event) 4627static void tp_perf_event_destroy(struct perf_event *event)
4225{ 4628{
4226 ftrace_profile_disable(event->attr.config); 4629 perf_trace_destroy(event);
4227} 4630}
4228 4631
4229static const struct pmu *tp_perf_event_init(struct perf_event *event) 4632static const struct pmu *tp_perf_event_init(struct perf_event *event)
4230{ 4633{
4634 int err;
4635
4231 /* 4636 /*
4232 * Raw tracepoint data is a severe data leak, only allow root to 4637 * Raw tracepoint data is a severe data leak, only allow root to
4233 * have these. 4638 * have these.
@@ -4237,12 +4642,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4237 !capable(CAP_SYS_ADMIN)) 4642 !capable(CAP_SYS_ADMIN))
4238 return ERR_PTR(-EPERM); 4643 return ERR_PTR(-EPERM);
4239 4644
4240 if (ftrace_profile_enable(event->attr.config)) 4645 err = perf_trace_init(event);
4646 if (err)
4241 return NULL; 4647 return NULL;
4242 4648
4243 event->destroy = tp_perf_event_destroy; 4649 event->destroy = tp_perf_event_destroy;
4244 4650
4245 return &perf_ops_generic; 4651 return &perf_ops_tracepoint;
4246} 4652}
4247 4653
4248static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4654static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4270,12 +4676,6 @@ static void perf_event_free_filter(struct perf_event *event)
4270 4676
4271#else 4677#else
4272 4678
4273static int perf_tp_event_match(struct perf_event *event,
4274 struct perf_sample_data *data)
4275{
4276 return 1;
4277}
4278
4279static const struct pmu *tp_perf_event_init(struct perf_event *event) 4679static const struct pmu *tp_perf_event_init(struct perf_event *event)
4280{ 4680{
4281 return NULL; 4681 return NULL;
@@ -4290,7 +4690,7 @@ static void perf_event_free_filter(struct perf_event *event)
4290{ 4690{
4291} 4691}
4292 4692
4293#endif /* CONFIG_EVENT_PROFILE */ 4693#endif /* CONFIG_EVENT_TRACING */
4294 4694
4295#ifdef CONFIG_HAVE_HW_BREAKPOINT 4695#ifdef CONFIG_HAVE_HW_BREAKPOINT
4296static void bp_perf_event_destroy(struct perf_event *event) 4696static void bp_perf_event_destroy(struct perf_event *event)
@@ -4316,8 +4716,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
4316 struct perf_sample_data sample; 4716 struct perf_sample_data sample;
4317 struct pt_regs *regs = data; 4717 struct pt_regs *regs = data;
4318 4718
4319 sample.raw = NULL; 4719 perf_sample_data_init(&sample, bp->attr.bp_addr);
4320 sample.addr = bp->attr.bp_addr;
4321 4720
4322 if (!perf_exclude_event(bp, regs)) 4721 if (!perf_exclude_event(bp, regs))
4323 perf_swevent_add(bp, 1, 1, &sample, regs); 4722 perf_swevent_add(bp, 1, 1, &sample, regs);
@@ -4342,6 +4741,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
4342 WARN_ON(event->parent); 4741 WARN_ON(event->parent);
4343 4742
4344 atomic_dec(&perf_swevent_enabled[event_id]); 4743 atomic_dec(&perf_swevent_enabled[event_id]);
4744 swevent_hlist_put(event);
4345} 4745}
4346 4746
4347static const struct pmu *sw_perf_event_init(struct perf_event *event) 4747static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4380,6 +4780,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4380 case PERF_COUNT_SW_ALIGNMENT_FAULTS: 4780 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4381 case PERF_COUNT_SW_EMULATION_FAULTS: 4781 case PERF_COUNT_SW_EMULATION_FAULTS:
4382 if (!event->parent) { 4782 if (!event->parent) {
4783 int err;
4784
4785 err = swevent_hlist_get(event);
4786 if (err)
4787 return ERR_PTR(err);
4788
4383 atomic_inc(&perf_swevent_enabled[event_id]); 4789 atomic_inc(&perf_swevent_enabled[event_id]);
4384 event->destroy = sw_perf_event_destroy; 4790 event->destroy = sw_perf_event_destroy;
4385 } 4791 }
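
Software events such as these are ordinary perf events from userspace's point of view; the hlist reference taken in sw_perf_event_init() is invisible to callers. For orientation, a minimal userspace sketch (error handling trimmed; the perf_event_open() wrapper is local because glibc provides none) that opens and reads one software counter:

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_PAGE_FAULTS;
	attr.disabled = 1;

	fd = perf_event_open(&attr, 0 /* this task */, -1 /* any cpu */, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... workload under measurement would run here ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("page faults: %llu\n", (unsigned long long)count);

	close(fd);
	return 0;
}
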
@@ -4580,7 +4986,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
4580 if (attr->type >= PERF_TYPE_MAX) 4986 if (attr->type >= PERF_TYPE_MAX)
4581 return -EINVAL; 4987 return -EINVAL;
4582 4988
4583 if (attr->__reserved_1 || attr->__reserved_2) 4989 if (attr->__reserved_1)
4584 return -EINVAL; 4990 return -EINVAL;
4585 4991
4586 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 4992 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -4598,54 +5004,53 @@ err_size:
4598 goto out; 5004 goto out;
4599} 5005}
4600 5006
4601static int perf_event_set_output(struct perf_event *event, int output_fd) 5007static int
5008perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
4602{ 5009{
4603 struct perf_event *output_event = NULL; 5010 struct perf_mmap_data *data = NULL, *old_data = NULL;
4604 struct file *output_file = NULL;
4605 struct perf_event *old_output;
4606 int fput_needed = 0;
4607 int ret = -EINVAL; 5011 int ret = -EINVAL;
4608 5012
4609 if (!output_fd) 5013 if (!output_event)
4610 goto set; 5014 goto set;
4611 5015
4612 output_file = fget_light(output_fd, &fput_needed); 5016 /* don't allow circular references */
4613 if (!output_file) 5017 if (event == output_event)
4614 return -EBADF;
4615
4616 if (output_file->f_op != &perf_fops)
4617 goto out; 5018 goto out;
4618 5019
4619 output_event = output_file->private_data; 5020 /*
4620 5021 * Don't allow cross-cpu buffers
4621 /* Don't chain output fds */ 5022 */
4622 if (output_event->output) 5023 if (output_event->cpu != event->cpu)
4623 goto out; 5024 goto out;
4624 5025
4625 /* Don't set an output fd when we already have an output channel */ 5026 /*
4626 if (event->data) 5027 * If its not a per-cpu buffer, it must be the same task.
5028 */
5029 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
4627 goto out; 5030 goto out;
4628 5031
4629 atomic_long_inc(&output_file->f_count);
4630
4631set: 5032set:
4632 mutex_lock(&event->mmap_mutex); 5033 mutex_lock(&event->mmap_mutex);
4633 old_output = event->output; 5034 /* Can't redirect output if we've got an active mmap() */
4634 rcu_assign_pointer(event->output, output_event); 5035 if (atomic_read(&event->mmap_count))
4635 mutex_unlock(&event->mmap_mutex); 5036 goto unlock;
4636 5037
4637 if (old_output) { 5038 if (output_event) {
4638 /* 5039 /* get the buffer we want to redirect to */
4639 * we need to make sure no existing perf_output_*() 5040 data = perf_mmap_data_get(output_event);
4640 * is still referencing this event. 5041 if (!data)
4641 */ 5042 goto unlock;
4642 synchronize_rcu();
4643 fput(old_output->filp);
4644 } 5043 }
4645 5044
5045 old_data = event->data;
5046 rcu_assign_pointer(event->data, data);
4646 ret = 0; 5047 ret = 0;
5048unlock:
5049 mutex_unlock(&event->mmap_mutex);
5050
5051 if (old_data)
5052 perf_mmap_data_put(old_data);
4647out: 5053out:
4648 fput_light(output_file, fput_needed);
4649 return ret; 5054 return ret;
4650} 5055}
4651 5056
@@ -4661,13 +5066,13 @@ SYSCALL_DEFINE5(perf_event_open,
4661 struct perf_event_attr __user *, attr_uptr, 5066 struct perf_event_attr __user *, attr_uptr,
4662 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 5067 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4663{ 5068{
4664 struct perf_event *event, *group_leader; 5069 struct perf_event *event, *group_leader = NULL, *output_event = NULL;
4665 struct perf_event_attr attr; 5070 struct perf_event_attr attr;
4666 struct perf_event_context *ctx; 5071 struct perf_event_context *ctx;
4667 struct file *event_file = NULL; 5072 struct file *event_file = NULL;
4668 struct file *group_file = NULL; 5073 struct file *group_file = NULL;
5074 int event_fd;
4669 int fput_needed = 0; 5075 int fput_needed = 0;
4670 int fput_needed2 = 0;
4671 int err; 5076 int err;
4672 5077
4673 /* for future expandability... */ 5078 /* for future expandability... */
@@ -4688,26 +5093,38 @@ SYSCALL_DEFINE5(perf_event_open,
4688 return -EINVAL; 5093 return -EINVAL;
4689 } 5094 }
4690 5095
5096 event_fd = get_unused_fd_flags(O_RDWR);
5097 if (event_fd < 0)
5098 return event_fd;
5099
4691 /* 5100 /*
4692 * Get the target context (task or percpu): 5101 * Get the target context (task or percpu):
4693 */ 5102 */
4694 ctx = find_get_context(pid, cpu); 5103 ctx = find_get_context(pid, cpu);
4695 if (IS_ERR(ctx)) 5104 if (IS_ERR(ctx)) {
4696 return PTR_ERR(ctx); 5105 err = PTR_ERR(ctx);
5106 goto err_fd;
5107 }
5108
5109 if (group_fd != -1) {
5110 group_leader = perf_fget_light(group_fd, &fput_needed);
5111 if (IS_ERR(group_leader)) {
5112 err = PTR_ERR(group_leader);
5113 goto err_put_context;
5114 }
5115 group_file = group_leader->filp;
5116 if (flags & PERF_FLAG_FD_OUTPUT)
5117 output_event = group_leader;
5118 if (flags & PERF_FLAG_FD_NO_GROUP)
5119 group_leader = NULL;
5120 }
4697 5121
4698 /* 5122 /*
4699 * Look up the group leader (we will attach this event to it): 5123 * Look up the group leader (we will attach this event to it):
4700 */ 5124 */
4701 group_leader = NULL; 5125 if (group_leader) {
4702 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4703 err = -EINVAL; 5126 err = -EINVAL;
4704 group_file = fget_light(group_fd, &fput_needed);
4705 if (!group_file)
4706 goto err_put_context;
4707 if (group_file->f_op != &perf_fops)
4708 goto err_put_context;
4709 5127
4710 group_leader = group_file->private_data;
4711 /* 5128 /*
4712 * Do not allow a recursive hierarchy (this new sibling 5129 * Do not allow a recursive hierarchy (this new sibling
4713 * becoming part of another group-sibling): 5130 * becoming part of another group-sibling):
@@ -4729,22 +5146,21 @@ SYSCALL_DEFINE5(perf_event_open,
4729 5146
4730 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 5147 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4731 NULL, NULL, GFP_KERNEL); 5148 NULL, NULL, GFP_KERNEL);
4732 err = PTR_ERR(event); 5149 if (IS_ERR(event)) {
4733 if (IS_ERR(event)) 5150 err = PTR_ERR(event);
4734 goto err_put_context; 5151 goto err_put_context;
5152 }
4735 5153
4736 err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR); 5154 if (output_event) {
4737 if (err < 0) 5155 err = perf_event_set_output(event, output_event);
4738 goto err_free_put_context; 5156 if (err)
5157 goto err_free_put_context;
5158 }
4739 5159
4740 event_file = fget_light(err, &fput_needed2); 5160 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
4741 if (!event_file) 5161 if (IS_ERR(event_file)) {
5162 err = PTR_ERR(event_file);
4742 goto err_free_put_context; 5163 goto err_free_put_context;
4743
4744 if (flags & PERF_FLAG_FD_OUTPUT) {
4745 err = perf_event_set_output(event, group_fd);
4746 if (err)
4747 goto err_fput_free_put_context;
4748 } 5164 }
4749 5165
4750 event->filp = event_file; 5166 event->filp = event_file;
@@ -4760,19 +5176,23 @@ SYSCALL_DEFINE5(perf_event_open,
4760 list_add_tail(&event->owner_entry, &current->perf_event_list); 5176 list_add_tail(&event->owner_entry, &current->perf_event_list);
4761 mutex_unlock(&current->perf_event_mutex); 5177 mutex_unlock(&current->perf_event_mutex);
4762 5178
4763err_fput_free_put_context: 5179 /*
4764 fput_light(event_file, fput_needed2); 5180 * Drop the reference on the group_event after placing the
5181 * new event on the sibling_list. This ensures destruction
5182 * of the group leader will find the pointer to itself in
5183 * perf_group_detach().
5184 */
5185 fput_light(group_file, fput_needed);
5186 fd_install(event_fd, event_file);
5187 return event_fd;
4765 5188
4766err_free_put_context: 5189err_free_put_context:
4767 if (err < 0) 5190 free_event(event);
4768 kfree(event);
4769
4770err_put_context: 5191err_put_context:
4771 if (err < 0)
4772 put_ctx(ctx);
4773
4774 fput_light(group_file, fput_needed); 5192 fput_light(group_file, fput_needed);
4775 5193 put_ctx(ctx);
5194err_fd:
5195 put_unused_fd(event_fd);
4776 return err; 5196 return err;
4777} 5197}
4778 5198
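
Putting the new perf_event_set_output() rules together from userspace: the redirect target must already have a buffer (perf_mmap_data_get() fails otherwise), the two events must share a CPU or a task context, and the redirected event itself must not be mmap()ed. A hedged sketch, with error handling trimmed and the sample settings chosen arbitrarily:

#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	size_t len = (1 + 8) * getpagesize();	/* header page + 8 data pages */
	int leader, follower;
	void *base;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	attr.sample_period = 100000;		/* arbitrary */
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;

	leader = perf_event_open(&attr, 0 /* this task */, -1 /* any cpu */, -1, 0);
	if (leader < 0) {
		perror("perf_event_open(leader)");
		return 1;
	}

	/* The output event needs a buffer before anyone can redirect into it. */
	base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, leader, 0);
	if (base == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Same task, same cpu (-1): this event's samples land in the
	 * leader's ring buffer, so no second mmap() is needed. */
	attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
	attr.sample_period = 1;
	follower = perf_event_open(&attr, 0, -1, leader, PERF_FLAG_FD_OUTPUT);
	if (follower < 0) {
		perror("perf_event_open(follower)");
		return 1;
	}

	/* ... parse struct perf_event_header records out of 'base' ... */

	close(follower);
	munmap(base, len);
	close(leader);
	return 0;
}
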
@@ -4871,8 +5291,15 @@ inherit_event(struct perf_event *parent_event,
4871 else 5291 else
4872 child_event->state = PERF_EVENT_STATE_OFF; 5292 child_event->state = PERF_EVENT_STATE_OFF;
4873 5293
4874 if (parent_event->attr.freq) 5294 if (parent_event->attr.freq) {
4875 child_event->hw.sample_period = parent_event->hw.sample_period; 5295 u64 sample_period = parent_event->hw.sample_period;
5296 struct hw_perf_event *hwc = &child_event->hw;
5297
5298 hwc->sample_period = sample_period;
5299 hwc->last_period = sample_period;
5300
5301 atomic64_set(&hwc->period_left, sample_period);
5302 }
4876 5303
4877 child_event->overflow_handler = parent_event->overflow_handler; 5304 child_event->overflow_handler = parent_event->overflow_handler;
4878 5305
@@ -5037,10 +5464,14 @@ void perf_event_exit_task(struct task_struct *child)
5037 * 5464 *
5038 * But since its the parent context it won't be the same instance. 5465 * But since its the parent context it won't be the same instance.
5039 */ 5466 */
5040 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5467 mutex_lock(&child_ctx->mutex);
5041 5468
5042again: 5469again:
5043 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, 5470 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
5471 group_entry)
5472 __perf_event_exit_task(child_event, child_ctx, child);
5473
5474 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
5044 group_entry) 5475 group_entry)
5045 __perf_event_exit_task(child_event, child_ctx, child); 5476 __perf_event_exit_task(child_event, child_ctx, child);
5046 5477
@@ -5049,7 +5480,8 @@ again:
5049 * its siblings to the list, but we obtained 'tmp' before that which 5480 * its siblings to the list, but we obtained 'tmp' before that which
5050 * will still point to the list head terminating the iteration. 5481 * will still point to the list head terminating the iteration.
5051 */ 5482 */
5052 if (!list_empty(&child_ctx->group_list)) 5483 if (!list_empty(&child_ctx->pinned_groups) ||
5484 !list_empty(&child_ctx->flexible_groups))
5053 goto again; 5485 goto again;
5054 5486
5055 mutex_unlock(&child_ctx->mutex); 5487 mutex_unlock(&child_ctx->mutex);
@@ -5057,6 +5489,25 @@ again:
5057 put_ctx(child_ctx); 5489 put_ctx(child_ctx);
5058} 5490}
5059 5491
5492static void perf_free_event(struct perf_event *event,
5493 struct perf_event_context *ctx)
5494{
5495 struct perf_event *parent = event->parent;
5496
5497 if (WARN_ON_ONCE(!parent))
5498 return;
5499
5500 mutex_lock(&parent->child_mutex);
5501 list_del_init(&event->child_list);
5502 mutex_unlock(&parent->child_mutex);
5503
5504 fput(parent->filp);
5505
5506 perf_group_detach(event);
5507 list_del_event(event, ctx);
5508 free_event(event);
5509}
5510
5060/* 5511/*
5061 * free an unexposed, unused context as created by inheritance by 5512 * free an unexposed, unused context as created by inheritance by
5062 * init_task below, used by fork() in case of fail. 5513 * init_task below, used by fork() in case of fail.
@@ -5071,36 +5522,70 @@ void perf_event_free_task(struct task_struct *task)
5071 5522
5072 mutex_lock(&ctx->mutex); 5523 mutex_lock(&ctx->mutex);
5073again: 5524again:
5074 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { 5525 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5075 struct perf_event *parent = event->parent; 5526 perf_free_event(event, ctx);
5076 5527
5077 if (WARN_ON_ONCE(!parent)) 5528 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5078 continue; 5529 group_entry)
5530 perf_free_event(event, ctx);
5079 5531
5080 mutex_lock(&parent->child_mutex); 5532 if (!list_empty(&ctx->pinned_groups) ||
5081 list_del_init(&event->child_list); 5533 !list_empty(&ctx->flexible_groups))
5082 mutex_unlock(&parent->child_mutex); 5534 goto again;
5083 5535
5084 fput(parent->filp); 5536 mutex_unlock(&ctx->mutex);
5085 5537
5086 list_del_event(event, ctx); 5538 put_ctx(ctx);
5087 free_event(event); 5539}
5540
5541static int
5542inherit_task_group(struct perf_event *event, struct task_struct *parent,
5543 struct perf_event_context *parent_ctx,
5544 struct task_struct *child,
5545 int *inherited_all)
5546{
5547 int ret;
5548 struct perf_event_context *child_ctx = child->perf_event_ctxp;
5549
5550 if (!event->attr.inherit) {
5551 *inherited_all = 0;
5552 return 0;
5088 } 5553 }
5089 5554
5090 if (!list_empty(&ctx->group_list)) 5555 if (!child_ctx) {
5091 goto again; 5556 /*
5557 * This is executed from the parent task context, so
5558 * inherit events that have been marked for cloning.
5559 * First allocate and initialize a context for the
5560 * child.
5561 */
5092 5562
5093 mutex_unlock(&ctx->mutex); 5563 child_ctx = kzalloc(sizeof(struct perf_event_context),
5564 GFP_KERNEL);
5565 if (!child_ctx)
5566 return -ENOMEM;
5094 5567
5095 put_ctx(ctx); 5568 __perf_event_init_context(child_ctx, child);
5569 child->perf_event_ctxp = child_ctx;
5570 get_task_struct(child);
5571 }
5572
5573 ret = inherit_group(event, parent, parent_ctx,
5574 child, child_ctx);
5575
5576 if (ret)
5577 *inherited_all = 0;
5578
5579 return ret;
5096} 5580}
5097 5581
5582
5098/* 5583/*
5099 * Initialize the perf_event context in task_struct 5584 * Initialize the perf_event context in task_struct
5100 */ 5585 */
5101int perf_event_init_task(struct task_struct *child) 5586int perf_event_init_task(struct task_struct *child)
5102{ 5587{
5103 struct perf_event_context *child_ctx = NULL, *parent_ctx; 5588 struct perf_event_context *child_ctx, *parent_ctx;
5104 struct perf_event_context *cloned_ctx; 5589 struct perf_event_context *cloned_ctx;
5105 struct perf_event *event; 5590 struct perf_event *event;
5106 struct task_struct *parent = current; 5591 struct task_struct *parent = current;
@@ -5138,41 +5623,22 @@ int perf_event_init_task(struct task_struct *child)
5138 * We dont have to disable NMIs - we are only looking at 5623 * We dont have to disable NMIs - we are only looking at
5139 * the list, not manipulating it: 5624 * the list, not manipulating it:
5140 */ 5625 */
5141 list_for_each_entry(event, &parent_ctx->group_list, group_entry) { 5626 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5142 5627 ret = inherit_task_group(event, parent, parent_ctx, child,
5143 if (!event->attr.inherit) { 5628 &inherited_all);
5144 inherited_all = 0; 5629 if (ret)
5145 continue; 5630 break;
5146 } 5631 }
5147
5148 if (!child->perf_event_ctxp) {
5149 /*
5150 * This is executed from the parent task context, so
5151 * inherit events that have been marked for cloning.
5152 * First allocate and initialize a context for the
5153 * child.
5154 */
5155
5156 child_ctx = kzalloc(sizeof(struct perf_event_context),
5157 GFP_KERNEL);
5158 if (!child_ctx) {
5159 ret = -ENOMEM;
5160 break;
5161 }
5162
5163 __perf_event_init_context(child_ctx, child);
5164 child->perf_event_ctxp = child_ctx;
5165 get_task_struct(child);
5166 }
5167 5632
5168 ret = inherit_group(event, parent, parent_ctx, 5633 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5169 child, child_ctx); 5634 ret = inherit_task_group(event, parent, parent_ctx, child,
5170 if (ret) { 5635 &inherited_all);
5171 inherited_all = 0; 5636 if (ret)
5172 break; 5637 break;
5173 }
5174 } 5638 }
5175 5639
5640 child_ctx = child->perf_event_ctxp;
5641
5176 if (child_ctx && inherited_all) { 5642 if (child_ctx && inherited_all) {
5177 /* 5643 /*
5178 * Mark the child context as a clone of the parent 5644 * Mark the child context as a clone of the parent
@@ -5200,18 +5666,37 @@ int perf_event_init_task(struct task_struct *child)
5200 return ret; 5666 return ret;
5201} 5667}
5202 5668
5669static void __init perf_event_init_all_cpus(void)
5670{
5671 int cpu;
5672 struct perf_cpu_context *cpuctx;
5673
5674 for_each_possible_cpu(cpu) {
5675 cpuctx = &per_cpu(perf_cpu_context, cpu);
5676 mutex_init(&cpuctx->hlist_mutex);
5677 __perf_event_init_context(&cpuctx->ctx, NULL);
5678 }
5679}
5680
5203static void __cpuinit perf_event_init_cpu(int cpu) 5681static void __cpuinit perf_event_init_cpu(int cpu)
5204{ 5682{
5205 struct perf_cpu_context *cpuctx; 5683 struct perf_cpu_context *cpuctx;
5206 5684
5207 cpuctx = &per_cpu(perf_cpu_context, cpu); 5685 cpuctx = &per_cpu(perf_cpu_context, cpu);
5208 __perf_event_init_context(&cpuctx->ctx, NULL);
5209 5686
5210 spin_lock(&perf_resource_lock); 5687 spin_lock(&perf_resource_lock);
5211 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 5688 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5212 spin_unlock(&perf_resource_lock); 5689 spin_unlock(&perf_resource_lock);
5213 5690
5214 hw_perf_event_setup(cpu); 5691 mutex_lock(&cpuctx->hlist_mutex);
5692 if (cpuctx->hlist_refcount > 0) {
5693 struct swevent_hlist *hlist;
5694
5695 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5696 WARN_ON_ONCE(!hlist);
5697 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
5698 }
5699 mutex_unlock(&cpuctx->hlist_mutex);
5215} 5700}
5216 5701
5217#ifdef CONFIG_HOTPLUG_CPU 5702#ifdef CONFIG_HOTPLUG_CPU
@@ -5221,7 +5706,9 @@ static void __perf_event_exit_cpu(void *info)
5221 struct perf_event_context *ctx = &cpuctx->ctx; 5706 struct perf_event_context *ctx = &cpuctx->ctx;
5222 struct perf_event *event, *tmp; 5707 struct perf_event *event, *tmp;
5223 5708
5224 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) 5709 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5710 __perf_event_remove_from_context(event);
5711 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5225 __perf_event_remove_from_context(event); 5712 __perf_event_remove_from_context(event);
5226} 5713}
5227static void perf_event_exit_cpu(int cpu) 5714static void perf_event_exit_cpu(int cpu)
@@ -5229,6 +5716,10 @@ static void perf_event_exit_cpu(int cpu)
5229 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 5716 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5230 struct perf_event_context *ctx = &cpuctx->ctx; 5717 struct perf_event_context *ctx = &cpuctx->ctx;
5231 5718
5719 mutex_lock(&cpuctx->hlist_mutex);
5720 swevent_hlist_release(cpuctx);
5721 mutex_unlock(&cpuctx->hlist_mutex);
5722
5232 mutex_lock(&ctx->mutex); 5723 mutex_lock(&ctx->mutex);
5233 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); 5724 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5234 mutex_unlock(&ctx->mutex); 5725 mutex_unlock(&ctx->mutex);
@@ -5249,11 +5740,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5249 perf_event_init_cpu(cpu); 5740 perf_event_init_cpu(cpu);
5250 break; 5741 break;
5251 5742
5252 case CPU_ONLINE:
5253 case CPU_ONLINE_FROZEN:
5254 hw_perf_event_setup_online(cpu);
5255 break;
5256
5257 case CPU_DOWN_PREPARE: 5743 case CPU_DOWN_PREPARE:
5258 case CPU_DOWN_PREPARE_FROZEN: 5744 case CPU_DOWN_PREPARE_FROZEN:
5259 perf_event_exit_cpu(cpu); 5745 perf_event_exit_cpu(cpu);
@@ -5276,6 +5762,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = {
5276 5762
5277void __init perf_event_init(void) 5763void __init perf_event_init(void)
5278{ 5764{
5765 perf_event_init_all_cpus();
5279 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 5766 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5280 (void *)(long)smp_processor_id()); 5767 (void *)(long)smp_processor_id());
5281 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 5768 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
@@ -5283,13 +5770,16 @@ void __init perf_event_init(void)
5283 register_cpu_notifier(&perf_cpu_nb); 5770 register_cpu_notifier(&perf_cpu_nb);
5284} 5771}
5285 5772
5286static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) 5773static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5774 struct sysdev_class_attribute *attr,
5775 char *buf)
5287{ 5776{
5288 return sprintf(buf, "%d\n", perf_reserved_percpu); 5777 return sprintf(buf, "%d\n", perf_reserved_percpu);
5289} 5778}
5290 5779
5291static ssize_t 5780static ssize_t
5292perf_set_reserve_percpu(struct sysdev_class *class, 5781perf_set_reserve_percpu(struct sysdev_class *class,
5782 struct sysdev_class_attribute *attr,
5293 const char *buf, 5783 const char *buf,
5294 size_t count) 5784 size_t count)
5295{ 5785{
@@ -5318,13 +5808,17 @@ perf_set_reserve_percpu(struct sysdev_class *class,
5318 return count; 5808 return count;
5319} 5809}
5320 5810
5321static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) 5811static ssize_t perf_show_overcommit(struct sysdev_class *class,
5812 struct sysdev_class_attribute *attr,
5813 char *buf)
5322{ 5814{
5323 return sprintf(buf, "%d\n", perf_overcommit); 5815 return sprintf(buf, "%d\n", perf_overcommit);
5324} 5816}
5325 5817
5326static ssize_t 5818static ssize_t
5327perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) 5819perf_set_overcommit(struct sysdev_class *class,
5820 struct sysdev_class_attribute *attr,
5821 const char *buf, size_t count)
5328{ 5822{
5329 unsigned long val; 5823 unsigned long val;
5330 int err; 5824 int err;
diff --git a/kernel/pid.c b/kernel/pid.c
index 2e17c9c92cbe..e9fd8c132d26 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
367 struct task_struct *result = NULL; 367 struct task_struct *result = NULL;
368 if (pid) { 368 if (pid) {
369 struct hlist_node *first; 369 struct hlist_node *first;
370 first = rcu_dereference(pid->tasks[type].first); 370 first = rcu_dereference_check(pid->tasks[type].first,
371 rcu_read_lock_held() ||
372 lockdep_tasklist_lock_is_held());
371 if (first) 373 if (first)
372 result = hlist_entry(first, struct task_struct, pids[(type)].node); 374 result = hlist_entry(first, struct task_struct, pids[(type)].node);
373 } 375 }
@@ -376,7 +378,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
376EXPORT_SYMBOL(pid_task); 378EXPORT_SYMBOL(pid_task);
377 379
378/* 380/*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 381 * Must be called under rcu_read_lock().
380 */ 382 */
381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 383struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382{ 384{
@@ -511,6 +513,13 @@ void __init pidhash_init(void)
511 513
512void __init pidmap_init(void) 514void __init pidmap_init(void)
513{ 515{
516 /* bump default and minimum pid_max based on number of cpus */
517 pid_max = min(pid_max_max, max_t(int, pid_max,
518 PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
519 pid_max_min = max_t(int, pid_max_min,
520 PIDS_PER_CPU_MIN * num_possible_cpus());
521 pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
522
514 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); 523 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
515 /* Reserve PID 0. We never call free_pidmap(0) */ 524 /* Reserve PID 0. We never call free_pidmap(0) */
516 set_bit(0, init_pid_ns.pidmap[0].page); 525 set_bit(0, init_pid_ns.pidmap[0].page);
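
The pidmap_init() hunk above scales the default and minimum pid_max with the number of possible CPUs. A back-of-envelope check is sketched below; the PIDS_PER_CPU_* and limit constants are assumptions matching the usual include/linux/threads.h values, and 301 is the customary pid_max_min default, so treat the output as illustrative only — the authoritative numbers come from the pr_info() line the patch adds.

#include <stdio.h>

#define PID_MAX_DEFAULT		0x8000		/* 32768 */
#define PID_MAX_LIMIT		(4 * 1024 * 1024)	/* 64-bit case, assumed */
#define PIDS_PER_CPU_DEFAULT	1024		/* assumed */
#define PIDS_PER_CPU_MIN	8		/* assumed */

static int max_int(int a, int b) { return a > b ? a : b; }
static int min_int(int a, int b) { return a < b ? a : b; }

int main(void)
{
	int cpus;

	for (cpus = 1; cpus <= 4096; cpus *= 8) {
		int pid_max = min_int(PID_MAX_LIMIT,
				      max_int(PID_MAX_DEFAULT,
					      PIDS_PER_CPU_DEFAULT * cpus));
		int pid_max_min = max_int(301, PIDS_PER_CPU_MIN * cpus);

		printf("%4d cpus -> pid_max %8d, minimum %5d\n",
		       cpus, pid_max, pid_max_min);
	}
	return 0;
}
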
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 86b3796b0436..a5aff94e1f0b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -13,6 +13,7 @@
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h>
16 17
17#define BITS_PER_PAGE (PAGE_SIZE*8) 18#define BITS_PER_PAGE (PAGE_SIZE*8)
18 19
@@ -161,13 +162,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
161 rcu_read_lock(); 162 rcu_read_lock();
162 163
163 /* 164 /*
164 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring 165 * Any nested-container's init processes won't ignore the
165 * any nested-container's init processes don't ignore the 166 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
166 * signal
167 */ 167 */
168 task = pid_task(find_vpid(nr), PIDTYPE_PID); 168 task = pid_task(find_vpid(nr), PIDTYPE_PID);
169 if (task) 169 if (task)
170 force_sig(SIGKILL, task); 170 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
171 171
172 rcu_read_unlock(); 172 rcu_read_unlock();
173 173
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 3db49b9ca374..f42d3f737a33 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -2,7 +2,7 @@
2 * This module exposes the interface to kernel space for specifying 2 * This module exposes the interface to kernel space for specifying
3 * QoS dependencies. It provides infrastructure for registration of: 3 * QoS dependencies. It provides infrastructure for registration of:
4 * 4 *
5 * Dependents on a QoS value : register requirements 5 * Dependents on a QoS value : register requests
6 * Watchers of QoS value : get notified when target QoS value changes 6 * Watchers of QoS value : get notified when target QoS value changes
7 * 7 *
8 * This QoS design is best effort based. Dependents register their QoS needs. 8 * This QoS design is best effort based. Dependents register their QoS needs.
@@ -14,19 +14,21 @@
14 * timeout: usec <-- currently not used. 14 * timeout: usec <-- currently not used.
15 * throughput: kbs (kilo byte / sec) 15 * throughput: kbs (kilo byte / sec)
16 * 16 *
17 * There are lists of pm_qos_objects each one wrapping requirements, notifiers 17 * There are lists of pm_qos_objects each one wrapping requests, notifiers
18 * 18 *
19 * User mode requirements on a QOS parameter register themselves to the 19 * User mode requests on a QOS parameter register themselves to the
20 * subsystem by opening the device node /dev/... and writing there request to 20 * subsystem by opening the device node /dev/... and writing there request to
21 * the node. As long as the process holds a file handle open to the node the 21 * the node. As long as the process holds a file handle open to the node the
22 * client continues to be accounted for. Upon file release the usermode 22 * client continues to be accounted for. Upon file release the usermode
23 * requirement is removed and a new qos target is computed. This way when the 23 * request is removed and a new qos target is computed. This way when the
24 * requirement that the application has is cleaned up when closes the file 24 * request that the application has is cleaned up when closes the file
25 * pointer or exits the pm_qos_object will get an opportunity to clean up. 25 * pointer or exits the pm_qos_object will get an opportunity to clean up.
26 * 26 *
27 * Mark Gross <mgross@linux.intel.com> 27 * Mark Gross <mgross@linux.intel.com>
28 */ 28 */
29 29
30/*#define DEBUG*/
31
30#include <linux/pm_qos_params.h> 32#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 33#include <linux/sched.h>
32#include <linux/spinlock.h> 34#include <linux/spinlock.h>
@@ -42,25 +44,25 @@
42#include <linux/uaccess.h> 44#include <linux/uaccess.h>
43 45
44/* 46/*
45 * locking rule: all changes to requirements or notifiers lists 47 * locking rule: all changes to requests or notifiers lists
46 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
47 * held, taken with _irqsave. One lock to rule them all 49 * held, taken with _irqsave. One lock to rule them all
48 */ 50 */
49struct requirement_list { 51struct pm_qos_request_list {
50 struct list_head list; 52 struct list_head list;
51 union { 53 union {
52 s32 value; 54 s32 value;
53 s32 usec; 55 s32 usec;
54 s32 kbps; 56 s32 kbps;
55 }; 57 };
56 char *name; 58 int pm_qos_class;
57}; 59};
58 60
59static s32 max_compare(s32 v1, s32 v2); 61static s32 max_compare(s32 v1, s32 v2);
60static s32 min_compare(s32 v1, s32 v2); 62static s32 min_compare(s32 v1, s32 v2);
61 63
62struct pm_qos_object { 64struct pm_qos_object {
63 struct requirement_list requirements; 65 struct pm_qos_request_list requests;
64 struct blocking_notifier_head *notifiers; 66 struct blocking_notifier_head *notifiers;
65 struct miscdevice pm_qos_power_miscdev; 67 struct miscdevice pm_qos_power_miscdev;
66 char *name; 68 char *name;
@@ -72,7 +74,7 @@ struct pm_qos_object {
72static struct pm_qos_object null_pm_qos; 74static struct pm_qos_object null_pm_qos;
73static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
74static struct pm_qos_object cpu_dma_pm_qos = { 76static struct pm_qos_object cpu_dma_pm_qos = {
75 .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)}, 77 .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)},
76 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
77 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
78 .default_value = 2000 * USEC_PER_SEC, 80 .default_value = 2000 * USEC_PER_SEC,
@@ -82,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
82 84
83static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
84static struct pm_qos_object network_lat_pm_qos = { 86static struct pm_qos_object network_lat_pm_qos = {
85 .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)}, 87 .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)},
86 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
87 .name = "network_latency", 89 .name = "network_latency",
88 .default_value = 2000 * USEC_PER_SEC, 90 .default_value = 2000 * USEC_PER_SEC,
@@ -93,8 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = {
93 95
94static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
95static struct pm_qos_object network_throughput_pm_qos = { 97static struct pm_qos_object network_throughput_pm_qos = {
96 .requirements = 98 .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)},
97 {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)},
98 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
99 .name = "network_throughput", 100 .name = "network_throughput",
100 .default_value = 0, 101 .default_value = 0,
@@ -135,31 +136,34 @@ static s32 min_compare(s32 v1, s32 v2)
135} 136}
136 137
137 138
138static void update_target(int target) 139static void update_target(int pm_qos_class)
139{ 140{
140 s32 extreme_value; 141 s32 extreme_value;
141 struct requirement_list *node; 142 struct pm_qos_request_list *node;
142 unsigned long flags; 143 unsigned long flags;
143 int call_notifier = 0; 144 int call_notifier = 0;
144 145
145 spin_lock_irqsave(&pm_qos_lock, flags); 146 spin_lock_irqsave(&pm_qos_lock, flags);
146 extreme_value = pm_qos_array[target]->default_value; 147 extreme_value = pm_qos_array[pm_qos_class]->default_value;
147 list_for_each_entry(node, 148 list_for_each_entry(node,
148 &pm_qos_array[target]->requirements.list, list) { 149 &pm_qos_array[pm_qos_class]->requests.list, list) {
149 extreme_value = pm_qos_array[target]->comparitor( 150 extreme_value = pm_qos_array[pm_qos_class]->comparitor(
150 extreme_value, node->value); 151 extreme_value, node->value);
151 } 152 }
152 if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { 153 if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) !=
154 extreme_value) {
153 call_notifier = 1; 155 call_notifier = 1;
154 atomic_set(&pm_qos_array[target]->target_value, extreme_value); 156 atomic_set(&pm_qos_array[pm_qos_class]->target_value,
155 pr_debug(KERN_ERR "new target for qos %d is %d\n", target, 157 extreme_value);
156 atomic_read(&pm_qos_array[target]->target_value)); 158 pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class,
159 atomic_read(&pm_qos_array[pm_qos_class]->target_value));
157 } 160 }
158 spin_unlock_irqrestore(&pm_qos_lock, flags); 161 spin_unlock_irqrestore(&pm_qos_lock, flags);
159 162
160 if (call_notifier) 163 if (call_notifier)
161 blocking_notifier_call_chain(pm_qos_array[target]->notifiers, 164 blocking_notifier_call_chain(
162 (unsigned long) extreme_value, NULL); 165 pm_qos_array[pm_qos_class]->notifiers,
166 (unsigned long) extreme_value, NULL);
163} 167}
164 168
165static int register_pm_qos_misc(struct pm_qos_object *qos) 169static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -185,125 +189,112 @@ static int find_pm_qos_object_by_minor(int minor)
185} 189}
186 190
187/** 191/**
188 * pm_qos_requirement - returns current system wide qos expectation 192 * pm_qos_request - returns current system wide qos expectation
189 * @pm_qos_class: identification of which qos value is requested 193 * @pm_qos_class: identification of which qos value is requested
190 * 194 *
191 * This function returns the current target value in an atomic manner. 195 * This function returns the current target value in an atomic manner.
192 */ 196 */
193int pm_qos_requirement(int pm_qos_class) 197int pm_qos_request(int pm_qos_class)
194{ 198{
195 return atomic_read(&pm_qos_array[pm_qos_class]->target_value); 199 return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
196} 200}
197EXPORT_SYMBOL_GPL(pm_qos_requirement); 201EXPORT_SYMBOL_GPL(pm_qos_request);
198 202
199/** 203/**
200 * pm_qos_add_requirement - inserts new qos request into the list 204 * pm_qos_add_request - inserts new qos request into the list
201 * @pm_qos_class: identifies which list of qos request to us 205 * @pm_qos_class: identifies which list of qos request to us
202 * @name: identifies the request
203 * @value: defines the qos request 206 * @value: defines the qos request
204 * 207 *
205 * This function inserts a new entry in the pm_qos_class list of requested qos 208 * This function inserts a new entry in the pm_qos_class list of requested qos
206 * performance characteristics. It recomputes the aggregate QoS expectations 209 * performance characteristics. It recomputes the aggregate QoS expectations
207 * for the pm_qos_class of parameters. 210 * for the pm_qos_class of parameters, and returns the pm_qos_request list
211 * element as a handle for use in updating and removal. Call needs to save
212 * this handle for later use.
208 */ 213 */
209int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) 214struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value)
210{ 215{
211 struct requirement_list *dep; 216 struct pm_qos_request_list *dep;
212 unsigned long flags; 217 unsigned long flags;
213 218
214 dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL); 219 dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL);
215 if (dep) { 220 if (dep) {
216 if (value == PM_QOS_DEFAULT_VALUE) 221 if (value == PM_QOS_DEFAULT_VALUE)
217 dep->value = pm_qos_array[pm_qos_class]->default_value; 222 dep->value = pm_qos_array[pm_qos_class]->default_value;
218 else 223 else
219 dep->value = value; 224 dep->value = value;
220 dep->name = kstrdup(name, GFP_KERNEL); 225 dep->pm_qos_class = pm_qos_class;
221 if (!dep->name)
222 goto cleanup;
223 226
224 spin_lock_irqsave(&pm_qos_lock, flags); 227 spin_lock_irqsave(&pm_qos_lock, flags);
225 list_add(&dep->list, 228 list_add(&dep->list,
226 &pm_qos_array[pm_qos_class]->requirements.list); 229 &pm_qos_array[pm_qos_class]->requests.list);
227 spin_unlock_irqrestore(&pm_qos_lock, flags); 230 spin_unlock_irqrestore(&pm_qos_lock, flags);
228 update_target(pm_qos_class); 231 update_target(pm_qos_class);
229
230 return 0;
231 } 232 }
232 233
233cleanup: 234 return dep;
234 kfree(dep);
235 return -ENOMEM;
236} 235}
237EXPORT_SYMBOL_GPL(pm_qos_add_requirement); 236EXPORT_SYMBOL_GPL(pm_qos_add_request);
238 237
239/** 238/**
240 * pm_qos_update_requirement - modifies an existing qos request 239 * pm_qos_update_request - modifies an existing qos request
241 * @pm_qos_class: identifies which list of qos request to us 240 * @pm_qos_req : handle to list element holding a pm_qos request to use
242 * @name: identifies the request
243 * @value: defines the qos request 241 * @value: defines the qos request
244 * 242 *
245 * Updates an existing qos requirement for the pm_qos_class of parameters along 243 * Updates an existing qos request for the pm_qos_class of parameters along
246 * with updating the target pm_qos_class value. 244 * with updating the target pm_qos_class value.
247 * 245 *
248 * If the named request isn't in the list then no change is made. 246 * Attempts are made to make this code callable on hot code paths.
249 */ 247 */
250int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) 248void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
249 s32 new_value)
251{ 250{
252 unsigned long flags; 251 unsigned long flags;
253 struct requirement_list *node;
254 int pending_update = 0; 252 int pending_update = 0;
253 s32 temp;
255 254
256 spin_lock_irqsave(&pm_qos_lock, flags); 255 if (pm_qos_req) { /*guard against callers passing in null */
257 list_for_each_entry(node, 256 spin_lock_irqsave(&pm_qos_lock, flags);
258 &pm_qos_array[pm_qos_class]->requirements.list, list) { 257 if (new_value == PM_QOS_DEFAULT_VALUE)
259 if (strcmp(node->name, name) == 0) { 258 temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value;
260 if (new_value == PM_QOS_DEFAULT_VALUE) 259 else
261 node->value = 260 temp = new_value;
262 pm_qos_array[pm_qos_class]->default_value; 261
263 else 262 if (temp != pm_qos_req->value) {
264 node->value = new_value;
265 pending_update = 1; 263 pending_update = 1;
266 break; 264 pm_qos_req->value = temp;
267 } 265 }
266 spin_unlock_irqrestore(&pm_qos_lock, flags);
267 if (pending_update)
268 update_target(pm_qos_req->pm_qos_class);
268 } 269 }
269 spin_unlock_irqrestore(&pm_qos_lock, flags);
270 if (pending_update)
271 update_target(pm_qos_class);
272
273 return 0;
274} 270}
275EXPORT_SYMBOL_GPL(pm_qos_update_requirement); 271EXPORT_SYMBOL_GPL(pm_qos_update_request);
276 272
277/** 273/**
278 * pm_qos_remove_requirement - modifies an existing qos request 274 * pm_qos_remove_request - modifies an existing qos request
279 * @pm_qos_class: identifies which list of qos request to us 275 * @pm_qos_req: handle to request list element
280 * @name: identifies the request
281 * 276 *
282 * Will remove named qos request from pm_qos_class list of parameters and 277 * Will remove pm qos request from the list of requests and
283 * recompute the current target value for the pm_qos_class. 278 * recompute the current target value for the pm_qos_class. Call this
279 * on slow code paths.
284 */ 280 */
285void pm_qos_remove_requirement(int pm_qos_class, char *name) 281void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
286{ 282{
287 unsigned long flags; 283 unsigned long flags;
288 struct requirement_list *node; 284 int qos_class;
289 int pending_update = 0;
290 285
286 if (pm_qos_req == NULL)
287 return;
288 /* silent return to keep pcm code cleaner */
289
290 qos_class = pm_qos_req->pm_qos_class;
291 spin_lock_irqsave(&pm_qos_lock, flags); 291 spin_lock_irqsave(&pm_qos_lock, flags);
292 list_for_each_entry(node, 292 list_del(&pm_qos_req->list);
293 &pm_qos_array[pm_qos_class]->requirements.list, list) { 293 kfree(pm_qos_req);
294 if (strcmp(node->name, name) == 0) {
295 kfree(node->name);
296 list_del(&node->list);
297 kfree(node);
298 pending_update = 1;
299 break;
300 }
301 }
302 spin_unlock_irqrestore(&pm_qos_lock, flags); 294 spin_unlock_irqrestore(&pm_qos_lock, flags);
303 if (pending_update) 295 update_target(qos_class);
304 update_target(pm_qos_class);
305} 296}
306EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); 297EXPORT_SYMBOL_GPL(pm_qos_remove_request);
307 298
308/** 299/**
309 * pm_qos_add_notifier - sets notification entry for changes to target value 300 * pm_qos_add_notifier - sets notification entry for changes to target value
@@ -313,7 +304,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
313 * will register the notifier into a notification chain that gets called 304 * will register the notifier into a notification chain that gets called
314 * upon changes to the pm_qos_class target value. 305 * upon changes to the pm_qos_class target value.
315 */ 306 */
316 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) 307int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
317{ 308{
318 int retval; 309 int retval;
319 310
@@ -343,21 +334,16 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
343} 334}
344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 335EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
345 336
346#define PID_NAME_LEN 32
347
348static int pm_qos_power_open(struct inode *inode, struct file *filp) 337static int pm_qos_power_open(struct inode *inode, struct file *filp)
349{ 338{
350 int ret;
351 long pm_qos_class; 339 long pm_qos_class;
352 char name[PID_NAME_LEN];
353 340
354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 341 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
355 if (pm_qos_class >= 0) { 342 if (pm_qos_class >= 0) {
356 filp->private_data = (void *)pm_qos_class; 343 filp->private_data = (void *) pm_qos_add_request(pm_qos_class,
357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 344 PM_QOS_DEFAULT_VALUE);
358 ret = pm_qos_add_requirement(pm_qos_class, name, 345
359 PM_QOS_DEFAULT_VALUE); 346 if (filp->private_data)
360 if (ret >= 0)
361 return 0; 347 return 0;
362 } 348 }
363 return -EPERM; 349 return -EPERM;
@@ -365,32 +351,40 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
365 351
366static int pm_qos_power_release(struct inode *inode, struct file *filp) 352static int pm_qos_power_release(struct inode *inode, struct file *filp)
367{ 353{
368 int pm_qos_class; 354 struct pm_qos_request_list *req;
369 char name[PID_NAME_LEN];
370 355
371 pm_qos_class = (long)filp->private_data; 356 req = (struct pm_qos_request_list *)filp->private_data;
372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 357 pm_qos_remove_request(req);
373 pm_qos_remove_requirement(pm_qos_class, name);
374 358
375 return 0; 359 return 0;
376} 360}
377 361
362
378static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 363static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
379 size_t count, loff_t *f_pos) 364 size_t count, loff_t *f_pos)
380{ 365{
381 s32 value; 366 s32 value;
382 int pm_qos_class; 367 int x;
383 char name[PID_NAME_LEN]; 368 char ascii_value[11];
384 369 struct pm_qos_request_list *pm_qos_req;
385 pm_qos_class = (long)filp->private_data; 370
386 if (count != sizeof(s32)) 371 if (count == sizeof(s32)) {
372 if (copy_from_user(&value, buf, sizeof(s32)))
373 return -EFAULT;
374 } else if (count == 11) { /* len('0x12345678/0') */
375 if (copy_from_user(ascii_value, buf, 11))
376 return -EFAULT;
377 x = sscanf(ascii_value, "%x", &value);
378 if (x != 1)
379 return -EINVAL;
380 pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value);
381 } else
387 return -EINVAL; 382 return -EINVAL;
388 if (copy_from_user(&value, buf, sizeof(s32)))
389 return -EFAULT;
390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
391 pm_qos_update_requirement(pm_qos_class, name, value);
392 383
393 return sizeof(s32); 384 pm_qos_req = (struct pm_qos_request_list *)filp->private_data;
385 pm_qos_update_request(pm_qos_req, value);
386
387 return count;
394} 388}
395 389
396 390
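
From userspace the misc-device interface keeps its classic shape: open the node, write the constraint, and hold the descriptor open for as long as the constraint should apply (pm_qos_power_open() now allocates the request, pm_qos_power_release() removes it). A minimal sketch using the unchanged binary s32 form of the write; the new 11-byte "0x…" ASCII form accepted above is an alternative.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int32_t latency_us = 50;	/* request a 50 usec wakeup-latency bound */
	int fd = open("/dev/cpu_dma_latency", O_WRONLY);

	if (fd < 0) {
		perror("open /dev/cpu_dma_latency");
		return 1;
	}
	if (write(fd, &latency_us, sizeof(latency_us)) != sizeof(latency_us)) {
		perror("write");
		return 1;
	}

	/* ... latency-sensitive work here; keep fd open the whole time ... */
	pause();	/* placeholder for the real workload */

	close(fd);	/* request is removed on release */
	return 0;
}
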
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 438ff4523513..9829646d399c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -11,19 +11,18 @@
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12 12
13/* 13/*
14 * Called after updating RLIMIT_CPU to set timer expiration if necessary. 14 * Called after updating RLIMIT_CPU to run cpu timer and update
15 * tsk->signal->cputime_expires expiration cache if necessary. Needs
16 * siglock protection since other code may update expiration cache as
17 * well.
15 */ 18 */
16void update_rlimit_cpu(unsigned long rlim_new) 19void update_rlimit_cpu(unsigned long rlim_new)
17{ 20{
18 cputime_t cputime = secs_to_cputime(rlim_new); 21 cputime_t cputime = secs_to_cputime(rlim_new);
19 struct signal_struct *const sig = current->signal;
20 22
21 if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || 23 spin_lock_irq(&current->sighand->siglock);
22 cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { 24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
23 spin_lock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&current->sighand->siglock);
24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
25 spin_unlock_irq(&current->sighand->siglock);
26 }
27} 26}
28 27
29static int check_clock(const clockid_t which_clock) 28static int check_clock(const clockid_t which_clock)
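
update_rlimit_cpu() now takes siglock and refreshes the expiration cache unconditionally; the usual way to reach it from userspace is simply setrlimit(RLIMIT_CPU) on a running process. A minimal self-contained trigger:

#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>

int main(void)
{
	/* Cap this process's CPU time: SIGXCPU at the 2 s soft limit,
	 * SIGKILL at the 4 s hard limit. */
	struct rlimit rl = { .rlim_cur = 2, .rlim_max = 4 };

	if (setrlimit(RLIMIT_CPU, &rl) < 0) {
		perror("setrlimit(RLIMIT_CPU)");
		return 1;
	}

	for (;;)
		;	/* burn CPU until the limit fires */

	return 0;	/* not reached */
}
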
@@ -364,7 +363,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
364 } 363 }
365 } else { 364 } else {
366 read_lock(&tasklist_lock); 365 read_lock(&tasklist_lock);
367 if (thread_group_leader(p) && p->signal) { 366 if (thread_group_leader(p) && p->sighand) {
368 error = 367 error =
369 cpu_clock_sample_group(which_clock, 368 cpu_clock_sample_group(which_clock,
370 p, &rtn); 369 p, &rtn);
@@ -440,7 +439,7 @@ int posix_cpu_timer_del(struct k_itimer *timer)
440 439
441 if (likely(p != NULL)) { 440 if (likely(p != NULL)) {
442 read_lock(&tasklist_lock); 441 read_lock(&tasklist_lock);
443 if (unlikely(p->signal == NULL)) { 442 if (unlikely(p->sighand == NULL)) {
444 /* 443 /*
445 * We raced with the reaping of the task. 444 * We raced with the reaping of the task.
446 * The deletion should have cleared us off the list. 445 * The deletion should have cleared us off the list.
@@ -548,111 +547,62 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp)
548 cputime_gt(expires, new_exp); 547 cputime_gt(expires, new_exp);
549} 548}
550 549
551static inline int expires_le(cputime_t expires, cputime_t new_exp)
552{
553 return !cputime_eq(expires, cputime_zero) &&
554 cputime_le(expires, new_exp);
555}
556/* 550/*
557 * Insert the timer on the appropriate list before any timers that 551 * Insert the timer on the appropriate list before any timers that
558 * expire later. This must be called with the tasklist_lock held 552 * expire later. This must be called with the tasklist_lock held
559 * for reading, and interrupts disabled. 553 * for reading, interrupts disabled and p->sighand->siglock taken.
560 */ 554 */
561static void arm_timer(struct k_itimer *timer, union cpu_time_count now) 555static void arm_timer(struct k_itimer *timer)
562{ 556{
563 struct task_struct *p = timer->it.cpu.task; 557 struct task_struct *p = timer->it.cpu.task;
564 struct list_head *head, *listpos; 558 struct list_head *head, *listpos;
559 struct task_cputime *cputime_expires;
565 struct cpu_timer_list *const nt = &timer->it.cpu; 560 struct cpu_timer_list *const nt = &timer->it.cpu;
566 struct cpu_timer_list *next; 561 struct cpu_timer_list *next;
567 unsigned long i;
568 562
569 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? 563 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
570 p->cpu_timers : p->signal->cpu_timers); 564 head = p->cpu_timers;
565 cputime_expires = &p->cputime_expires;
566 } else {
567 head = p->signal->cpu_timers;
568 cputime_expires = &p->signal->cputime_expires;
569 }
571 head += CPUCLOCK_WHICH(timer->it_clock); 570 head += CPUCLOCK_WHICH(timer->it_clock);
572 571
573 BUG_ON(!irqs_disabled());
574 spin_lock(&p->sighand->siglock);
575
576 listpos = head; 572 listpos = head;
577 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 573 list_for_each_entry(next, head, entry) {
578 list_for_each_entry(next, head, entry) { 574 if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
579 if (next->expires.sched > nt->expires.sched) 575 break;
580 break; 576 listpos = &next->entry;
581 listpos = &next->entry;
582 }
583 } else {
584 list_for_each_entry(next, head, entry) {
585 if (cputime_gt(next->expires.cpu, nt->expires.cpu))
586 break;
587 listpos = &next->entry;
588 }
589 } 577 }
590 list_add(&nt->entry, listpos); 578 list_add(&nt->entry, listpos);
591 579
592 if (listpos == head) { 580 if (listpos == head) {
581 union cpu_time_count *exp = &nt->expires;
582
593 /* 583 /*
594 * We are the new earliest-expiring timer. 584 * We are the new earliest-expiring POSIX 1.b timer, hence
595 * If we are a thread timer, there can always 585 * need to update expiration cache. Take into account that
596 * be a process timer telling us to stop earlier. 586 * for process timers we share expiration cache with itimers
587 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
597 */ 588 */
598 589
599 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 590 switch (CPUCLOCK_WHICH(timer->it_clock)) {
600 union cpu_time_count *exp = &nt->expires; 591 case CPUCLOCK_PROF:
601 592 if (expires_gt(cputime_expires->prof_exp, exp->cpu))
602 switch (CPUCLOCK_WHICH(timer->it_clock)) { 593 cputime_expires->prof_exp = exp->cpu;
603 default: 594 break;
604 BUG(); 595 case CPUCLOCK_VIRT:
605 case CPUCLOCK_PROF: 596 if (expires_gt(cputime_expires->virt_exp, exp->cpu))
606 if (expires_gt(p->cputime_expires.prof_exp, 597 cputime_expires->virt_exp = exp->cpu;
607 exp->cpu)) 598 break;
608 p->cputime_expires.prof_exp = exp->cpu; 599 case CPUCLOCK_SCHED:
609 break; 600 if (cputime_expires->sched_exp == 0 ||
610 case CPUCLOCK_VIRT: 601 cputime_expires->sched_exp > exp->sched)
611 if (expires_gt(p->cputime_expires.virt_exp, 602 cputime_expires->sched_exp = exp->sched;
612 exp->cpu)) 603 break;
613 p->cputime_expires.virt_exp = exp->cpu;
614 break;
615 case CPUCLOCK_SCHED:
616 if (p->cputime_expires.sched_exp == 0 ||
617 p->cputime_expires.sched_exp > exp->sched)
618 p->cputime_expires.sched_exp =
619 exp->sched;
620 break;
621 }
622 } else {
623 struct signal_struct *const sig = p->signal;
624 union cpu_time_count *exp = &timer->it.cpu.expires;
625
626 /*
627 * For a process timer, set the cached expiration time.
628 */
629 switch (CPUCLOCK_WHICH(timer->it_clock)) {
630 default:
631 BUG();
632 case CPUCLOCK_VIRT:
633 if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
634 exp->cpu))
635 break;
636 sig->cputime_expires.virt_exp = exp->cpu;
637 break;
638 case CPUCLOCK_PROF:
639 if (expires_le(sig->it[CPUCLOCK_PROF].expires,
640 exp->cpu))
641 break;
642 i = sig->rlim[RLIMIT_CPU].rlim_cur;
643 if (i != RLIM_INFINITY &&
644 i <= cputime_to_secs(exp->cpu))
645 break;
646 sig->cputime_expires.prof_exp = exp->cpu;
647 break;
648 case CPUCLOCK_SCHED:
649 sig->cputime_expires.sched_exp = exp->sched;
650 break;
651 }
652 } 604 }
653 } 605 }
654
655 spin_unlock(&p->sighand->siglock);
656} 606}
657 607
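The rewritten arm_timer() above does two things: it inserts the timer into a per-clock list kept sorted by expiry, and, when the new entry lands at the head, it refreshes the shared expiration cache (p->cputime_expires or p->signal->cputime_expires). A minimal sketch of that pattern, with simplified types and invented helper names rather than the kernel's:

    struct sample_timer {
            unsigned long long expires;
            struct sample_timer *next;
    };

    struct timer_base {
            struct sample_timer *head;          /* singly linked, sorted by expires */
            unsigned long long earliest_cache;  /* 0 == nothing armed */
    };

    /* Insert in expiry order; refresh the cached earliest expiry only when
     * the new timer becomes the list head (the "listpos == head" case). */
    static void sample_arm_timer(struct timer_base *base, struct sample_timer *nt)
    {
            struct sample_timer **pos = &base->head;

            while (*pos && (*pos)->expires <= nt->expires)
                    pos = &(*pos)->next;
            nt->next = *pos;
            *pos = nt;

            if (pos == &base->head &&
                (base->earliest_cache == 0 || base->earliest_cache > nt->expires))
                    base->earliest_cache = nt->expires;
    }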
658/* 608/*
@@ -660,7 +610,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
660 */ 610 */
661static void cpu_timer_fire(struct k_itimer *timer) 611static void cpu_timer_fire(struct k_itimer *timer)
662{ 612{
663 if (unlikely(timer->sigq == NULL)) { 613 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
614 /*
615 * User doesn't want any signal.
616 */
617 timer->it.cpu.expires.sched = 0;
618 } else if (unlikely(timer->sigq == NULL)) {
664 /* 619 /*
665 * This a special case for clock_nanosleep, 620 * This a special case for clock_nanosleep,
666 * not a normal timer from sys_timer_create. 621 * not a normal timer from sys_timer_create.
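The new SIGEV_NONE branch in cpu_timer_fire() simply disarms timers whose owner never wants a signal; such timers are only ever polled. A user-space illustration (Linux, link with -lrt; error handling trimmed):

    #include <signal.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct sigevent sev = { .sigev_notify = SIGEV_NONE };
            struct itimerspec its = { .it_value.tv_sec = 1 };
            struct itimerspec left;
            timer_t t;

            /* CPU-time timer that never delivers a signal; we just poll it. */
            timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &t);
            timer_settime(t, 0, &its, NULL);

            do {
                    timer_gettime(t, &left);        /* spinning burns CPU time */
            } while (left.it_value.tv_sec || left.it_value.tv_nsec);

            printf("about one second of CPU time consumed\n");
            timer_delete(t);
            return 0;
    }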
@@ -721,7 +676,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
721 struct itimerspec *new, struct itimerspec *old) 676 struct itimerspec *new, struct itimerspec *old)
722{ 677{
723 struct task_struct *p = timer->it.cpu.task; 678 struct task_struct *p = timer->it.cpu.task;
724 union cpu_time_count old_expires, new_expires, val; 679 union cpu_time_count old_expires, new_expires, old_incr, val;
725 int ret; 680 int ret;
726 681
727 if (unlikely(p == NULL)) { 682 if (unlikely(p == NULL)) {
@@ -736,10 +691,10 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
736 read_lock(&tasklist_lock); 691 read_lock(&tasklist_lock);
737 /* 692 /*
738 * We need the tasklist_lock to protect against reaping that 693 * We need the tasklist_lock to protect against reaping that
739 * clears p->signal. If p has just been reaped, we can no 694 * clears p->sighand. If p has just been reaped, we can no
740 * longer get any information about it at all. 695 * longer get any information about it at all.
741 */ 696 */
742 if (unlikely(p->signal == NULL)) { 697 if (unlikely(p->sighand == NULL)) {
743 read_unlock(&tasklist_lock); 698 read_unlock(&tasklist_lock);
744 put_task_struct(p); 699 put_task_struct(p);
745 timer->it.cpu.task = NULL; 700 timer->it.cpu.task = NULL;
@@ -752,6 +707,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
752 BUG_ON(!irqs_disabled()); 707 BUG_ON(!irqs_disabled());
753 708
754 ret = 0; 709 ret = 0;
710 old_incr = timer->it.cpu.incr;
755 spin_lock(&p->sighand->siglock); 711 spin_lock(&p->sighand->siglock);
756 old_expires = timer->it.cpu.expires; 712 old_expires = timer->it.cpu.expires;
757 if (unlikely(timer->it.cpu.firing)) { 713 if (unlikely(timer->it.cpu.firing)) {
@@ -759,7 +715,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
759 ret = TIMER_RETRY; 715 ret = TIMER_RETRY;
760 } else 716 } else
761 list_del_init(&timer->it.cpu.entry); 717 list_del_init(&timer->it.cpu.entry);
762 spin_unlock(&p->sighand->siglock);
763 718
764 /* 719 /*
765 * We need to sample the current value to convert the new 720 * We need to sample the current value to convert the new
@@ -813,6 +768,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
813 * disable this firing since we are already reporting 768 * disable this firing since we are already reporting
814 * it as an overrun (thanks to bump_cpu_timer above). 769 * it as an overrun (thanks to bump_cpu_timer above).
815 */ 770 */
771 spin_unlock(&p->sighand->siglock);
816 read_unlock(&tasklist_lock); 772 read_unlock(&tasklist_lock);
817 goto out; 773 goto out;
818 } 774 }
@@ -828,11 +784,11 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
828 */ 784 */
829 timer->it.cpu.expires = new_expires; 785 timer->it.cpu.expires = new_expires;
830 if (new_expires.sched != 0 && 786 if (new_expires.sched != 0 &&
831 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
832 cpu_time_before(timer->it_clock, val, new_expires)) { 787 cpu_time_before(timer->it_clock, val, new_expires)) {
833 arm_timer(timer, val); 788 arm_timer(timer);
834 } 789 }
835 790
791 spin_unlock(&p->sighand->siglock);
836 read_unlock(&tasklist_lock); 792 read_unlock(&tasklist_lock);
837 793
838 /* 794 /*
@@ -853,7 +809,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
853 timer->it_overrun = -1; 809 timer->it_overrun = -1;
854 810
855 if (new_expires.sched != 0 && 811 if (new_expires.sched != 0 &&
856 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
857 !cpu_time_before(timer->it_clock, val, new_expires)) { 812 !cpu_time_before(timer->it_clock, val, new_expires)) {
858 /* 813 /*
859 * The designated time already passed, so we notify 814 * The designated time already passed, so we notify
@@ -867,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
867 out: 822 out:
868 if (old) { 823 if (old) {
869 sample_to_timespec(timer->it_clock, 824 sample_to_timespec(timer->it_clock,
870 timer->it.cpu.incr, &old->it_interval); 825 old_incr, &old->it_interval);
871 } 826 }
872 return ret; 827 return ret;
873} 828}
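Sampling old_incr before the timer is touched is what keeps the interval reported back through the old argument of timer_settime() consistent even when the timer is concurrently firing. The user-visible contract, for reference (Linux, -lrt):

    #include <signal.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct sigevent sev = { .sigev_notify = SIGEV_NONE };
            struct itimerspec first = { .it_value.tv_sec = 5 };
            struct itimerspec next  = { .it_value.tv_sec = 2, .it_interval.tv_sec = 1 };
            struct itimerspec prev;
            timer_t t;

            timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &t);
            timer_settime(t, 0, &first, NULL);

            /* Re-arm: 'prev' receives the previous interval and the time left. */
            timer_settime(t, 0, &next, &prev);
            printf("previous interval %llds, time left %llds\n",
                   (long long)prev.it_interval.tv_sec, (long long)prev.it_value.tv_sec);

            timer_delete(t);
            return 0;
    }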
@@ -908,7 +863,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
908 clear_dead = p->exit_state; 863 clear_dead = p->exit_state;
909 } else { 864 } else {
910 read_lock(&tasklist_lock); 865 read_lock(&tasklist_lock);
911 if (unlikely(p->signal == NULL)) { 866 if (unlikely(p->sighand == NULL)) {
912 /* 867 /*
913 * The process has been reaped. 868 * The process has been reaped.
914 * We can't even collect a sample any more. 869 * We can't even collect a sample any more.
@@ -927,25 +882,6 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
927 read_unlock(&tasklist_lock); 882 read_unlock(&tasklist_lock);
928 } 883 }
929 884
930 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
931 if (timer->it.cpu.incr.sched == 0 &&
932 cpu_time_before(timer->it_clock,
933 timer->it.cpu.expires, now)) {
934 /*
935 * Do-nothing timer expired and has no reload,
936 * so it's as if it was never set.
937 */
938 timer->it.cpu.expires.sched = 0;
939 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
940 return;
941 }
942 /*
943 * Account for any expirations and reloads that should
944 * have happened.
945 */
946 bump_cpu_timer(timer, now);
947 }
948
949 if (unlikely(clear_dead)) { 885 if (unlikely(clear_dead)) {
950 /* 886 /*
951 * We've noticed that the thread is dead, but 887 * We've noticed that the thread is dead, but
@@ -982,6 +918,7 @@ static void check_thread_timers(struct task_struct *tsk,
982 int maxfire; 918 int maxfire;
983 struct list_head *timers = tsk->cpu_timers; 919 struct list_head *timers = tsk->cpu_timers;
984 struct signal_struct *const sig = tsk->signal; 920 struct signal_struct *const sig = tsk->signal;
921 unsigned long soft;
985 922
986 maxfire = 20; 923 maxfire = 20;
987 tsk->cputime_expires.prof_exp = cputime_zero; 924 tsk->cputime_expires.prof_exp = cputime_zero;
@@ -1030,9 +967,10 @@ static void check_thread_timers(struct task_struct *tsk,
1030 /* 967 /*
1031 * Check for the special case thread timers. 968 * Check for the special case thread timers.
1032 */ 969 */
1033 if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { 970 soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
1034 unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; 971 if (soft != RLIM_INFINITY) {
1035 unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; 972 unsigned long hard =
973 ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
1036 974
1037 if (hard != RLIM_INFINITY && 975 if (hard != RLIM_INFINITY &&
1038 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { 976 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -1043,14 +981,13 @@ static void check_thread_timers(struct task_struct *tsk,
1043 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 981 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1044 return; 982 return;
1045 } 983 }
1046 if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { 984 if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
1047 /* 985 /*
1048 * At the soft limit, send a SIGXCPU every second. 986 * At the soft limit, send a SIGXCPU every second.
1049 */ 987 */
1050 if (sig->rlim[RLIMIT_RTTIME].rlim_cur 988 if (soft < hard) {
1051 < sig->rlim[RLIMIT_RTTIME].rlim_max) { 989 soft += USEC_PER_SEC;
1052 sig->rlim[RLIMIT_RTTIME].rlim_cur += 990 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
1053 USEC_PER_SEC;
1054 } 991 }
1055 printk(KERN_INFO 992 printk(KERN_INFO
1056 "RT Watchdog Timeout: %s[%d]\n", 993 "RT Watchdog Timeout: %s[%d]\n",
@@ -1060,14 +997,11 @@ static void check_thread_timers(struct task_struct *tsk,
1060 } 997 }
1061} 998}
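The soft and hard RLIMIT_RTTIME values are now read once with ACCESS_ONCE() into locals before being compared, because another thread can change the limits concurrently. The limit itself, microseconds of CPU time a SCHED_FIFO/SCHED_RR task may run without sleeping, is configured from user space; a sketch, assuming a libc that exposes RLIMIT_RTTIME:

    #include <stdio.h>
    #include <sys/resource.h>

    int main(void)
    {
            /* SIGXCPU at 0.5s of uninterrupted RT CPU time, SIGKILL at 1s. */
            struct rlimit rl = { .rlim_cur = 500000, .rlim_max = 1000000 };

            if (setrlimit(RLIMIT_RTTIME, &rl) != 0)
                    perror("setrlimit(RLIMIT_RTTIME)");
            return 0;
    }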
1062 999
1063static void stop_process_timers(struct task_struct *tsk) 1000static void stop_process_timers(struct signal_struct *sig)
1064{ 1001{
1065 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 1002 struct thread_group_cputimer *cputimer = &sig->cputimer;
1066 unsigned long flags; 1003 unsigned long flags;
1067 1004
1068 if (!cputimer->running)
1069 return;
1070
1071 spin_lock_irqsave(&cputimer->lock, flags); 1005 spin_lock_irqsave(&cputimer->lock, flags);
1072 cputimer->running = 0; 1006 cputimer->running = 0;
1073 spin_unlock_irqrestore(&cputimer->lock, flags); 1007 spin_unlock_irqrestore(&cputimer->lock, flags);
@@ -1107,6 +1041,23 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1107 } 1041 }
1108} 1042}
1109 1043
1044/**
1045 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1046 *
1047 * @cputime: The struct to compare.
1048 *
1049 * Checks @cputime to see if all fields are zero. Returns true if all fields
1050 * are zero, false if any field is nonzero.
1051 */
1052static inline int task_cputime_zero(const struct task_cputime *cputime)
1053{
1054 if (cputime_eq(cputime->utime, cputime_zero) &&
1055 cputime_eq(cputime->stime, cputime_zero) &&
1056 cputime->sum_exec_runtime == 0)
1057 return 1;
1058 return 0;
1059}
1060
1110/* 1061/*
1111 * Check for any per-thread CPU timers that have fired and move them 1062 * Check for any per-thread CPU timers that have fired and move them
1112 * off the tsk->*_timers list onto the firing list. Per-thread timers 1063 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1121,19 +1072,7 @@ static void check_process_timers(struct task_struct *tsk,
1121 unsigned long long sum_sched_runtime, sched_expires; 1072 unsigned long long sum_sched_runtime, sched_expires;
1122 struct list_head *timers = sig->cpu_timers; 1073 struct list_head *timers = sig->cpu_timers;
1123 struct task_cputime cputime; 1074 struct task_cputime cputime;
1124 1075 unsigned long soft;
1125 /*
1126 * Don't sample the current process CPU clocks if there are no timers.
1127 */
1128 if (list_empty(&timers[CPUCLOCK_PROF]) &&
1129 cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
1130 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
1131 list_empty(&timers[CPUCLOCK_VIRT]) &&
1132 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1133 list_empty(&timers[CPUCLOCK_SCHED])) {
1134 stop_process_timers(tsk);
1135 return;
1136 }
1137 1076
1138 /* 1077 /*
1139 * Collect the current process totals. 1078 * Collect the current process totals.
@@ -1193,11 +1132,13 @@ static void check_process_timers(struct task_struct *tsk,
1193 SIGPROF); 1132 SIGPROF);
1194 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, 1133 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1195 SIGVTALRM); 1134 SIGVTALRM);
1196 1135 soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1197 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 1136 if (soft != RLIM_INFINITY) {
1198 unsigned long psecs = cputime_to_secs(ptime); 1137 unsigned long psecs = cputime_to_secs(ptime);
1138 unsigned long hard =
1139 ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
1199 cputime_t x; 1140 cputime_t x;
1200 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { 1141 if (psecs >= hard) {
1201 /* 1142 /*
1202 * At the hard limit, we just die. 1143 * At the hard limit, we just die.
1203 * No need to calculate anything else now. 1144 * No need to calculate anything else now.
@@ -1205,35 +1146,28 @@ static void check_process_timers(struct task_struct *tsk,
1205 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1146 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1206 return; 1147 return;
1207 } 1148 }
1208 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { 1149 if (psecs >= soft) {
1209 /* 1150 /*
1210 * At the soft limit, send a SIGXCPU every second. 1151 * At the soft limit, send a SIGXCPU every second.
1211 */ 1152 */
1212 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 1153 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1213 if (sig->rlim[RLIMIT_CPU].rlim_cur 1154 if (soft < hard) {
1214 < sig->rlim[RLIMIT_CPU].rlim_max) { 1155 soft++;
1215 sig->rlim[RLIMIT_CPU].rlim_cur++; 1156 sig->rlim[RLIMIT_CPU].rlim_cur = soft;
1216 } 1157 }
1217 } 1158 }
1218 x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 1159 x = secs_to_cputime(soft);
1219 if (cputime_eq(prof_expires, cputime_zero) || 1160 if (cputime_eq(prof_expires, cputime_zero) ||
1220 cputime_lt(x, prof_expires)) { 1161 cputime_lt(x, prof_expires)) {
1221 prof_expires = x; 1162 prof_expires = x;
1222 } 1163 }
1223 } 1164 }
1224 1165
1225 if (!cputime_eq(prof_expires, cputime_zero) && 1166 sig->cputime_expires.prof_exp = prof_expires;
1226 (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || 1167 sig->cputime_expires.virt_exp = virt_expires;
1227 cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) 1168 sig->cputime_expires.sched_exp = sched_expires;
1228 sig->cputime_expires.prof_exp = prof_expires; 1169 if (task_cputime_zero(&sig->cputime_expires))
1229 if (!cputime_eq(virt_expires, cputime_zero) && 1170 stop_process_timers(sig);
1230 (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
1231 cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
1232 sig->cputime_expires.virt_exp = virt_expires;
1233 if (sched_expires != 0 &&
1234 (sig->cputime_expires.sched_exp == 0 ||
1235 sig->cputime_expires.sched_exp > sched_expires))
1236 sig->cputime_expires.sched_exp = sched_expires;
1237} 1171}
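check_process_timers() folds RLIMIT_CPU into the same expiration machinery: once the soft limit is crossed the process gets SIGXCPU every second (and the soft limit is nudged forward), and at the hard limit it is killed. Observed from user space:

    #include <signal.h>
    #include <sys/resource.h>
    #include <unistd.h>

    static void on_xcpu(int sig)
    {
            (void)sig;
            write(2, "SIGXCPU: soft CPU limit hit\n", 28);
    }

    int main(void)
    {
            struct rlimit rl = { .rlim_cur = 1, .rlim_max = 3 };    /* seconds */

            signal(SIGXCPU, on_xcpu);
            setrlimit(RLIMIT_CPU, &rl);

            for (;;)
                    ;       /* burn CPU: SIGXCPU after ~1s, SIGKILL at ~3s */
    }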
1238 1172
1239/* 1173/*
@@ -1262,9 +1196,10 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1262 goto out; 1196 goto out;
1263 } 1197 }
1264 read_lock(&tasklist_lock); /* arm_timer needs it. */ 1198 read_lock(&tasklist_lock); /* arm_timer needs it. */
1199 spin_lock(&p->sighand->siglock);
1265 } else { 1200 } else {
1266 read_lock(&tasklist_lock); 1201 read_lock(&tasklist_lock);
1267 if (unlikely(p->signal == NULL)) { 1202 if (unlikely(p->sighand == NULL)) {
1268 /* 1203 /*
1269 * The process has been reaped. 1204 * The process has been reaped.
1270 * We can't even collect a sample any more. 1205 * We can't even collect a sample any more.
@@ -1282,6 +1217,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1282 clear_dead_task(timer, now); 1217 clear_dead_task(timer, now);
1283 goto out_unlock; 1218 goto out_unlock;
1284 } 1219 }
1220 spin_lock(&p->sighand->siglock);
1285 cpu_timer_sample_group(timer->it_clock, p, &now); 1221 cpu_timer_sample_group(timer->it_clock, p, &now);
1286 bump_cpu_timer(timer, now); 1222 bump_cpu_timer(timer, now);
1287 /* Leave the tasklist_lock locked for the call below. */ 1223 /* Leave the tasklist_lock locked for the call below. */
@@ -1290,7 +1226,9 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1290 /* 1226 /*
1291 * Now re-arm for the new expiry time. 1227 * Now re-arm for the new expiry time.
1292 */ 1228 */
1293 arm_timer(timer, now); 1229 BUG_ON(!irqs_disabled());
1230 arm_timer(timer);
1231 spin_unlock(&p->sighand->siglock);
1294 1232
1295out_unlock: 1233out_unlock:
1296 read_unlock(&tasklist_lock); 1234 read_unlock(&tasklist_lock);
@@ -1302,23 +1240,6 @@ out:
1302} 1240}
1303 1241
1304/** 1242/**
1305 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1306 *
1307 * @cputime: The struct to compare.
1308 *
1309 * Checks @cputime to see if all fields are zero. Returns true if all fields
1310 * are zero, false if any field is nonzero.
1311 */
1312static inline int task_cputime_zero(const struct task_cputime *cputime)
1313{
1314 if (cputime_eq(cputime->utime, cputime_zero) &&
1315 cputime_eq(cputime->stime, cputime_zero) &&
1316 cputime->sum_exec_runtime == 0)
1317 return 1;
1318 return 0;
1319}
1320
1321/**
1322 * task_cputime_expired - Compare two task_cputime entities. 1243 * task_cputime_expired - Compare two task_cputime entities.
1323 * 1244 *
1324 * @sample: The task_cputime structure to be checked for expiration. 1245 * @sample: The task_cputime structure to be checked for expiration.
@@ -1374,7 +1295,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1374 } 1295 }
1375 1296
1376 sig = tsk->signal; 1297 sig = tsk->signal;
1377 if (!task_cputime_zero(&sig->cputime_expires)) { 1298 if (sig->cputimer.running) {
1378 struct task_cputime group_sample; 1299 struct task_cputime group_sample;
1379 1300
1380 thread_group_cputimer(tsk, &group_sample); 1301 thread_group_cputimer(tsk, &group_sample);
@@ -1382,7 +1303,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1382 return 1; 1303 return 1;
1383 } 1304 }
1384 1305
1385 return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY; 1306 return 0;
1386} 1307}
1387 1308
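fastpath_timer_check() now gates the expensive group sample on sig->cputimer.running instead of re-deriving whether any process-wide timer is armed. The general shape of that optimisation, sketched with pthreads and illustrative names (not the kernel's types):

    #include <pthread.h>
    #include <stdbool.h>

    struct group_timer {
            pthread_mutex_t lock;
            bool running;                  /* checked without the lock, like cputimer.running */
            unsigned long long expires;    /* earliest group expiry, 0 = none */
    };

    /* Called on every tick: take the lock only when something is armed. */
    static bool tick_check(struct group_timer *gt, unsigned long long now)
    {
            bool fired = false;

            if (!gt->running)              /* fast path: nothing to do */
                    return false;

            pthread_mutex_lock(&gt->lock);
            if (gt->expires && now >= gt->expires) {
                    fired = true;
                    gt->expires = 0;
                    gt->running = false;   /* last timer gone: stop sampling */
            }
            pthread_mutex_unlock(&gt->lock);
            return fired;
    }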
1388/* 1309/*
@@ -1411,7 +1332,12 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1411 * put them on the firing list. 1332 * put them on the firing list.
1412 */ 1333 */
1413 check_thread_timers(tsk, &firing); 1334 check_thread_timers(tsk, &firing);
1414 check_process_timers(tsk, &firing); 1335 /*
1336 * If there are any active process wide timers (POSIX 1.b, itimers,
1337 * RLIMIT_CPU) cputimer must be running.
1338 */
1339 if (tsk->signal->cputimer.running)
1340 check_process_timers(tsk, &firing);
1415 1341
1416 /* 1342 /*
1417 * We must release these locks before taking any timer's lock. 1343 * We must release these locks before taking any timer's lock.
@@ -1448,21 +1374,23 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1448} 1374}
1449 1375
1450/* 1376/*
1451 * Set one of the process-wide special case CPU timers. 1377 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
1452 * The tsk->sighand->siglock must be held by the caller. 1378 * The tsk->sighand->siglock must be held by the caller.
1453 * The *newval argument is relative and we update it to be absolute, *oldval
1454 * is absolute and we update it to be relative.
1455 */ 1379 */
1456void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1380void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1457 cputime_t *newval, cputime_t *oldval) 1381 cputime_t *newval, cputime_t *oldval)
1458{ 1382{
1459 union cpu_time_count now; 1383 union cpu_time_count now;
1460 struct list_head *head;
1461 1384
1462 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1385 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1463 cpu_timer_sample_group(clock_idx, tsk, &now); 1386 cpu_timer_sample_group(clock_idx, tsk, &now);
1464 1387
1465 if (oldval) { 1388 if (oldval) {
1389 /*
1390 * We are setting itimer. The *oldval is absolute and we update
1391 * it to be relative, *newval argument is relative and we update
1392 * it to be absolute.
1393 */
1466 if (!cputime_eq(*oldval, cputime_zero)) { 1394 if (!cputime_eq(*oldval, cputime_zero)) {
1467 if (cputime_le(*oldval, now.cpu)) { 1395 if (cputime_le(*oldval, now.cpu)) {
1468 /* Just about to fire. */ 1396 /* Just about to fire. */
@@ -1475,33 +1403,21 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1475 if (cputime_eq(*newval, cputime_zero)) 1403 if (cputime_eq(*newval, cputime_zero))
1476 return; 1404 return;
1477 *newval = cputime_add(*newval, now.cpu); 1405 *newval = cputime_add(*newval, now.cpu);
1478
1479 /*
1480 * If the RLIMIT_CPU timer will expire before the
1481 * ITIMER_PROF timer, we have nothing else to do.
1482 */
1483 if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur
1484 < cputime_to_secs(*newval))
1485 return;
1486 } 1406 }
1487 1407
1488 /* 1408 /*
1489 * Check whether there are any process timers already set to fire 1409 * Update expiration cache if we are the earliest timer, or if the
1490 * before this one. If so, we don't have anything more to do. 1410 * RLIMIT_CPU limit expires earlier than the prof_exp cpu timer.
1491 */ 1411 */
1492 head = &tsk->signal->cpu_timers[clock_idx]; 1412 switch (clock_idx) {
1493 if (list_empty(head) || 1413 case CPUCLOCK_PROF:
1494 cputime_ge(list_first_entry(head, 1414 if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
1495 struct cpu_timer_list, entry)->expires.cpu,
1496 *newval)) {
1497 switch (clock_idx) {
1498 case CPUCLOCK_PROF:
1499 tsk->signal->cputime_expires.prof_exp = *newval; 1415 tsk->signal->cputime_expires.prof_exp = *newval;
1500 break; 1416 break;
1501 case CPUCLOCK_VIRT: 1417 case CPUCLOCK_VIRT:
1418 if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
1502 tsk->signal->cputime_expires.virt_exp = *newval; 1419 tsk->signal->cputime_expires.virt_exp = *newval;
1503 break; 1420 break;
1504 }
1505 } 1421 }
1506} 1422}
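set_process_cpu_timer() is the backend of setitimer(ITIMER_PROF/ITIMER_VIRTUAL): the relative value passed in is made absolute against the sampled group clock and merged into the shared expiration cache. The corresponding user-space call:

    #include <signal.h>
    #include <stdio.h>
    #include <sys/time.h>

    static volatile sig_atomic_t ticks;

    static void on_prof(int sig)
    {
            (void)sig;
            ticks++;
    }

    int main(void)
    {
            /* SIGPROF every 100ms of combined user+system CPU time. */
            struct itimerval iv = {
                    .it_value    = { 0, 100000 },
                    .it_interval = { 0, 100000 },
            };

            signal(SIGPROF, on_prof);
            setitimer(ITIMER_PROF, &iv, NULL);

            while (ticks < 10)
                    ;       /* spin so CPU time actually accumulates */
            printf("received %d SIGPROF ticks\n", (int)ticks);
            return 0;
    }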
1507 1423
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 495440779ce3..ad723420acc3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -256,7 +256,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
256 return 0; 256 return 0;
257} 257}
258 258
259int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) 259static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
260{ 260{
261 *tp = ktime_to_timespec(KTIME_LOW_RES); 261 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0; 262 return 0;
@@ -559,14 +559,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
559 new_timer->it_id = (timer_t) new_timer_id; 559 new_timer->it_id = (timer_t) new_timer_id;
560 new_timer->it_clock = which_clock; 560 new_timer->it_clock = which_clock;
561 new_timer->it_overrun = -1; 561 new_timer->it_overrun = -1;
562 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
563 if (error)
564 goto out;
565 562
566 /*
567 * return the timer_id now. The next step is hard to
568 * back out if there is an error.
569 */
570 if (copy_to_user(created_timer_id, 563 if (copy_to_user(created_timer_id,
571 &new_timer_id, sizeof (new_timer_id))) { 564 &new_timer_id, sizeof (new_timer_id))) {
572 error = -EFAULT; 565 error = -EFAULT;
@@ -597,6 +590,10 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
597 new_timer->sigq->info.si_tid = new_timer->it_id; 590 new_timer->sigq->info.si_tid = new_timer->it_id;
598 new_timer->sigq->info.si_code = SI_TIMER; 591 new_timer->sigq->info.si_code = SI_TIMER;
599 592
593 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
594 if (error)
595 goto out;
596
600 spin_lock_irq(&current->sighand->siglock); 597 spin_lock_irq(&current->sighand->siglock);
601 new_timer->it_signal = current->signal; 598 new_timer->it_signal = current->signal;
602 list_add(&new_timer->list, &current->signal->posix_timers); 599 list_add(&new_timer->list, &current->signal->posix_timers);
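Moving the CLOCK_DISPATCH(timer_create) call after the sigqueue setup means the clock-specific hook only ever sees a fully initialised timer whose queued siginfo already carries si_tid and si_code = SI_TIMER. Those are the fields a SA_SIGINFO handler receives; a user-space illustration (Linux, -lrt):

    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <time.h>
    #include <unistd.h>

    static volatile sig_atomic_t deliveries;

    static void handler(int sig, siginfo_t *si, void *ctx)
    {
            (void)sig; (void)ctx;
            if (si->si_code == SI_TIMER)    /* queued by the posix timer code */
                    deliveries++;
    }

    int main(void)
    {
            struct sigaction sa;
            struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGRTMIN };
            struct itimerspec its = { .it_value.tv_nsec    = 100000000,
                                      .it_interval.tv_nsec = 100000000 };
            timer_t t;

            memset(&sa, 0, sizeof(sa));
            sa.sa_sigaction = handler;
            sa.sa_flags = SA_SIGINFO;
            sigemptyset(&sa.sa_mask);
            sigaction(SIGRTMIN, &sa, NULL);

            timer_create(CLOCK_MONOTONIC, &sev, &t);
            timer_settime(t, 0, &its, NULL);

            while (deliveries < 10)
                    pause();                /* woken by each SIGRTMIN delivery */
            printf("overrun count at last delivery: %d\n", timer_getoverrun(t));
            timer_delete(t);
            return 0;
    }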
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 91e09d3b2eb2..ca6066a6952e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,15 @@ config PM_DEBUG
27 code. This is helpful when debugging and reporting PM bugs, like 27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support. 28 suspend support.
29 29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
30config PM_VERBOSE 39config PM_VERBOSE
31 bool "Verbose Power Management debugging" 40 bool "Verbose Power Management debugging"
32 depends on PM_DEBUG 41 depends on PM_DEBUG
@@ -85,9 +94,18 @@ config PM_SLEEP
85 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE 94 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
86 default y 95 default y
87 96
97config PM_SLEEP_ADVANCED_DEBUG
98 bool
99 depends on PM_ADVANCED_DEBUG
100 default n
101
102config SUSPEND_NVS
103 bool
104
88config SUSPEND 105config SUSPEND
89 bool "Suspend to RAM and standby" 106 bool "Suspend to RAM and standby"
90 depends on PM && ARCH_SUSPEND_POSSIBLE 107 depends on PM && ARCH_SUSPEND_POSSIBLE
108 select SUSPEND_NVS if HAS_IOMEM
91 default y 109 default y
92 ---help--- 110 ---help---
93 Allow the system to enter sleep states in which main memory is 111 Allow the system to enter sleep states in which main memory is
@@ -116,13 +134,10 @@ config SUSPEND_FREEZER
116 134
117 Turning OFF this setting is NOT recommended! If in doubt, say Y. 135 Turning OFF this setting is NOT recommended! If in doubt, say Y.
118 136
119config HIBERNATION_NVS
120 bool
121
122config HIBERNATION 137config HIBERNATION
123 bool "Hibernation (aka 'suspend to disk')" 138 bool "Hibernation (aka 'suspend to disk')"
124 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 139 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
125 select HIBERNATION_NVS if HAS_IOMEM 140 select SUSPEND_NVS if HAS_IOMEM
126 ---help--- 141 ---help---
127 Enable the suspend to disk (STD) functionality, which is usually 142 Enable the suspend to disk (STD) functionality, which is usually
128 called "hibernation" in user interfaces. STD checkpoints the 143 called "hibernation" in user interfaces. STD checkpoints the
@@ -222,3 +237,8 @@ config PM_RUNTIME
222 and the bus type drivers of the buses the devices are on are 237 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and 238 responsible for the actual handling of the autosuspend requests and
224 wake-up events. 239 wake-up events.
240
241config PM_OPS
242 bool
243 depends on PM_SLEEP || PM_RUNTIME
244 default y
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 43191815f874..f9063c6b185d 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,7 +8,8 @@ obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_SUSPEND) += suspend.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o 11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o 12 block_io.o
13obj-$(CONFIG_SUSPEND_NVS) += nvs.o
13 14
14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
new file mode 100644
index 000000000000..97024fd40cd5
--- /dev/null
+++ b/kernel/power/block_io.c
@@ -0,0 +1,103 @@
1/*
2 * This file provides functions for block I/O operations on swap/file.
3 *
4 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
5 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#include <linux/bio.h>
11#include <linux/kernel.h>
12#include <linux/pagemap.h>
13#include <linux/swap.h>
14
15#include "power.h"
16
17/**
18 * submit - submit BIO request.
19 * @rw: READ or WRITE.
20 * @sector: physical sector of the page.
21 * @page: page we're reading or writing.
22 * @bio_chain: list of pending bios (for async reading)
23 *
24 * Straight from the textbook - allocate and initialize the bio.
25 * If we're reading, make sure the page is marked as dirty.
26 * Then submit it and, if @bio_chain == NULL, wait.
27 */
28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain)
30{
31 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
32 struct bio *bio;
33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
35 bio->bi_sector = sector;
36 bio->bi_bdev = bdev;
37 bio->bi_end_io = end_swap_bio_read;
38
39 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
40 printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
41 (unsigned long long)sector);
42 bio_put(bio);
43 return -EFAULT;
44 }
45
46 lock_page(page);
47 bio_get(bio);
48
49 if (bio_chain == NULL) {
50 submit_bio(bio_rw, bio);
51 wait_on_page_locked(page);
52 if (rw == READ)
53 bio_set_pages_dirty(bio);
54 bio_put(bio);
55 } else {
56 if (rw == READ)
57 get_page(page); /* These pages are freed later */
58 bio->bi_private = *bio_chain;
59 *bio_chain = bio;
60 submit_bio(bio_rw, bio);
61 }
62 return 0;
63}
64
65int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
66{
67 return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
68 virt_to_page(addr), bio_chain);
69}
70
71int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
72{
73 return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
74 virt_to_page(addr), bio_chain);
75}
76
77int hib_wait_on_bio_chain(struct bio **bio_chain)
78{
79 struct bio *bio;
80 struct bio *next_bio;
81 int ret = 0;
82
83 if (bio_chain == NULL)
84 return 0;
85
86 bio = *bio_chain;
87 if (bio == NULL)
88 return 0;
89 while (bio) {
90 struct page *page;
91
92 next_bio = bio->bi_private;
93 page = bio->bi_io_vec[0].bv_page;
94 wait_on_page_locked(page);
95 if (!PageUptodate(page) || PageError(page))
96 ret = -EIO;
97 put_page(page);
98 bio_put(bio);
99 bio = next_bio;
100 }
101 *bio_chain = NULL;
102 return ret;
103}
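hib_bio_read_page()/hib_bio_write_page() either complete synchronously (bio_chain == NULL) or queue the bio on a caller-owned chain that is reaped later with hib_wait_on_bio_chain(). A kernel-side sketch of batching reads that way; it only compiles in-tree against the declarations added to power.h, the helper name is invented, and each buffer is assumed to be a full page:

    /* Read 'nr' consecutive image pages asynchronously, then wait once. */
    static int read_pages(pgoff_t first, void **bufs, unsigned int nr)
    {
            struct bio *bio_chain = NULL;
            unsigned int i;
            int error = 0;

            for (i = 0; i < nr && !error; i++)
                    error = hib_bio_read_page(first + i, bufs[i], &bio_chain);

            /* Always reap what was queued, but keep the first error seen. */
            if (error)
                    hib_wait_on_bio_chain(&bio_chain);
            else
                    error = hib_wait_on_bio_chain(&bio_chain);
            return error;
    }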
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index bbfe472d7524..aa9e916da4d5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -22,6 +22,7 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/gfp.h>
25#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
26#include <asm/suspend.h> 27#include <asm/suspend.h>
27 28
@@ -323,6 +324,7 @@ static int create_image(int platform_mode)
323int hibernation_snapshot(int platform_mode) 324int hibernation_snapshot(int platform_mode)
324{ 325{
325 int error; 326 int error;
327 gfp_t saved_mask;
326 328
327 error = platform_begin(platform_mode); 329 error = platform_begin(platform_mode);
328 if (error) 330 if (error)
@@ -334,6 +336,7 @@ int hibernation_snapshot(int platform_mode)
334 goto Close; 336 goto Close;
335 337
336 suspend_console(); 338 suspend_console();
339 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
337 error = dpm_suspend_start(PMSG_FREEZE); 340 error = dpm_suspend_start(PMSG_FREEZE);
338 if (error) 341 if (error)
339 goto Recover_platform; 342 goto Recover_platform;
@@ -351,6 +354,7 @@ int hibernation_snapshot(int platform_mode)
351 354
352 dpm_resume_end(in_suspend ? 355 dpm_resume_end(in_suspend ?
353 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 356 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
357 set_gfp_allowed_mask(saved_mask);
354 resume_console(); 358 resume_console();
355 Close: 359 Close:
356 platform_end(platform_mode); 360 platform_end(platform_mode);
@@ -445,14 +449,17 @@ static int resume_target_kernel(bool platform_mode)
445int hibernation_restore(int platform_mode) 449int hibernation_restore(int platform_mode)
446{ 450{
447 int error; 451 int error;
452 gfp_t saved_mask;
448 453
449 pm_prepare_console(); 454 pm_prepare_console();
450 suspend_console(); 455 suspend_console();
456 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
451 error = dpm_suspend_start(PMSG_QUIESCE); 457 error = dpm_suspend_start(PMSG_QUIESCE);
452 if (!error) { 458 if (!error) {
453 error = resume_target_kernel(platform_mode); 459 error = resume_target_kernel(platform_mode);
454 dpm_resume_end(PMSG_RECOVER); 460 dpm_resume_end(PMSG_RECOVER);
455 } 461 }
462 set_gfp_allowed_mask(saved_mask);
456 resume_console(); 463 resume_console();
457 pm_restore_console(); 464 pm_restore_console();
458 return error; 465 return error;
@@ -466,6 +473,7 @@ int hibernation_restore(int platform_mode)
466int hibernation_platform_enter(void) 473int hibernation_platform_enter(void)
467{ 474{
468 int error; 475 int error;
476 gfp_t saved_mask;
469 477
470 if (!hibernation_ops) 478 if (!hibernation_ops)
471 return -ENOSYS; 479 return -ENOSYS;
@@ -481,6 +489,7 @@ int hibernation_platform_enter(void)
481 489
482 entering_platform_hibernation = true; 490 entering_platform_hibernation = true;
483 suspend_console(); 491 suspend_console();
492 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
484 error = dpm_suspend_start(PMSG_HIBERNATE); 493 error = dpm_suspend_start(PMSG_HIBERNATE);
485 if (error) { 494 if (error) {
486 if (hibernation_ops->recover) 495 if (hibernation_ops->recover)
@@ -518,6 +527,7 @@ int hibernation_platform_enter(void)
518 Resume_devices: 527 Resume_devices:
519 entering_platform_hibernation = false; 528 entering_platform_hibernation = false;
520 dpm_resume_end(PMSG_RESTORE); 529 dpm_resume_end(PMSG_RESTORE);
530 set_gfp_allowed_mask(saved_mask);
521 resume_console(); 531 resume_console();
522 532
523 Close: 533 Close:
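All three entry points above (snapshot, restore, platform enter) now bracket the dpm_suspend_start()/dpm_resume_end() window with clear_gfp_allowed_mask(GFP_IOFS) and set_gfp_allowed_mask(), so allocations made while devices are quiesced cannot recurse into block or filesystem I/O. Condensed into one illustrative helper (the two mask functions and the dpm_* calls are the kernel's; the wrapper itself is not):

    static int example_sleep_transition(void)
    {
            gfp_t saved_mask;
            int error;

            suspend_console();
            saved_mask = clear_gfp_allowed_mask(GFP_IOFS);  /* drop __GFP_IO|__GFP_FS */

            error = dpm_suspend_start(PMSG_SUSPEND);
            if (!error) {
                    /* ... enter the sleep state, then wake back up ... */
                    dpm_resume_end(PMSG_RESUME);
            }

            set_gfp_allowed_mask(saved_mask);       /* I/O-capable allocations again */
            resume_console();
            return error;
    }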
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0998c7139053..b58800b21fc0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val)
44 == NOTIFY_BAD) ? -EINVAL : 0; 44 == NOTIFY_BAD) ? -EINVAL : 0;
45} 45}
46 46
47/* If set, devices may be suspended and resumed asynchronously. */
48int pm_async_enabled = 1;
49
50static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
51 char *buf)
52{
53 return sprintf(buf, "%d\n", pm_async_enabled);
54}
55
56static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
57 const char *buf, size_t n)
58{
59 unsigned long val;
60
61 if (strict_strtoul(buf, 10, &val))
62 return -EINVAL;
63
64 if (val > 1)
65 return -EINVAL;
66
67 pm_async_enabled = val;
68 return n;
69}
70
71power_attr(pm_async);
72
47#ifdef CONFIG_PM_DEBUG 73#ifdef CONFIG_PM_DEBUG
48int pm_test_level = TEST_NONE; 74int pm_test_level = TEST_NONE;
49 75
@@ -208,9 +234,12 @@ static struct attribute * g[] = {
208#ifdef CONFIG_PM_TRACE 234#ifdef CONFIG_PM_TRACE
209 &pm_trace_attr.attr, 235 &pm_trace_attr.attr,
210#endif 236#endif
211#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) 237#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr,
239#ifdef CONFIG_PM_DEBUG
212 &pm_test_attr.attr, 240 &pm_test_attr.attr,
213#endif 241#endif
242#endif
214 NULL, 243 NULL,
215}; 244};
216 245
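The pm_async_show/store pair above creates /sys/power/pm_async (1 = devices may suspend and resume asynchronously, 0 = strictly sequential). Toggling it needs root and is a one-character write, e.g. echo 0 > /sys/power/pm_async from a shell, or from C:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/power/pm_async", O_WRONLY);

            if (fd < 0 || write(fd, "0", 1) != 1)   /* 0 = synchronous PM */
                    perror("/sys/power/pm_async");
            if (fd >= 0)
                    close(fd);
            return 0;
    }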
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/nvs.c
index 39ac698ef836..1836db60bbb6 100644
--- a/kernel/power/hibernate_nvs.c
+++ b/kernel/power/nvs.c
@@ -10,11 +10,12 @@
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/list.h> 11#include <linux/list.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/slab.h>
13#include <linux/suspend.h> 14#include <linux/suspend.h>
14 15
15/* 16/*
16 * Platforms, like ACPI, may want us to save some memory used by them during 17 * Platforms, like ACPI, may want us to save some memory used by them during
17 * hibernation and to restore the contents of this memory during the subsequent 18 * suspend and to restore the contents of this memory during the subsequent
18 * resume. The code below implements a mechanism allowing us to do that. 19 * resume. The code below implements a mechanism allowing us to do that.
19 */ 20 */
20 21
@@ -29,7 +30,7 @@ struct nvs_page {
29static LIST_HEAD(nvs_list); 30static LIST_HEAD(nvs_list);
30 31
31/** 32/**
32 * hibernate_nvs_register - register platform NVS memory region to save 33 * suspend_nvs_register - register platform NVS memory region to save
33 * @start - physical address of the region 34 * @start - physical address of the region
34 * @size - size of the region 35 * @size - size of the region
35 * 36 *
@@ -37,7 +38,7 @@ static LIST_HEAD(nvs_list);
37 * things so that the data from page-aligned addresses in this region will 38 * things so that the data from page-aligned addresses in this region will
38 * be copied into separate RAM pages. 39 * be copied into separate RAM pages.
39 */ 40 */
40int hibernate_nvs_register(unsigned long start, unsigned long size) 41int suspend_nvs_register(unsigned long start, unsigned long size)
41{ 42{
42 struct nvs_page *entry, *next; 43 struct nvs_page *entry, *next;
43 44
@@ -67,9 +68,9 @@ int hibernate_nvs_register(unsigned long start, unsigned long size)
67} 68}
68 69
69/** 70/**
70 * hibernate_nvs_free - free data pages allocated for saving NVS regions 71 * suspend_nvs_free - free data pages allocated for saving NVS regions
71 */ 72 */
72void hibernate_nvs_free(void) 73void suspend_nvs_free(void)
73{ 74{
74 struct nvs_page *entry; 75 struct nvs_page *entry;
75 76
@@ -85,16 +86,16 @@ void hibernate_nvs_free(void)
85} 86}
86 87
87/** 88/**
88 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions 89 * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
89 */ 90 */
90int hibernate_nvs_alloc(void) 91int suspend_nvs_alloc(void)
91{ 92{
92 struct nvs_page *entry; 93 struct nvs_page *entry;
93 94
94 list_for_each_entry(entry, &nvs_list, node) { 95 list_for_each_entry(entry, &nvs_list, node) {
95 entry->data = (void *)__get_free_page(GFP_KERNEL); 96 entry->data = (void *)__get_free_page(GFP_KERNEL);
96 if (!entry->data) { 97 if (!entry->data) {
97 hibernate_nvs_free(); 98 suspend_nvs_free();
98 return -ENOMEM; 99 return -ENOMEM;
99 } 100 }
100 } 101 }
@@ -102,9 +103,9 @@ int hibernate_nvs_alloc(void)
102} 103}
103 104
104/** 105/**
105 * hibernate_nvs_save - save NVS memory regions 106 * suspend_nvs_save - save NVS memory regions
106 */ 107 */
107void hibernate_nvs_save(void) 108void suspend_nvs_save(void)
108{ 109{
109 struct nvs_page *entry; 110 struct nvs_page *entry;
110 111
@@ -118,12 +119,12 @@ void hibernate_nvs_save(void)
118} 119}
119 120
120/** 121/**
121 * hibernate_nvs_restore - restore NVS memory regions 122 * suspend_nvs_restore - restore NVS memory regions
122 * 123 *
123 * This function is going to be called with interrupts disabled, so it 124 * This function is going to be called with interrupts disabled, so it
124 * cannot iounmap the virtual addresses used to access the NVS region. 125 * cannot iounmap the virtual addresses used to access the NVS region.
125 */ 126 */
126void hibernate_nvs_restore(void) 127void suspend_nvs_restore(void)
127{ 128{
128 struct nvs_page *entry; 129 struct nvs_page *entry;
129 130
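With the rename, platform code calls suspend_nvs_register() once per firmware-reported NVS range at boot, and the suspend core takes care of allocating, saving, restoring and freeing the backup pages around the sleep transition. A hedged registration sketch; the address and size are placeholders, and in-tree the real caller is the ACPI NVS handling:

    static int __init example_register_nvs(void)
    {
            unsigned long start = 0xdf000000;       /* hypothetical NVS region */
            unsigned long size  = 0x00010000;       /* 64 KiB */

            return suspend_nvs_register(start, size);
    }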
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46c5a26630a3..006270fe382d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -97,24 +97,12 @@ extern int hibernate_preallocate_memory(void);
97 */ 97 */
98 98
99struct snapshot_handle { 99struct snapshot_handle {
100 loff_t offset; /* number of the last byte ready for reading
101 * or writing in the sequence
102 */
103 unsigned int cur; /* number of the block of PAGE_SIZE bytes the 100 unsigned int cur; /* number of the block of PAGE_SIZE bytes the
104 * next operation will refer to (ie. current) 101 * next operation will refer to (ie. current)
105 */ 102 */
106 unsigned int cur_offset; /* offset with respect to the current
107 * block (for the next operation)
108 */
109 unsigned int prev; /* number of the block of PAGE_SIZE bytes that
110 * was the current one previously
111 */
112 void *buffer; /* address of the block to read from 103 void *buffer; /* address of the block to read from
113 * or write to 104 * or write to
114 */ 105 */
115 unsigned int buf_offset; /* location to read from or write to,
116 * given as a displacement from 'buffer'
117 */
118 int sync_read; /* Set to one to notify the caller of 106 int sync_read; /* Set to one to notify the caller of
119 * snapshot_write_next() that it may 107 * snapshot_write_next() that it may
120 * need to call wait_on_bio_chain() 108 * need to call wait_on_bio_chain()
@@ -125,12 +113,12 @@ struct snapshot_handle {
125 * snapshot_read_next()/snapshot_write_next() is allowed to 113 * snapshot_read_next()/snapshot_write_next() is allowed to
126 * read/write data after the function returns 114 * read/write data after the function returns
127 */ 115 */
128#define data_of(handle) ((handle).buffer + (handle).buf_offset) 116#define data_of(handle) ((handle).buffer)
129 117
130extern unsigned int snapshot_additional_pages(struct zone *zone); 118extern unsigned int snapshot_additional_pages(struct zone *zone);
131extern unsigned long snapshot_get_image_size(void); 119extern unsigned long snapshot_get_image_size(void);
132extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 120extern int snapshot_read_next(struct snapshot_handle *handle);
133extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 121extern int snapshot_write_next(struct snapshot_handle *handle);
134extern void snapshot_write_finalize(struct snapshot_handle *handle); 122extern void snapshot_write_finalize(struct snapshot_handle *handle);
135extern int snapshot_image_loaded(struct snapshot_handle *handle); 123extern int snapshot_image_loaded(struct snapshot_handle *handle);
136 124
@@ -154,6 +142,15 @@ extern int swsusp_read(unsigned int *flags_p);
154extern int swsusp_write(unsigned int flags); 142extern int swsusp_write(unsigned int flags);
155extern void swsusp_close(fmode_t); 143extern void swsusp_close(fmode_t);
156 144
145/* kernel/power/block_io.c */
146extern struct block_device *hib_resume_bdev;
147
148extern int hib_bio_read_page(pgoff_t page_off, void *addr,
149 struct bio **bio_chain);
150extern int hib_bio_write_page(pgoff_t page_off, void *addr,
151 struct bio **bio_chain);
152extern int hib_wait_on_bio_chain(struct bio **bio_chain);
153
157struct timeval; 154struct timeval;
158/* kernel/power/swsusp.c */ 155/* kernel/power/swsusp.c */
159extern void swsusp_show_speed(struct timeval *, struct timeval *, 156extern void swsusp_show_speed(struct timeval *, struct timeval *,
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5ade1bdcf366..71ae29052ab6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -88,12 +88,11 @@ static int try_to_freeze_tasks(bool sig_only)
88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
89 "(%d tasks refusing to freeze):\n", 89 "(%d tasks refusing to freeze):\n",
90 elapsed_csecs / 100, elapsed_csecs % 100, todo); 90 elapsed_csecs / 100, elapsed_csecs % 100, todo);
91 show_state();
92 read_lock(&tasklist_lock); 91 read_lock(&tasklist_lock);
93 do_each_thread(g, p) { 92 do_each_thread(g, p) {
94 task_lock(p); 93 task_lock(p);
95 if (freezing(p) && !freezer_should_skip(p)) 94 if (freezing(p) && !freezer_should_skip(p))
96 printk(KERN_ERR " %s\n", p->comm); 95 sched_show_task(p);
97 cancel_freezing(p); 96 cancel_freezing(p);
98 task_unlock(p); 97 task_unlock(p);
99 } while_each_thread(g, p); 98 } while_each_thread(g, p);
@@ -145,7 +144,7 @@ static void thaw_tasks(bool nosig_only)
145 if (nosig_only && should_send_signal(p)) 144 if (nosig_only && should_send_signal(p))
146 continue; 145 continue;
147 146
148 if (cgroup_frozen(p)) 147 if (cgroup_freezing_or_frozen(p))
149 continue; 148 continue;
150 149
151 thaw_process(p); 150 thaw_process(p);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 36cb168e4330..25ce010e9f8b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -26,6 +26,7 @@
26#include <linux/console.h> 26#include <linux/console.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/slab.h>
29 30
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/mmu_context.h> 32#include <asm/mmu_context.h>
@@ -1181,7 +1182,7 @@ static void free_unnecessary_pages(void)
1181 1182
1182 memory_bm_position_reset(&copy_bm); 1183 memory_bm_position_reset(&copy_bm);
1183 1184
1184 while (to_free_normal > 0 && to_free_highmem > 0) { 1185 while (to_free_normal > 0 || to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1186 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn); 1187 struct page *page = pfn_to_page(pfn);
1187 1188
@@ -1500,7 +1501,7 @@ asmlinkage int swsusp_save(void)
1500{ 1501{
1501 unsigned int nr_pages, nr_highmem; 1502 unsigned int nr_pages, nr_highmem;
1502 1503
1503 printk(KERN_INFO "PM: Creating hibernation image: \n"); 1504 printk(KERN_INFO "PM: Creating hibernation image:\n");
1504 1505
1505 drain_local_pages(NULL); 1506 drain_local_pages(NULL);
1506 nr_pages = count_data_pages(); 1507 nr_pages = count_data_pages();
@@ -1603,14 +1604,9 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1603 * snapshot_handle structure. The structure gets updated and a pointer 1604 * snapshot_handle structure. The structure gets updated and a pointer
1604 * to it should be passed to this function every next time. 1605 * to it should be passed to this function every next time.
1605 * 1606 *
1606 * The @count parameter should contain the number of bytes the caller
1607 * wants to read from the snapshot. It must not be zero.
1608 *
1609 * On success the function returns a positive number. Then, the caller 1607 * On success the function returns a positive number. Then, the caller
1610 * is allowed to read up to the returned number of bytes from the memory 1608 * is allowed to read up to the returned number of bytes from the memory
1611 * location computed by the data_of() macro. The number returned 1609 * location computed by the data_of() macro.
1612 * may be smaller than @count, but this only happens if the read would
1613 * cross a page boundary otherwise.
1614 * 1610 *
1615 * The function returns 0 to indicate the end of data stream condition, 1611 * The function returns 0 to indicate the end of data stream condition,
1616 * and a negative number is returned on error. In such cases the 1612 * and a negative number is returned on error. In such cases the
@@ -1618,7 +1614,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1618 * any more. 1614 * any more.
1619 */ 1615 */
1620 1616
1621int snapshot_read_next(struct snapshot_handle *handle, size_t count) 1617int snapshot_read_next(struct snapshot_handle *handle)
1622{ 1618{
1623 if (handle->cur > nr_meta_pages + nr_copy_pages) 1619 if (handle->cur > nr_meta_pages + nr_copy_pages)
1624 return 0; 1620 return 0;
@@ -1629,7 +1625,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1629 if (!buffer) 1625 if (!buffer)
1630 return -ENOMEM; 1626 return -ENOMEM;
1631 } 1627 }
1632 if (!handle->offset) { 1628 if (!handle->cur) {
1633 int error; 1629 int error;
1634 1630
1635 error = init_header((struct swsusp_info *)buffer); 1631 error = init_header((struct swsusp_info *)buffer);
@@ -1638,42 +1634,30 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1638 handle->buffer = buffer; 1634 handle->buffer = buffer;
1639 memory_bm_position_reset(&orig_bm); 1635 memory_bm_position_reset(&orig_bm);
1640 memory_bm_position_reset(&copy_bm); 1636 memory_bm_position_reset(&copy_bm);
1641 } 1637 } else if (handle->cur <= nr_meta_pages) {
1642 if (handle->prev < handle->cur) { 1638 memset(buffer, 0, PAGE_SIZE);
1643 if (handle->cur <= nr_meta_pages) { 1639 pack_pfns(buffer, &orig_bm);
1644 memset(buffer, 0, PAGE_SIZE); 1640 } else {
1645 pack_pfns(buffer, &orig_bm); 1641 struct page *page;
1646 } else {
1647 struct page *page;
1648 1642
1649 page = pfn_to_page(memory_bm_next_pfn(&copy_bm)); 1643 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1650 if (PageHighMem(page)) { 1644 if (PageHighMem(page)) {
1651 /* Highmem pages are copied to the buffer, 1645 /* Highmem pages are copied to the buffer,
1652 * because we can't return with a kmapped 1646 * because we can't return with a kmapped
1653 * highmem page (we may not be called again). 1647 * highmem page (we may not be called again).
1654 */ 1648 */
1655 void *kaddr; 1649 void *kaddr;
1656 1650
1657 kaddr = kmap_atomic(page, KM_USER0); 1651 kaddr = kmap_atomic(page, KM_USER0);
1658 memcpy(buffer, kaddr, PAGE_SIZE); 1652 memcpy(buffer, kaddr, PAGE_SIZE);
1659 kunmap_atomic(kaddr, KM_USER0); 1653 kunmap_atomic(kaddr, KM_USER0);
1660 handle->buffer = buffer; 1654 handle->buffer = buffer;
1661 } else { 1655 } else {
1662 handle->buffer = page_address(page); 1656 handle->buffer = page_address(page);
1663 }
1664 } 1657 }
1665 handle->prev = handle->cur;
1666 }
1667 handle->buf_offset = handle->cur_offset;
1668 if (handle->cur_offset + count >= PAGE_SIZE) {
1669 count = PAGE_SIZE - handle->cur_offset;
1670 handle->cur_offset = 0;
1671 handle->cur++;
1672 } else {
1673 handle->cur_offset += count;
1674 } 1658 }
1675 handle->offset += count; 1659 handle->cur++;
1676 return count; 1660 return PAGE_SIZE;
1677} 1661}
1678 1662
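With the offset/prev bookkeeping removed, snapshot_read_next() hands back exactly one page (PAGE_SIZE) per successful call, so a consumer loop reduces to the following shape; write_page_out() is a hypothetical sink standing in for the swap or user-space writer:

    struct snapshot_handle snapshot;
    int ret;

    memset(&snapshot, 0, sizeof(snapshot));
    while ((ret = snapshot_read_next(&snapshot)) > 0) {
            /* data_of(snapshot) points at one full page of image data. */
            ret = write_page_out(data_of(snapshot));        /* hypothetical */
            if (ret)
                    break;
    }
    /* ret == 0: whole image consumed; ret < 0: error from the snapshot layer. */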
1679/** 1663/**
@@ -2132,14 +2116,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2132 * snapshot_handle structure. The structure gets updated and a pointer 2116 * snapshot_handle structure. The structure gets updated and a pointer
2133 * to it should be passed to this function every next time. 2117 * to it should be passed to this function every next time.
2134 * 2118 *
2135 * The @count parameter should contain the number of bytes the caller
2136 * wants to write to the image. It must not be zero.
2137 *
2138 * On success the function returns a positive number. Then, the caller 2119 * On success the function returns a positive number. Then, the caller
2139 * is allowed to write up to the returned number of bytes to the memory 2120 * is allowed to write up to the returned number of bytes to the memory
2140 * location computed by the data_of() macro. The number returned 2121 * location computed by the data_of() macro.
2141 * may be smaller than @count, but this only happens if the write would
2142 * cross a page boundary otherwise.
2143 * 2122 *
2144 * The function returns 0 to indicate the "end of file" condition, 2123 * The function returns 0 to indicate the "end of file" condition,
2145 * and a negative number is returned on error. In such cases the 2124 * and a negative number is returned on error. In such cases the
@@ -2147,16 +2126,18 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2147 * any more. 2126 * any more.
2148 */ 2127 */
2149 2128
2150int snapshot_write_next(struct snapshot_handle *handle, size_t count) 2129int snapshot_write_next(struct snapshot_handle *handle)
2151{ 2130{
2152 static struct chain_allocator ca; 2131 static struct chain_allocator ca;
2153 int error = 0; 2132 int error = 0;
2154 2133
2155 /* Check if we have already loaded the entire image */ 2134 /* Check if we have already loaded the entire image */
2156 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 2135 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
2157 return 0; 2136 return 0;
2158 2137
2159 if (handle->offset == 0) { 2138 handle->sync_read = 1;
2139
2140 if (!handle->cur) {
2160 if (!buffer) 2141 if (!buffer)
2161 /* This makes the buffer be freed by swsusp_free() */ 2142 /* This makes the buffer be freed by swsusp_free() */
2162 buffer = get_image_page(GFP_ATOMIC, PG_ANY); 2143 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
@@ -2165,56 +2146,43 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
2165 return -ENOMEM; 2146 return -ENOMEM;
2166 2147
2167 handle->buffer = buffer; 2148 handle->buffer = buffer;
2168 } 2149 } else if (handle->cur == 1) {
2169 handle->sync_read = 1; 2150 error = load_header(buffer);
2170 if (handle->prev < handle->cur) { 2151 if (error)
2171 if (handle->prev == 0) { 2152 return error;
2172 error = load_header(buffer);
2173 if (error)
2174 return error;
2175 2153
2176 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY); 2154 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
2177 if (error) 2155 if (error)
2178 return error; 2156 return error;
2157
2158 } else if (handle->cur <= nr_meta_pages + 1) {
2159 error = unpack_orig_pfns(buffer, &copy_bm);
2160 if (error)
2161 return error;
2179 2162
2180 } else if (handle->prev <= nr_meta_pages) { 2163 if (handle->cur == nr_meta_pages + 1) {
2181 error = unpack_orig_pfns(buffer, &copy_bm); 2164 error = prepare_image(&orig_bm, &copy_bm);
2182 if (error) 2165 if (error)
2183 return error; 2166 return error;
2184 2167
2185 if (handle->prev == nr_meta_pages) { 2168 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2186 error = prepare_image(&orig_bm, &copy_bm); 2169 memory_bm_position_reset(&orig_bm);
2187 if (error) 2170 restore_pblist = NULL;
2188 return error;
2189
2190 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2191 memory_bm_position_reset(&orig_bm);
2192 restore_pblist = NULL;
2193 handle->buffer = get_buffer(&orig_bm, &ca);
2194 handle->sync_read = 0;
2195 if (IS_ERR(handle->buffer))
2196 return PTR_ERR(handle->buffer);
2197 }
2198 } else {
2199 copy_last_highmem_page();
2200 handle->buffer = get_buffer(&orig_bm, &ca); 2171 handle->buffer = get_buffer(&orig_bm, &ca);
2172 handle->sync_read = 0;
2201 if (IS_ERR(handle->buffer)) 2173 if (IS_ERR(handle->buffer))
2202 return PTR_ERR(handle->buffer); 2174 return PTR_ERR(handle->buffer);
2203 if (handle->buffer != buffer)
2204 handle->sync_read = 0;
2205 } 2175 }
2206 handle->prev = handle->cur;
2207 }
2208 handle->buf_offset = handle->cur_offset;
2209 if (handle->cur_offset + count >= PAGE_SIZE) {
2210 count = PAGE_SIZE - handle->cur_offset;
2211 handle->cur_offset = 0;
2212 handle->cur++;
2213 } else { 2176 } else {
2214 handle->cur_offset += count; 2177 copy_last_highmem_page();
2178 handle->buffer = get_buffer(&orig_bm, &ca);
2179 if (IS_ERR(handle->buffer))
2180 return PTR_ERR(handle->buffer);
2181 if (handle->buffer != buffer)
2182 handle->sync_read = 0;
2215 } 2183 }
2216 handle->offset += count; 2184 handle->cur++;
2217 return count; 2185 return PAGE_SIZE;
2218} 2186}
2219 2187
2220/** 2188/**
@@ -2229,7 +2197,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
2229{ 2197{
2230 copy_last_highmem_page(); 2198 copy_last_highmem_page();
2231 /* Free only if we have loaded the image entirely */ 2199 /* Free only if we have loaded the image entirely */
2232 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { 2200 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2233 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2201 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
2234 free_highmem_data(); 2202 free_highmem_data();
2235 } 2203 }
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6f10dfc2d3e9..f37cb7dd4402 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -15,6 +15,13 @@
15#include <linux/console.h> 15#include <linux/console.h>
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/gfp.h>
19#include <linux/io.h>
20#include <linux/kernel.h>
21#include <linux/list.h>
22#include <linux/mm.h>
23#include <linux/slab.h>
24#include <linux/suspend.h>
18 25
19#include "power.h" 26#include "power.h"
20 27
@@ -189,6 +196,7 @@ static int suspend_enter(suspend_state_t state)
189int suspend_devices_and_enter(suspend_state_t state) 196int suspend_devices_and_enter(suspend_state_t state)
190{ 197{
191 int error; 198 int error;
199 gfp_t saved_mask;
192 200
193 if (!suspend_ops) 201 if (!suspend_ops)
194 return -ENOSYS; 202 return -ENOSYS;
@@ -199,6 +207,7 @@ int suspend_devices_and_enter(suspend_state_t state)
199 goto Close; 207 goto Close;
200 } 208 }
201 suspend_console(); 209 suspend_console();
210 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
202 suspend_test_start(); 211 suspend_test_start();
203 error = dpm_suspend_start(PMSG_SUSPEND); 212 error = dpm_suspend_start(PMSG_SUSPEND);
204 if (error) { 213 if (error) {
@@ -215,6 +224,7 @@ int suspend_devices_and_enter(suspend_state_t state)
215 suspend_test_start(); 224 suspend_test_start();
216 dpm_resume_end(PMSG_RESUME); 225 dpm_resume_end(PMSG_RESUME);
217 suspend_test_finish("resume devices"); 226 suspend_test_finish("resume devices");
227 set_gfp_allowed_mask(saved_mask);
218 resume_console(); 228 resume_console();
219 Close: 229 Close:
220 if (suspend_ops->end) 230 if (suspend_ops->end)
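
suspend_devices_and_enter() now saves, clears and later restores the set of allowed allocation flags around the device suspend path (clear_gfp_allowed_mask(GFP_IOFS) / set_gfp_allowed_mask(saved_mask)), so that no I/O- or filesystem-backed allocations happen while devices are quiesced. A small user-space analogue of that save-clear-restore pattern; the mask bits and helper names below are invented for illustration and are not the kernel's.

#include <stdio.h>

/* Illustrative flag bits standing in for GFP_IO / GFP_FS. */
#define MASK_IO 0x1u
#define MASK_FS 0x2u

static unsigned int allowed_mask = MASK_IO | MASK_FS;

/* Clear @mask from the allowed bits and return the previous value. */
static unsigned int clear_allowed_mask(unsigned int mask)
{
    unsigned int saved = allowed_mask;

    allowed_mask &= ~mask;
    return saved;
}

static void set_allowed_mask(unsigned int mask)
{
    allowed_mask = mask;
}

int main(void)
{
    unsigned int saved = clear_allowed_mask(MASK_IO | MASK_FS);

    printf("during suspend: %#x\n", allowed_mask);   /* 0x0 */
    /* ... devices are suspended and resumed here ... */
    set_allowed_mask(saved);
    printf("after resume:   %#x\n", allowed_mask);   /* 0x3 */
    return 0;
}
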
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 09b2b0ae9e9d..b0bb21778391 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -23,11 +23,46 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/swapops.h> 24#include <linux/swapops.h>
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/slab.h>
26 27
27#include "power.h" 28#include "power.h"
28 29
29#define SWSUSP_SIG "S1SUSPEND" 30#define SWSUSP_SIG "S1SUSPEND"
30 31
32/*
33 * The swap map is a data structure used for keeping track of each page
34 * written to a swap partition. It consists of many swap_map_page
 35 * structures that each contain an array of MAP_PAGE_ENTRIES swap entries.
36 * These structures are stored on the swap and linked together with the
37 * help of the .next_swap member.
38 *
39 * The swap map is created during suspend. The swap map pages are
40 * allocated and populated one at a time, so we only need one memory
41 * page to set up the entire structure.
42 *
43 * During resume we also only need to use one swap_map_page structure
44 * at a time.
45 */
46
47#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
48
49struct swap_map_page {
50 sector_t entries[MAP_PAGE_ENTRIES];
51 sector_t next_swap;
52};
53
54/**
55 * The swap_map_handle structure is used for handling swap in
 56 * a file-like way
57 */
58
59struct swap_map_handle {
60 struct swap_map_page *cur;
61 sector_t cur_swap;
62 sector_t first_sector;
63 unsigned int k;
64};
65
31struct swsusp_header { 66struct swsusp_header {
32 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; 67 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
33 sector_t image; 68 sector_t image;
@@ -144,110 +179,24 @@ int swsusp_swap_in_use(void)
144 */ 179 */
145 180
146static unsigned short root_swap = 0xffff; 181static unsigned short root_swap = 0xffff;
147static struct block_device *resume_bdev; 182struct block_device *hib_resume_bdev;
148
149/**
150 * submit - submit BIO request.
151 * @rw: READ or WRITE.
152 * @off physical offset of page.
153 * @page: page we're reading or writing.
154 * @bio_chain: list of pending biod (for async reading)
155 *
156 * Straight from the textbook - allocate and initialize the bio.
157 * If we're reading, make sure the page is marked as dirty.
158 * Then submit it and, if @bio_chain == NULL, wait.
159 */
160static int submit(int rw, pgoff_t page_off, struct page *page,
161 struct bio **bio_chain)
162{
163 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
164 struct bio *bio;
165
166 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
167 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
168 bio->bi_bdev = resume_bdev;
169 bio->bi_end_io = end_swap_bio_read;
170
171 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
172 printk(KERN_ERR "PM: Adding page to bio failed at %ld\n",
173 page_off);
174 bio_put(bio);
175 return -EFAULT;
176 }
177
178 lock_page(page);
179 bio_get(bio);
180
181 if (bio_chain == NULL) {
182 submit_bio(bio_rw, bio);
183 wait_on_page_locked(page);
184 if (rw == READ)
185 bio_set_pages_dirty(bio);
186 bio_put(bio);
187 } else {
188 if (rw == READ)
189 get_page(page); /* These pages are freed later */
190 bio->bi_private = *bio_chain;
191 *bio_chain = bio;
192 submit_bio(bio_rw, bio);
193 }
194 return 0;
195}
196
197static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
198{
199 return submit(READ, page_off, virt_to_page(addr), bio_chain);
200}
201
202static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
203{
204 return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
205}
206
207static int wait_on_bio_chain(struct bio **bio_chain)
208{
209 struct bio *bio;
210 struct bio *next_bio;
211 int ret = 0;
212
213 if (bio_chain == NULL)
214 return 0;
215
216 bio = *bio_chain;
217 if (bio == NULL)
218 return 0;
219 while (bio) {
220 struct page *page;
221
222 next_bio = bio->bi_private;
223 page = bio->bi_io_vec[0].bv_page;
224 wait_on_page_locked(page);
225 if (!PageUptodate(page) || PageError(page))
226 ret = -EIO;
227 put_page(page);
228 bio_put(bio);
229 bio = next_bio;
230 }
231 *bio_chain = NULL;
232 return ret;
233}
234 183
235/* 184/*
236 * Saving part 185 * Saving part
237 */ 186 */
238 187
239static int mark_swapfiles(sector_t start, unsigned int flags) 188static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
240{ 189{
241 int error; 190 int error;
242 191
243 bio_read_page(swsusp_resume_block, swsusp_header, NULL); 192 hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
244 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 193 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
245 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 194 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
246 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 195 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
247 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 196 memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
248 swsusp_header->image = start; 197 swsusp_header->image = handle->first_sector;
249 swsusp_header->flags = flags; 198 swsusp_header->flags = flags;
250 error = bio_write_page(swsusp_resume_block, 199 error = hib_bio_write_page(swsusp_resume_block,
251 swsusp_header, NULL); 200 swsusp_header, NULL);
252 } else { 201 } else {
253 printk(KERN_ERR "PM: Swap header not found!\n"); 202 printk(KERN_ERR "PM: Swap header not found!\n");
@@ -259,25 +208,26 @@ static int mark_swapfiles(sector_t start, unsigned int flags)
259/** 208/**
260 * swsusp_swap_check - check if the resume device is a swap device 209 * swsusp_swap_check - check if the resume device is a swap device
261 * and get its index (if so) 210 * and get its index (if so)
211 *
212 * This is called before saving image
262 */ 213 */
263 214static int swsusp_swap_check(void)
264static int swsusp_swap_check(void) /* This is called before saving image */
265{ 215{
266 int res; 216 int res;
267 217
268 res = swap_type_of(swsusp_resume_device, swsusp_resume_block, 218 res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
269 &resume_bdev); 219 &hib_resume_bdev);
270 if (res < 0) 220 if (res < 0)
271 return res; 221 return res;
272 222
273 root_swap = res; 223 root_swap = res;
274 res = blkdev_get(resume_bdev, FMODE_WRITE); 224 res = blkdev_get(hib_resume_bdev, FMODE_WRITE);
275 if (res) 225 if (res)
276 return res; 226 return res;
277 227
278 res = set_blocksize(resume_bdev, PAGE_SIZE); 228 res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
279 if (res < 0) 229 if (res < 0)
280 blkdev_put(resume_bdev, FMODE_WRITE); 230 blkdev_put(hib_resume_bdev, FMODE_WRITE);
281 231
282 return res; 232 return res;
283} 233}
@@ -308,42 +258,9 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
308 } else { 258 } else {
309 src = buf; 259 src = buf;
310 } 260 }
311 return bio_write_page(offset, src, bio_chain); 261 return hib_bio_write_page(offset, src, bio_chain);
312} 262}
313 263
314/*
315 * The swap map is a data structure used for keeping track of each page
316 * written to a swap partition. It consists of many swap_map_page
317 * structures that contain each an array of MAP_PAGE_SIZE swap entries.
318 * These structures are stored on the swap and linked together with the
319 * help of the .next_swap member.
320 *
321 * The swap map is created during suspend. The swap map pages are
322 * allocated and populated one at a time, so we only need one memory
323 * page to set up the entire structure.
324 *
325 * During resume we also only need to use one swap_map_page structure
326 * at a time.
327 */
328
329#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
330
331struct swap_map_page {
332 sector_t entries[MAP_PAGE_ENTRIES];
333 sector_t next_swap;
334};
335
336/**
337 * The swap_map_handle structure is used for handling swap in
338 * a file-alike way
339 */
340
341struct swap_map_handle {
342 struct swap_map_page *cur;
343 sector_t cur_swap;
344 unsigned int k;
345};
346
347static void release_swap_writer(struct swap_map_handle *handle) 264static void release_swap_writer(struct swap_map_handle *handle)
348{ 265{
349 if (handle->cur) 266 if (handle->cur)
@@ -353,16 +270,33 @@ static void release_swap_writer(struct swap_map_handle *handle)
353 270
354static int get_swap_writer(struct swap_map_handle *handle) 271static int get_swap_writer(struct swap_map_handle *handle)
355{ 272{
273 int ret;
274
275 ret = swsusp_swap_check();
276 if (ret) {
277 if (ret != -ENOSPC)
278 printk(KERN_ERR "PM: Cannot find swap device, try "
279 "swapon -a.\n");
280 return ret;
281 }
356 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 282 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
357 if (!handle->cur) 283 if (!handle->cur) {
358 return -ENOMEM; 284 ret = -ENOMEM;
285 goto err_close;
286 }
359 handle->cur_swap = alloc_swapdev_block(root_swap); 287 handle->cur_swap = alloc_swapdev_block(root_swap);
360 if (!handle->cur_swap) { 288 if (!handle->cur_swap) {
361 release_swap_writer(handle); 289 ret = -ENOSPC;
362 return -ENOSPC; 290 goto err_rel;
363 } 291 }
364 handle->k = 0; 292 handle->k = 0;
293 handle->first_sector = handle->cur_swap;
365 return 0; 294 return 0;
295err_rel:
296 release_swap_writer(handle);
297err_close:
298 swsusp_close(FMODE_WRITE);
299 return ret;
366} 300}
367 301
368static int swap_write_page(struct swap_map_handle *handle, void *buf, 302static int swap_write_page(struct swap_map_handle *handle, void *buf,
@@ -379,7 +313,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
379 return error; 313 return error;
380 handle->cur->entries[handle->k++] = offset; 314 handle->cur->entries[handle->k++] = offset;
381 if (handle->k >= MAP_PAGE_ENTRIES) { 315 if (handle->k >= MAP_PAGE_ENTRIES) {
382 error = wait_on_bio_chain(bio_chain); 316 error = hib_wait_on_bio_chain(bio_chain);
383 if (error) 317 if (error)
384 goto out; 318 goto out;
385 offset = alloc_swapdev_block(root_swap); 319 offset = alloc_swapdev_block(root_swap);
@@ -405,6 +339,24 @@ static int flush_swap_writer(struct swap_map_handle *handle)
405 return -EINVAL; 339 return -EINVAL;
406} 340}
407 341
342static int swap_writer_finish(struct swap_map_handle *handle,
343 unsigned int flags, int error)
344{
345 if (!error) {
346 flush_swap_writer(handle);
347 printk(KERN_INFO "PM: S");
348 error = mark_swapfiles(handle, flags);
349 printk("|\n");
350 }
351
352 if (error)
353 free_all_swap_pages(root_swap);
354 release_swap_writer(handle);
355 swsusp_close(FMODE_WRITE);
356
357 return error;
358}
359
408/** 360/**
409 * save_image - save the suspend image data 361 * save_image - save the suspend image data
410 */ 362 */
@@ -430,7 +382,7 @@ static int save_image(struct swap_map_handle *handle,
430 bio = NULL; 382 bio = NULL;
431 do_gettimeofday(&start); 383 do_gettimeofday(&start);
432 while (1) { 384 while (1) {
433 ret = snapshot_read_next(snapshot, PAGE_SIZE); 385 ret = snapshot_read_next(snapshot);
434 if (ret <= 0) 386 if (ret <= 0)
435 break; 387 break;
436 ret = swap_write_page(handle, data_of(*snapshot), &bio); 388 ret = swap_write_page(handle, data_of(*snapshot), &bio);
@@ -440,7 +392,7 @@ static int save_image(struct swap_map_handle *handle,
440 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 392 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
441 nr_pages++; 393 nr_pages++;
442 } 394 }
443 err2 = wait_on_bio_chain(&bio); 395 err2 = hib_wait_on_bio_chain(&bio);
444 do_gettimeofday(&stop); 396 do_gettimeofday(&stop);
445 if (!ret) 397 if (!ret)
446 ret = err2; 398 ret = err2;
@@ -482,50 +434,34 @@ int swsusp_write(unsigned int flags)
482 struct swap_map_handle handle; 434 struct swap_map_handle handle;
483 struct snapshot_handle snapshot; 435 struct snapshot_handle snapshot;
484 struct swsusp_info *header; 436 struct swsusp_info *header;
437 unsigned long pages;
485 int error; 438 int error;
486 439
487 error = swsusp_swap_check(); 440 pages = snapshot_get_image_size();
441 error = get_swap_writer(&handle);
488 if (error) { 442 if (error) {
489 printk(KERN_ERR "PM: Cannot find swap device, try " 443 printk(KERN_ERR "PM: Cannot get swap writer\n");
490 "swapon -a.\n");
491 return error; 444 return error;
492 } 445 }
446 if (!enough_swap(pages)) {
447 printk(KERN_ERR "PM: Not enough free swap\n");
448 error = -ENOSPC;
449 goto out_finish;
450 }
493 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 451 memset(&snapshot, 0, sizeof(struct snapshot_handle));
494 error = snapshot_read_next(&snapshot, PAGE_SIZE); 452 error = snapshot_read_next(&snapshot);
495 if (error < PAGE_SIZE) { 453 if (error < PAGE_SIZE) {
496 if (error >= 0) 454 if (error >= 0)
497 error = -EFAULT; 455 error = -EFAULT;
498 456
499 goto out; 457 goto out_finish;
500 } 458 }
501 header = (struct swsusp_info *)data_of(snapshot); 459 header = (struct swsusp_info *)data_of(snapshot);
502 if (!enough_swap(header->pages)) { 460 error = swap_write_page(&handle, header, NULL);
503 printk(KERN_ERR "PM: Not enough free swap\n"); 461 if (!error)
504 error = -ENOSPC; 462 error = save_image(&handle, &snapshot, pages - 1);
505 goto out; 463out_finish:
506 } 464 error = swap_writer_finish(&handle, flags, error);
507 error = get_swap_writer(&handle);
508 if (!error) {
509 sector_t start = handle.cur_swap;
510
511 error = swap_write_page(&handle, header, NULL);
512 if (!error)
513 error = save_image(&handle, &snapshot,
514 header->pages - 1);
515
516 if (!error) {
517 flush_swap_writer(&handle);
518 printk(KERN_INFO "PM: S");
519 error = mark_swapfiles(start, flags);
520 printk("|\n");
521 }
522 }
523 if (error)
524 free_all_swap_pages(root_swap);
525
526 release_swap_writer(&handle);
527 out:
528 swsusp_close(FMODE_WRITE);
529 return error; 465 return error;
530} 466}
531 467
@@ -541,18 +477,21 @@ static void release_swap_reader(struct swap_map_handle *handle)
541 handle->cur = NULL; 477 handle->cur = NULL;
542} 478}
543 479
544static int get_swap_reader(struct swap_map_handle *handle, sector_t start) 480static int get_swap_reader(struct swap_map_handle *handle,
481 unsigned int *flags_p)
545{ 482{
546 int error; 483 int error;
547 484
548 if (!start) 485 *flags_p = swsusp_header->flags;
486
487 if (!swsusp_header->image) /* how can this happen? */
549 return -EINVAL; 488 return -EINVAL;
550 489
551 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); 490 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
552 if (!handle->cur) 491 if (!handle->cur)
553 return -ENOMEM; 492 return -ENOMEM;
554 493
555 error = bio_read_page(start, handle->cur, NULL); 494 error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL);
556 if (error) { 495 if (error) {
557 release_swap_reader(handle); 496 release_swap_reader(handle);
558 return error; 497 return error;
@@ -572,21 +511,28 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
572 offset = handle->cur->entries[handle->k]; 511 offset = handle->cur->entries[handle->k];
573 if (!offset) 512 if (!offset)
574 return -EFAULT; 513 return -EFAULT;
575 error = bio_read_page(offset, buf, bio_chain); 514 error = hib_bio_read_page(offset, buf, bio_chain);
576 if (error) 515 if (error)
577 return error; 516 return error;
578 if (++handle->k >= MAP_PAGE_ENTRIES) { 517 if (++handle->k >= MAP_PAGE_ENTRIES) {
579 error = wait_on_bio_chain(bio_chain); 518 error = hib_wait_on_bio_chain(bio_chain);
580 handle->k = 0; 519 handle->k = 0;
581 offset = handle->cur->next_swap; 520 offset = handle->cur->next_swap;
582 if (!offset) 521 if (!offset)
583 release_swap_reader(handle); 522 release_swap_reader(handle);
584 else if (!error) 523 else if (!error)
585 error = bio_read_page(offset, handle->cur, NULL); 524 error = hib_bio_read_page(offset, handle->cur, NULL);
586 } 525 }
587 return error; 526 return error;
588} 527}
589 528
529static int swap_reader_finish(struct swap_map_handle *handle)
530{
531 release_swap_reader(handle);
532
533 return 0;
534}
535
590/** 536/**
591 * load_image - load the image using the swap map handle 537 * load_image - load the image using the swap map handle
592 * @handle and the snapshot handle @snapshot 538 * @handle and the snapshot handle @snapshot
@@ -614,21 +560,21 @@ static int load_image(struct swap_map_handle *handle,
614 bio = NULL; 560 bio = NULL;
615 do_gettimeofday(&start); 561 do_gettimeofday(&start);
616 for ( ; ; ) { 562 for ( ; ; ) {
617 error = snapshot_write_next(snapshot, PAGE_SIZE); 563 error = snapshot_write_next(snapshot);
618 if (error <= 0) 564 if (error <= 0)
619 break; 565 break;
620 error = swap_read_page(handle, data_of(*snapshot), &bio); 566 error = swap_read_page(handle, data_of(*snapshot), &bio);
621 if (error) 567 if (error)
622 break; 568 break;
623 if (snapshot->sync_read) 569 if (snapshot->sync_read)
624 error = wait_on_bio_chain(&bio); 570 error = hib_wait_on_bio_chain(&bio);
625 if (error) 571 if (error)
626 break; 572 break;
627 if (!(nr_pages % m)) 573 if (!(nr_pages % m))
628 printk("\b\b\b\b%3d%%", nr_pages / m); 574 printk("\b\b\b\b%3d%%", nr_pages / m);
629 nr_pages++; 575 nr_pages++;
630 } 576 }
631 err2 = wait_on_bio_chain(&bio); 577 err2 = hib_wait_on_bio_chain(&bio);
632 do_gettimeofday(&stop); 578 do_gettimeofday(&stop);
633 if (!error) 579 if (!error)
634 error = err2; 580 error = err2;
@@ -656,24 +602,20 @@ int swsusp_read(unsigned int *flags_p)
656 struct snapshot_handle snapshot; 602 struct snapshot_handle snapshot;
657 struct swsusp_info *header; 603 struct swsusp_info *header;
658 604
659 *flags_p = swsusp_header->flags;
660 if (IS_ERR(resume_bdev)) {
661 pr_debug("PM: Image device not initialised\n");
662 return PTR_ERR(resume_bdev);
663 }
664
665 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 605 memset(&snapshot, 0, sizeof(struct snapshot_handle));
666 error = snapshot_write_next(&snapshot, PAGE_SIZE); 606 error = snapshot_write_next(&snapshot);
667 if (error < PAGE_SIZE) 607 if (error < PAGE_SIZE)
668 return error < 0 ? error : -EFAULT; 608 return error < 0 ? error : -EFAULT;
669 header = (struct swsusp_info *)data_of(snapshot); 609 header = (struct swsusp_info *)data_of(snapshot);
670 error = get_swap_reader(&handle, swsusp_header->image); 610 error = get_swap_reader(&handle, flags_p);
611 if (error)
612 goto end;
671 if (!error) 613 if (!error)
672 error = swap_read_page(&handle, header, NULL); 614 error = swap_read_page(&handle, header, NULL);
673 if (!error) 615 if (!error)
674 error = load_image(&handle, &snapshot, header->pages - 1); 616 error = load_image(&handle, &snapshot, header->pages - 1);
675 release_swap_reader(&handle); 617 swap_reader_finish(&handle);
676 618end:
677 if (!error) 619 if (!error)
678 pr_debug("PM: Image successfully loaded\n"); 620 pr_debug("PM: Image successfully loaded\n");
679 else 621 else
@@ -689,11 +631,11 @@ int swsusp_check(void)
689{ 631{
690 int error; 632 int error;
691 633
692 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 634 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
693 if (!IS_ERR(resume_bdev)) { 635 if (!IS_ERR(hib_resume_bdev)) {
694 set_blocksize(resume_bdev, PAGE_SIZE); 636 set_blocksize(hib_resume_bdev, PAGE_SIZE);
695 memset(swsusp_header, 0, PAGE_SIZE); 637 memset(swsusp_header, 0, PAGE_SIZE);
696 error = bio_read_page(swsusp_resume_block, 638 error = hib_bio_read_page(swsusp_resume_block,
697 swsusp_header, NULL); 639 swsusp_header, NULL);
698 if (error) 640 if (error)
699 goto put; 641 goto put;
@@ -701,7 +643,7 @@ int swsusp_check(void)
701 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 643 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
702 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 644 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
703 /* Reset swap signature now */ 645 /* Reset swap signature now */
704 error = bio_write_page(swsusp_resume_block, 646 error = hib_bio_write_page(swsusp_resume_block,
705 swsusp_header, NULL); 647 swsusp_header, NULL);
706 } else { 648 } else {
707 error = -EINVAL; 649 error = -EINVAL;
@@ -709,11 +651,11 @@ int swsusp_check(void)
709 651
710put: 652put:
711 if (error) 653 if (error)
712 blkdev_put(resume_bdev, FMODE_READ); 654 blkdev_put(hib_resume_bdev, FMODE_READ);
713 else 655 else
714 pr_debug("PM: Signature found, resuming\n"); 656 pr_debug("PM: Signature found, resuming\n");
715 } else { 657 } else {
716 error = PTR_ERR(resume_bdev); 658 error = PTR_ERR(hib_resume_bdev);
717 } 659 }
718 660
719 if (error) 661 if (error)
@@ -728,12 +670,12 @@ put:
728 670
729void swsusp_close(fmode_t mode) 671void swsusp_close(fmode_t mode)
730{ 672{
731 if (IS_ERR(resume_bdev)) { 673 if (IS_ERR(hib_resume_bdev)) {
732 pr_debug("PM: Image device not initialised\n"); 674 pr_debug("PM: Image device not initialised\n");
733 return; 675 return;
734 } 676 }
735 677
736 blkdev_put(resume_bdev, mode); 678 blkdev_put(hib_resume_bdev, mode);
737} 679}
738 680
739static int swsusp_header_init(void) 681static int swsusp_header_init(void)
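
The reshuffled swap code keeps the swap map layout described in the comment block above: each map page stores MAP_PAGE_ENTRIES sector numbers plus one link to the next map page, so a single memory page is enough to build or walk the whole chain. A standalone sketch of that arithmetic, with sector_t approximated by uint64_t and a hypothetical image size; nothing here touches real swap.

#include <inttypes.h>
#include <stdio.h>

#define PAGE_SIZE 4096u
typedef uint64_t sector_t;

/* One slot is reserved for the link, exactly as in the kernel structure. */
#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)

struct swap_map_page {
    sector_t entries[MAP_PAGE_ENTRIES];
    sector_t next_swap;
};

int main(void)
{
    unsigned long image_pages = 100000;    /* hypothetical image size */
    unsigned long map_pages =
        (image_pages + MAP_PAGE_ENTRIES - 1) / MAP_PAGE_ENTRIES;

    printf("entries per map page: %zu\n", MAP_PAGE_ENTRIES);
    printf("map pages for %lu data pages: %lu\n", image_pages, map_pages);
    printf("sizeof(struct swap_map_page) = %zu\n",
           sizeof(struct swap_map_page));
    return 0;
}
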
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
deleted file mode 100644
index 5b3601bd1893..000000000000
--- a/kernel/power/swsusp.c
+++ /dev/null
@@ -1,58 +0,0 @@
1/*
2 * linux/kernel/power/swsusp.c
3 *
4 * This file provides code to write suspend image to swap and read it back.
5 *
6 * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8 *
9 * This file is released under the GPLv2.
10 *
11 * I'd like to thank the following people for their work:
12 *
13 * Pavel Machek <pavel@ucw.cz>:
14 * Modifications, defectiveness pointing, being with me at the very beginning,
15 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
16 *
17 * Steve Doddi <dirk@loth.demon.co.uk>:
18 * Support the possibility of hardware state restoring.
19 *
20 * Raph <grey.havens@earthling.net>:
21 * Support for preserving states of network devices and virtual console
22 * (including X and svgatextmode)
23 *
24 * Kurt Garloff <garloff@suse.de>:
25 * Straightened the critical function in order to prevent compilers from
26 * playing tricks with local variables.
27 *
28 * Andreas Mohr <a.mohr@mailto.de>
29 *
30 * Alex Badea <vampire@go.ro>:
31 * Fixed runaway init
32 *
33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Reworked the freeing of memory and the handling of swap
35 *
36 * More state savers are welcome. Especially for the scsi layer...
37 *
38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39 */
40
41#include <linux/mm.h>
42#include <linux/suspend.h>
43#include <linux/spinlock.h>
44#include <linux/kernel.h>
45#include <linux/major.h>
46#include <linux/swap.h>
47#include <linux/pm.h>
48#include <linux/swapops.h>
49#include <linux/bootmem.h>
50#include <linux/syscalls.h>
51#include <linux/highmem.h>
52#include <linux/time.h>
53#include <linux/rbtree.h>
54#include <linux/io.h>
55
56#include "power.h"
57
58int in_suspend __nosavedata = 0;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bf0014d6a5f0..e819e17877ca 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -151,6 +151,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
151{ 151{
152 struct snapshot_data *data; 152 struct snapshot_data *data;
153 ssize_t res; 153 ssize_t res;
154 loff_t pg_offp = *offp & ~PAGE_MASK;
154 155
155 mutex_lock(&pm_mutex); 156 mutex_lock(&pm_mutex);
156 157
@@ -159,14 +160,19 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
159 res = -ENODATA; 160 res = -ENODATA;
160 goto Unlock; 161 goto Unlock;
161 } 162 }
162 res = snapshot_read_next(&data->handle, count); 163 if (!pg_offp) { /* on page boundary? */
163 if (res > 0) { 164 res = snapshot_read_next(&data->handle);
164 if (copy_to_user(buf, data_of(data->handle), res)) 165 if (res <= 0)
165 res = -EFAULT; 166 goto Unlock;
166 else 167 } else {
167 *offp = data->handle.offset; 168 res = PAGE_SIZE - pg_offp;
168 } 169 }
169 170
171 res = simple_read_from_buffer(buf, count, &pg_offp,
172 data_of(data->handle), res);
173 if (res > 0)
174 *offp += res;
175
170 Unlock: 176 Unlock:
171 mutex_unlock(&pm_mutex); 177 mutex_unlock(&pm_mutex);
172 178
@@ -178,23 +184,39 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
178{ 184{
179 struct snapshot_data *data; 185 struct snapshot_data *data;
180 ssize_t res; 186 ssize_t res;
187 loff_t pg_offp = *offp & ~PAGE_MASK;
181 188
182 mutex_lock(&pm_mutex); 189 mutex_lock(&pm_mutex);
183 190
184 data = filp->private_data; 191 data = filp->private_data;
185 res = snapshot_write_next(&data->handle, count); 192
186 if (res > 0) { 193 if (!pg_offp) {
187 if (copy_from_user(data_of(data->handle), buf, res)) 194 res = snapshot_write_next(&data->handle);
188 res = -EFAULT; 195 if (res <= 0)
189 else 196 goto unlock;
190 *offp = data->handle.offset; 197 } else {
198 res = PAGE_SIZE - pg_offp;
191 } 199 }
192 200
201 res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp,
202 buf, count);
203 if (res > 0)
204 *offp += res;
205unlock:
193 mutex_unlock(&pm_mutex); 206 mutex_unlock(&pm_mutex);
194 207
195 return res; 208 return res;
196} 209}
197 210
211static void snapshot_deprecated_ioctl(unsigned int cmd)
212{
213 if (printk_ratelimit())
214 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
215 "be removed soon, update your suspend-to-disk "
216 "utilities\n",
217 __builtin_return_address(0), cmd);
218}
219
198static long snapshot_ioctl(struct file *filp, unsigned int cmd, 220static long snapshot_ioctl(struct file *filp, unsigned int cmd,
199 unsigned long arg) 221 unsigned long arg)
200{ 222{
@@ -246,8 +268,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
246 data->frozen = 0; 268 data->frozen = 0;
247 break; 269 break;
248 270
249 case SNAPSHOT_CREATE_IMAGE:
250 case SNAPSHOT_ATOMIC_SNAPSHOT: 271 case SNAPSHOT_ATOMIC_SNAPSHOT:
272 snapshot_deprecated_ioctl(cmd);
273 case SNAPSHOT_CREATE_IMAGE:
251 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 274 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
252 error = -EPERM; 275 error = -EPERM;
253 break; 276 break;
@@ -275,8 +298,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 data->ready = 0; 298 data->ready = 0;
276 break; 299 break;
277 300
278 case SNAPSHOT_PREF_IMAGE_SIZE:
279 case SNAPSHOT_SET_IMAGE_SIZE: 301 case SNAPSHOT_SET_IMAGE_SIZE:
302 snapshot_deprecated_ioctl(cmd);
303 case SNAPSHOT_PREF_IMAGE_SIZE:
280 image_size = arg; 304 image_size = arg;
281 break; 305 break;
282 306
@@ -290,15 +314,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
290 error = put_user(size, (loff_t __user *)arg); 314 error = put_user(size, (loff_t __user *)arg);
291 break; 315 break;
292 316
293 case SNAPSHOT_AVAIL_SWAP_SIZE:
294 case SNAPSHOT_AVAIL_SWAP: 317 case SNAPSHOT_AVAIL_SWAP:
318 snapshot_deprecated_ioctl(cmd);
319 case SNAPSHOT_AVAIL_SWAP_SIZE:
295 size = count_swap_pages(data->swap, 1); 320 size = count_swap_pages(data->swap, 1);
296 size <<= PAGE_SHIFT; 321 size <<= PAGE_SHIFT;
297 error = put_user(size, (loff_t __user *)arg); 322 error = put_user(size, (loff_t __user *)arg);
298 break; 323 break;
299 324
300 case SNAPSHOT_ALLOC_SWAP_PAGE:
301 case SNAPSHOT_GET_SWAP_PAGE: 325 case SNAPSHOT_GET_SWAP_PAGE:
326 snapshot_deprecated_ioctl(cmd);
327 case SNAPSHOT_ALLOC_SWAP_PAGE:
302 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 328 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
303 error = -ENODEV; 329 error = -ENODEV;
304 break; 330 break;
@@ -321,6 +347,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
321 break; 347 break;
322 348
323 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ 349 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
350 snapshot_deprecated_ioctl(cmd);
324 if (!swsusp_swap_in_use()) { 351 if (!swsusp_swap_in_use()) {
325 /* 352 /*
326 * User space encodes device types as two-byte values, 353 * User space encodes device types as two-byte values,
@@ -362,6 +389,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
362 break; 389 break;
363 390
364 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ 391 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
392 snapshot_deprecated_ioctl(cmd);
365 error = -EINVAL; 393 error = -EINVAL;
366 394
367 switch (arg) { 395 switch (arg) {
@@ -405,7 +433,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
405 * User space encodes device types as two-byte values, 433 * User space encodes device types as two-byte values,
406 * so we need to recode them 434 * so we need to recode them
407 */ 435 */
408 swdev = old_decode_dev(swap_area.dev); 436 swdev = new_decode_dev(swap_area.dev);
409 if (swdev) { 437 if (swdev) {
410 offset = swap_area.offset; 438 offset = swap_area.offset;
411 data->swap = swap_type_of(swdev, offset, NULL); 439 data->swap = swap_type_of(swdev, offset, NULL);
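
snapshot_read() and snapshot_write() above now advance the snapshot handle only on page boundaries and leave the partial-page copying to simple_read_from_buffer()/simple_write_to_buffer(); pg_offp = *offp & ~PAGE_MASK is the offset within the current page. A standalone sketch of that clamping; chunk_for() is an invented helper for illustration only.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/*
 * Illustrative helper: given a file offset and a requested count, return how
 * many bytes may be copied before the next page boundary, the same clamping
 * the patched snapshot_read()/snapshot_write() perform with pg_offp.
 */
static unsigned long chunk_for(unsigned long offp, unsigned long count)
{
    unsigned long pg_offp = offp & ~PAGE_MASK;  /* offset within the page */
    unsigned long room = PAGE_SIZE - pg_offp;

    return count < room ? count : room;
}

int main(void)
{
    /* On a page boundary the full page is available ... */
    printf("%lu\n", chunk_for(2 * PAGE_SIZE, 8192));        /* 4096 */
    /* ... mid-page only the remainder of that page is. */
    printf("%lu\n", chunk_for(2 * PAGE_SIZE + 100, 8192));  /* 3996 */
    return 0;
}
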
diff --git a/kernel/printk.c b/kernel/printk.c
index 1751c456b71f..444b770c9595 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,8 +33,10 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/kdb.h>
36#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h> 38#include <linux/kmsg_dump.h>
39#include <linux/syslog.h>
38 40
39#include <asm/uaccess.h> 41#include <asm/uaccess.h>
40 42
@@ -69,8 +71,6 @@ int console_printk[4] = {
69 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 71 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
70}; 72};
71 73
72static int saved_console_loglevel = -1;
73
74/* 74/*
75 * Low level drivers may need that to know if they can schedule in 75 * Low level drivers may need that to know if they can schedule in
76 * their unblank() callback or not. So let's export it. 76 * their unblank() callback or not. So let's export it.
@@ -145,6 +145,7 @@ static char __log_buf[__LOG_BUF_LEN];
145static char *log_buf = __log_buf; 145static char *log_buf = __log_buf;
146static int log_buf_len = __LOG_BUF_LEN; 146static int log_buf_len = __LOG_BUF_LEN;
147static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 147static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
148static int saved_console_loglevel = -1;
148 149
149#ifdef CONFIG_KEXEC 150#ifdef CONFIG_KEXEC
150/* 151/*
@@ -258,38 +259,23 @@ static inline void boot_delay_msec(void)
258} 259}
259#endif 260#endif
260 261
261/* 262int do_syslog(int type, char __user *buf, int len, bool from_file)
262 * Commands to do_syslog:
263 *
264 * 0 -- Close the log. Currently a NOP.
265 * 1 -- Open the log. Currently a NOP.
266 * 2 -- Read from the log.
267 * 3 -- Read all messages remaining in the ring buffer.
268 * 4 -- Read and clear all messages remaining in the ring buffer
269 * 5 -- Clear ring buffer.
270 * 6 -- Disable printk's to console
271 * 7 -- Enable printk's to console
272 * 8 -- Set level of messages printed to console
273 * 9 -- Return number of unread characters in the log buffer
274 * 10 -- Return size of the log buffer
275 */
276int do_syslog(int type, char __user *buf, int len)
277{ 263{
278 unsigned i, j, limit, count; 264 unsigned i, j, limit, count;
279 int do_clear = 0; 265 int do_clear = 0;
280 char c; 266 char c;
281 int error = 0; 267 int error = 0;
282 268
283 error = security_syslog(type); 269 error = security_syslog(type, from_file);
284 if (error) 270 if (error)
285 return error; 271 return error;
286 272
287 switch (type) { 273 switch (type) {
288 case 0: /* Close log */ 274 case SYSLOG_ACTION_CLOSE: /* Close log */
289 break; 275 break;
290 case 1: /* Open log */ 276 case SYSLOG_ACTION_OPEN: /* Open log */
291 break; 277 break;
292 case 2: /* Read from log */ 278 case SYSLOG_ACTION_READ: /* Read from log */
293 error = -EINVAL; 279 error = -EINVAL;
294 if (!buf || len < 0) 280 if (!buf || len < 0)
295 goto out; 281 goto out;
@@ -320,10 +306,12 @@ int do_syslog(int type, char __user *buf, int len)
320 if (!error) 306 if (!error)
321 error = i; 307 error = i;
322 break; 308 break;
323 case 4: /* Read/clear last kernel messages */ 309 /* Read/clear last kernel messages */
310 case SYSLOG_ACTION_READ_CLEAR:
324 do_clear = 1; 311 do_clear = 1;
325 /* FALL THRU */ 312 /* FALL THRU */
326 case 3: /* Read last kernel messages */ 313 /* Read last kernel messages */
314 case SYSLOG_ACTION_READ_ALL:
327 error = -EINVAL; 315 error = -EINVAL;
328 if (!buf || len < 0) 316 if (!buf || len < 0)
329 goto out; 317 goto out;
@@ -376,21 +364,25 @@ int do_syslog(int type, char __user *buf, int len)
376 } 364 }
377 } 365 }
378 break; 366 break;
379 case 5: /* Clear ring buffer */ 367 /* Clear ring buffer */
368 case SYSLOG_ACTION_CLEAR:
380 logged_chars = 0; 369 logged_chars = 0;
381 break; 370 break;
382 case 6: /* Disable logging to console */ 371 /* Disable logging to console */
372 case SYSLOG_ACTION_CONSOLE_OFF:
383 if (saved_console_loglevel == -1) 373 if (saved_console_loglevel == -1)
384 saved_console_loglevel = console_loglevel; 374 saved_console_loglevel = console_loglevel;
385 console_loglevel = minimum_console_loglevel; 375 console_loglevel = minimum_console_loglevel;
386 break; 376 break;
387 case 7: /* Enable logging to console */ 377 /* Enable logging to console */
378 case SYSLOG_ACTION_CONSOLE_ON:
388 if (saved_console_loglevel != -1) { 379 if (saved_console_loglevel != -1) {
389 console_loglevel = saved_console_loglevel; 380 console_loglevel = saved_console_loglevel;
390 saved_console_loglevel = -1; 381 saved_console_loglevel = -1;
391 } 382 }
392 break; 383 break;
393 case 8: /* Set level of messages printed to console */ 384 /* Set level of messages printed to console */
385 case SYSLOG_ACTION_CONSOLE_LEVEL:
394 error = -EINVAL; 386 error = -EINVAL;
395 if (len < 1 || len > 8) 387 if (len < 1 || len > 8)
396 goto out; 388 goto out;
@@ -401,10 +393,12 @@ int do_syslog(int type, char __user *buf, int len)
401 saved_console_loglevel = -1; 393 saved_console_loglevel = -1;
402 error = 0; 394 error = 0;
403 break; 395 break;
404 case 9: /* Number of chars in the log buffer */ 396 /* Number of chars in the log buffer */
397 case SYSLOG_ACTION_SIZE_UNREAD:
405 error = log_end - log_start; 398 error = log_end - log_start;
406 break; 399 break;
407 case 10: /* Size of the log buffer */ 400 /* Size of the log buffer */
401 case SYSLOG_ACTION_SIZE_BUFFER:
408 error = log_buf_len; 402 error = log_buf_len;
409 break; 403 break;
410 default: 404 default:
@@ -417,9 +411,25 @@ out:
417 411
418SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 412SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
419{ 413{
420 return do_syslog(type, buf, len); 414 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
421} 415}
422 416
417#ifdef CONFIG_KGDB_KDB
418/* kdb dmesg command needs access to the syslog buffer. do_syslog()
419 * uses locks so it cannot be used during debugging. Just tell kdb
420 * where the start and end of the physical and logical logs are. This
421 * is equivalent to do_syslog(3).
422 */
423void kdb_syslog_data(char *syslog_data[4])
424{
425 syslog_data[0] = log_buf;
426 syslog_data[1] = log_buf + log_buf_len;
427 syslog_data[2] = log_buf + log_end -
428 (logged_chars < log_buf_len ? logged_chars : log_buf_len);
429 syslog_data[3] = log_buf + log_end;
430}
431#endif /* CONFIG_KGDB_KDB */
432
423/* 433/*
424 * Call the console drivers on a range of log_buf 434 * Call the console drivers on a range of log_buf
425 */ 435 */
@@ -593,6 +603,14 @@ asmlinkage int printk(const char *fmt, ...)
593 va_list args; 603 va_list args;
594 int r; 604 int r;
595 605
606#ifdef CONFIG_KGDB_KDB
607 if (unlikely(kdb_trap_printk)) {
608 va_start(args, fmt);
609 r = vkdb_printf(fmt, args);
610 va_end(args);
611 return r;
612 }
613#endif
596 va_start(args, fmt); 614 va_start(args, fmt);
597 r = vprintk(fmt, args); 615 r = vprintk(fmt, args);
598 va_end(args); 616 va_end(args);
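
do_syslog() above switches from bare numeric cases to the SYSLOG_ACTION_* constants in <linux/syslog.h>; the numeric values themselves are unchanged. From user space the same commands are reachable through glibc's klogctl(). A short sketch using the traditional numbers directly (10 for buffer size, 3 for read-all) so it builds without kernel headers; reading the log may require appropriate privileges.

#include <stdio.h>
#include <stdlib.h>
#include <sys/klog.h>

int main(void)
{
    int len = klogctl(10 /* SYSLOG_ACTION_SIZE_BUFFER */, NULL, 0);
    char *buf;

    if (len < 0) {
        perror("klogctl");
        return 1;
    }
    buf = malloc(len + 1);
    if (!buf)
        return 1;
    len = klogctl(3 /* SYSLOG_ACTION_READ_ALL */, buf, len);
    if (len < 0) {
        perror("klogctl");
        free(buf);
        return 1;
    }
    buf[len] = '\0';
    printf("%d bytes of kernel log read\n", len);
    free(buf);
    return 0;
}
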
diff --git a/kernel/profile.c b/kernel/profile.c
index a55d3a367ae8..b22a899934cc 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -127,8 +127,10 @@ int __ref profile_init(void)
127 return 0; 127 return 0;
128 128
129 prof_buffer = vmalloc(buffer_bytes); 129 prof_buffer = vmalloc(buffer_bytes);
130 if (prof_buffer) 130 if (prof_buffer) {
131 memset(prof_buffer, 0, buffer_bytes);
131 return 0; 132 return 0;
133 }
132 134
133 free_cpumask_var(prof_cpu_mask); 135 free_cpumask_var(prof_cpu_mask);
134 return -ENOMEM; 136 return -ENOMEM;
@@ -363,14 +365,14 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
363 switch (action) { 365 switch (action) {
364 case CPU_UP_PREPARE: 366 case CPU_UP_PREPARE:
365 case CPU_UP_PREPARE_FROZEN: 367 case CPU_UP_PREPARE_FROZEN:
366 node = cpu_to_node(cpu); 368 node = cpu_to_mem(cpu);
367 per_cpu(cpu_profile_flip, cpu) = 0; 369 per_cpu(cpu_profile_flip, cpu) = 0;
368 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 370 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
369 page = alloc_pages_exact_node(node, 371 page = alloc_pages_exact_node(node,
370 GFP_KERNEL | __GFP_ZERO, 372 GFP_KERNEL | __GFP_ZERO,
371 0); 373 0);
372 if (!page) 374 if (!page)
373 return NOTIFY_BAD; 375 return notifier_from_errno(-ENOMEM);
374 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 376 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
375 } 377 }
376 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 378 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
@@ -386,7 +388,7 @@ out_free:
386 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); 388 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
387 per_cpu(cpu_profile_hits, cpu)[1] = NULL; 389 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
388 __free_page(page); 390 __free_page(page);
389 return NOTIFY_BAD; 391 return notifier_from_errno(-ENOMEM);
390 case CPU_ONLINE: 392 case CPU_ONLINE:
391 case CPU_ONLINE_FROZEN: 393 case CPU_ONLINE_FROZEN:
392 if (prof_cpu_mask != NULL) 394 if (prof_cpu_mask != NULL)
@@ -565,7 +567,7 @@ static int create_hash_tables(void)
565 int cpu; 567 int cpu;
566 568
567 for_each_online_cpu(cpu) { 569 for_each_online_cpu(cpu) {
568 int node = cpu_to_node(cpu); 570 int node = cpu_to_mem(cpu);
569 struct page *page; 571 struct page *page;
570 572
571 page = alloc_pages_exact_node(node, 573 page = alloc_pages_exact_node(node,
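
The profile.c fix above zeroes the vmalloc()ed profile buffer before use, since vmalloc(), unlike the page allocations done with __GFP_ZERO elsewhere in this function, returns uninitialized memory. A trivial user-space analogue of the same class of bug and fix.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    size_t n = 1 << 16;
    unsigned int *hits = malloc(n * sizeof(*hits));   /* uninitialized */

    if (!hits)
        return 1;
    /* Without this, the counters start from garbage, not from zero. */
    memset(hits, 0, n * sizeof(*hits));

    hits[42]++;     /* safe to accumulate now */
    printf("hits[42] = %u\n", hits[42]);
    free(hits);
    return 0;
}
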
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 23bd09cd042e..74a3d693c196 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -14,7 +14,6 @@
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highmem.h> 15#include <linux/highmem.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18#include <linux/ptrace.h> 17#include <linux/ptrace.h>
19#include <linux/security.h> 18#include <linux/security.h>
20#include <linux/signal.h> 19#include <linux/signal.h>
@@ -22,6 +21,7 @@
22#include <linux/pid_namespace.h> 21#include <linux/pid_namespace.h>
23#include <linux/syscalls.h> 22#include <linux/syscalls.h>
24#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/regset.h>
25 25
26 26
27/* 27/*
@@ -75,7 +75,6 @@ void __ptrace_unlink(struct task_struct *child)
75 child->parent = child->real_parent; 75 child->parent = child->real_parent;
76 list_del_init(&child->ptrace_entry); 76 list_del_init(&child->ptrace_entry);
77 77
78 arch_ptrace_untrace(child);
79 if (task_is_traced(child)) 78 if (task_is_traced(child))
80 ptrace_untrace(child); 79 ptrace_untrace(child);
81} 80}
@@ -511,6 +510,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
511 return 0; 510 return 0;
512} 511}
513 512
513#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
514
515static const struct user_regset *
516find_regset(const struct user_regset_view *view, unsigned int type)
517{
518 const struct user_regset *regset;
519 int n;
520
521 for (n = 0; n < view->n; ++n) {
522 regset = view->regsets + n;
523 if (regset->core_note_type == type)
524 return regset;
525 }
526
527 return NULL;
528}
529
530static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
531 struct iovec *kiov)
532{
533 const struct user_regset_view *view = task_user_regset_view(task);
534 const struct user_regset *regset = find_regset(view, type);
535 int regset_no;
536
537 if (!regset || (kiov->iov_len % regset->size) != 0)
538 return -EINVAL;
539
540 regset_no = regset - view->regsets;
541 kiov->iov_len = min(kiov->iov_len,
542 (__kernel_size_t) (regset->n * regset->size));
543
544 if (req == PTRACE_GETREGSET)
545 return copy_regset_to_user(task, view, regset_no, 0,
546 kiov->iov_len, kiov->iov_base);
547 else
548 return copy_regset_from_user(task, view, regset_no, 0,
549 kiov->iov_len, kiov->iov_base);
550}
551
552#endif
553
514int ptrace_request(struct task_struct *child, long request, 554int ptrace_request(struct task_struct *child, long request,
515 long addr, long data) 555 long addr, long data)
516{ 556{
@@ -554,6 +594,32 @@ int ptrace_request(struct task_struct *child, long request,
554 ret = ptrace_detach(child, data); 594 ret = ptrace_detach(child, data);
555 break; 595 break;
556 596
597#ifdef CONFIG_BINFMT_ELF_FDPIC
598 case PTRACE_GETFDPIC: {
599 struct mm_struct *mm = get_task_mm(child);
600 unsigned long tmp = 0;
601
602 ret = -ESRCH;
603 if (!mm)
604 break;
605
606 switch (addr) {
607 case PTRACE_GETFDPIC_EXEC:
608 tmp = mm->context.exec_fdpic_loadmap;
609 break;
610 case PTRACE_GETFDPIC_INTERP:
611 tmp = mm->context.interp_fdpic_loadmap;
612 break;
613 default:
614 break;
615 }
616 mmput(mm);
617
618 ret = put_user(tmp, (unsigned long __user *) data);
619 break;
620 }
621#endif
622
557#ifdef PTRACE_SINGLESTEP 623#ifdef PTRACE_SINGLESTEP
558 case PTRACE_SINGLESTEP: 624 case PTRACE_SINGLESTEP:
559#endif 625#endif
@@ -573,6 +639,26 @@ int ptrace_request(struct task_struct *child, long request,
573 return 0; 639 return 0;
574 return ptrace_resume(child, request, SIGKILL); 640 return ptrace_resume(child, request, SIGKILL);
575 641
642#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
643 case PTRACE_GETREGSET:
644 case PTRACE_SETREGSET:
645 {
646 struct iovec kiov;
647 struct iovec __user *uiov = (struct iovec __user *) data;
648
649 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
650 return -EFAULT;
651
652 if (__get_user(kiov.iov_base, &uiov->iov_base) ||
653 __get_user(kiov.iov_len, &uiov->iov_len))
654 return -EFAULT;
655
656 ret = ptrace_regset(child, request, addr, &kiov);
657 if (!ret)
658 ret = __put_user(kiov.iov_len, &uiov->iov_len);
659 break;
660 }
661#endif
576 default: 662 default:
577 break; 663 break;
578 } 664 }
@@ -604,10 +690,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
604 struct task_struct *child; 690 struct task_struct *child;
605 long ret; 691 long ret;
606 692
607 /*
608 * This lock_kernel fixes a subtle race with suid exec
609 */
610 lock_kernel();
611 if (request == PTRACE_TRACEME) { 693 if (request == PTRACE_TRACEME) {
612 ret = ptrace_traceme(); 694 ret = ptrace_traceme();
613 if (!ret) 695 if (!ret)
@@ -641,7 +723,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
641 out_put_task_struct: 723 out_put_task_struct:
642 put_task_struct(child); 724 put_task_struct(child);
643 out: 725 out:
644 unlock_kernel();
645 return ret; 726 return ret;
646} 727}
647 728
@@ -711,6 +792,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
711 else 792 else
712 ret = ptrace_setsiginfo(child, &siginfo); 793 ret = ptrace_setsiginfo(child, &siginfo);
713 break; 794 break;
795#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
796 case PTRACE_GETREGSET:
797 case PTRACE_SETREGSET:
798 {
799 struct iovec kiov;
800 struct compat_iovec __user *uiov =
801 (struct compat_iovec __user *) datap;
802 compat_uptr_t ptr;
803 compat_size_t len;
804
805 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
806 return -EFAULT;
807
808 if (__get_user(ptr, &uiov->iov_base) ||
809 __get_user(len, &uiov->iov_len))
810 return -EFAULT;
811
812 kiov.iov_base = compat_ptr(ptr);
813 kiov.iov_len = len;
814
815 ret = ptrace_regset(child, request, addr, &kiov);
816 if (!ret)
817 ret = __put_user(kiov.iov_len, &uiov->iov_len);
818 break;
819 }
820#endif
714 821
715 default: 822 default:
716 ret = ptrace_request(child, request, addr, data); 823 ret = ptrace_request(child, request, addr, data);
@@ -725,10 +832,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
725 struct task_struct *child; 832 struct task_struct *child;
726 long ret; 833 long ret;
727 834
728 /*
729 * This lock_kernel fixes a subtle race with suid exec
730 */
731 lock_kernel();
732 if (request == PTRACE_TRACEME) { 835 if (request == PTRACE_TRACEME) {
733 ret = ptrace_traceme(); 836 ret = ptrace_traceme();
734 goto out; 837 goto out;
@@ -758,7 +861,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
758 out_put_task_struct: 861 out_put_task_struct:
759 put_task_struct(child); 862 put_task_struct(child);
760 out: 863 out:
761 unlock_kernel();
762 return ret; 864 return ret;
763} 865}
764#endif /* CONFIG_COMPAT */ 866#endif /* CONFIG_COMPAT */
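
The new PTRACE_GETREGSET/PTRACE_SETREGSET requests take a regset type (an ELF note number such as NT_PRSTATUS) plus a struct iovec describing the caller's buffer, and ptrace_regset() clamps iov_len to the regset size. A user-space sketch of the read side; it assumes x86-64's user_regs_struct and a libc new enough to define PTRACE_GETREGSET.

#include <elf.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/uio.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = fork();

    if (pid == 0) {
        ptrace(PTRACE_TRACEME, 0, NULL, NULL);
        execlp("true", "true", (char *)NULL);
        _exit(127);
    }

    waitpid(pid, NULL, 0);    /* child stops at exec under TRACEME */

    struct user_regs_struct regs;   /* arch-specific; x86-64 assumed */
    struct iovec iov = { .iov_base = &regs, .iov_len = sizeof(regs) };

    if (ptrace(PTRACE_GETREGSET, pid, (void *)(long)NT_PRSTATUS, &iov) == 0)
        printf("got %zu bytes of NT_PRSTATUS\n", iov.iov_len);
    else
        perror("PTRACE_GETREGSET");

    ptrace(PTRACE_CONT, pid, NULL, NULL);
    waitpid(pid, NULL, 0);
    return 0;
}
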
diff --git a/kernel/range.c b/kernel/range.c
new file mode 100644
index 000000000000..74e2e6114927
--- /dev/null
+++ b/kernel/range.c
@@ -0,0 +1,163 @@
1/*
2 * Range add and subtract
3 */
4#include <linux/module.h>
5#include <linux/init.h>
6#include <linux/sort.h>
7
8#include <linux/range.h>
9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{
16 if (start >= end)
17 return nr_range;
18
19 /* Out of slots: */
20 if (nr_range >= az)
21 return nr_range;
22
23 range[nr_range].start = start;
24 range[nr_range].end = end;
25
26 nr_range++;
27
28 return nr_range;
29}
30
31int add_range_with_merge(struct range *range, int az, int nr_range,
32 u64 start, u64 end)
33{
34 int i;
35
36 if (start >= end)
37 return nr_range;
38
39 /* Try to merge it with old one: */
40 for (i = 0; i < nr_range; i++) {
41 u64 final_start, final_end;
42 u64 common_start, common_end;
43
44 if (!range[i].end)
45 continue;
46
47 common_start = max(range[i].start, start);
48 common_end = min(range[i].end, end);
49 if (common_start > common_end)
50 continue;
51
52 final_start = min(range[i].start, start);
53 final_end = max(range[i].end, end);
54
55 range[i].start = final_start;
56 range[i].end = final_end;
57 return nr_range;
58 }
59
60 /* Need to add it: */
61 return add_range(range, az, nr_range, start, end);
62}
63
64void subtract_range(struct range *range, int az, u64 start, u64 end)
65{
66 int i, j;
67
68 if (start >= end)
69 return;
70
71 for (j = 0; j < az; j++) {
72 if (!range[j].end)
73 continue;
74
75 if (start <= range[j].start && end >= range[j].end) {
76 range[j].start = 0;
77 range[j].end = 0;
78 continue;
79 }
80
81 if (start <= range[j].start && end < range[j].end &&
82 range[j].start < end) {
83 range[j].start = end;
84 continue;
85 }
86
87
88 if (start > range[j].start && end >= range[j].end &&
89 range[j].end > start) {
90 range[j].end = start;
91 continue;
92 }
93
94 if (start > range[j].start && end < range[j].end) {
95 /* Find the new spare: */
96 for (i = 0; i < az; i++) {
97 if (range[i].end == 0)
98 break;
99 }
100 if (i < az) {
101 range[i].end = range[j].end;
102 range[i].start = end;
103 } else {
104 printk(KERN_ERR "ran out of slots in ranges\n");
105 }
106 range[j].end = start;
107 continue;
108 }
109 }
110}
111
112static int cmp_range(const void *x1, const void *x2)
113{
114 const struct range *r1 = x1;
115 const struct range *r2 = x2;
116 s64 start1, start2;
117
118 start1 = r1->start;
119 start2 = r2->start;
120
121 return start1 - start2;
122}
123
124int clean_sort_range(struct range *range, int az)
125{
126 int i, j, k = az - 1, nr_range = 0;
127
128 for (i = 0; i < k; i++) {
129 if (range[i].end)
130 continue;
131 for (j = k; j > i; j--) {
132 if (range[j].end) {
133 k = j;
134 break;
135 }
136 }
137 if (j == i)
138 break;
139 range[i].start = range[k].start;
140 range[i].end = range[k].end;
141 range[k].start = 0;
142 range[k].end = 0;
143 k--;
144 }
145 /* count it */
146 for (i = 0; i < az; i++) {
147 if (!range[i].end) {
148 nr_range = i;
149 break;
150 }
151 }
152
153 /* sort them */
154 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
155
156 return nr_range;
157}
158
159void sort_range(struct range *range, int nr_range)
160{
161 /* sort them */
162 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
163}
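
kernel/range.c above treats ranges as half-open [start, end) intervals, uses end == 0 to mark a free slot, and lets subtract_range() split a range in two when the subtracted interval lands in the middle. A self-contained exercise of those semantics; subtract_range() below is a trimmed restatement of the patch's logic, written for illustration rather than reuse.

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; };

static void subtract_range(struct range *r, int az, uint64_t s, uint64_t e)
{
    int i, j;

    for (j = 0; j < az; j++) {
        if (!r[j].end)
            continue;
        if (s <= r[j].start && e >= r[j].end) {
            /* fully covered: drop the range */
            r[j].start = r[j].end = 0;
        } else if (s <= r[j].start && e > r[j].start) {
            /* overlap at the front: clip the head */
            r[j].start = e;
        } else if (e >= r[j].end && s < r[j].end) {
            /* overlap at the back: clip the tail */
            r[j].end = s;
        } else if (s > r[j].start && e < r[j].end) {
            /* hole in the middle: split into two ranges */
            for (i = 0; i < az; i++)
                if (!r[i].end)
                    break;
            if (i < az) {
                r[i].start = e;
                r[i].end = r[j].end;
            }
            r[j].end = s;
        }
    }
}

int main(void)
{
    struct range r[4] = { { 0x1000, 0x9000 } };
    int i;

    subtract_range(r, 4, 0x3000, 0x4000);   /* punch a hole in the middle */
    for (i = 0; i < 4; i++)
        if (r[i].end)
            printf("[%#llx, %#llx)\n",
                   (unsigned long long)r[i].start,
                   (unsigned long long)r[i].end);
    return 0;   /* prints [0x1000, 0x3000) and [0x4000, 0x9000) */
}
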
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 9b7fd4723878..72a8dc9567f5 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,14 +44,54 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/hardirq.h>
47 48
48#ifdef CONFIG_DEBUG_LOCK_ALLOC 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
49static struct lock_class_key rcu_lock_key; 50static struct lock_class_key rcu_lock_key;
50struct lockdep_map rcu_lock_map = 51struct lockdep_map rcu_lock_map =
51 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); 52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
52EXPORT_SYMBOL_GPL(rcu_lock_map); 53EXPORT_SYMBOL_GPL(rcu_lock_map);
54
55static struct lock_class_key rcu_bh_lock_key;
56struct lockdep_map rcu_bh_lock_map =
57 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
58EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
59
60static struct lock_class_key rcu_sched_lock_key;
61struct lockdep_map rcu_sched_lock_map =
62 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
63EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
53#endif 64#endif
54 65
66#ifdef CONFIG_DEBUG_LOCK_ALLOC
67
68int debug_lockdep_rcu_enabled(void)
69{
70 return rcu_scheduler_active && debug_locks &&
71 current->lockdep_recursion == 0;
72}
73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
74
75/**
76 * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
77 *
78 * Check for bottom half being disabled, which covers both the
79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses
80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
81 * will show the situation.
82 *
83 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
84 */
85int rcu_read_lock_bh_held(void)
86{
87 if (!debug_lockdep_rcu_enabled())
88 return 1;
89 return in_softirq();
90}
91EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
92
93#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
94
55/* 95/*
56 * Awaken the corresponding synchronize_rcu() instance now that a 96 * Awaken the corresponding synchronize_rcu() instance now that a
57 * grace period has elapsed. 97 * grace period has elapsed.
@@ -63,3 +103,14 @@ void wakeme_after_rcu(struct rcu_head *head)
63 rcu = container_of(head, struct rcu_synchronize, head); 103 rcu = container_of(head, struct rcu_synchronize, head);
64 complete(&rcu->completion); 104 complete(&rcu->completion);
65} 105}
106
107#ifdef CONFIG_PROVE_RCU
108/*
109 * wrapper function to avoid #include problems.
110 */
111int rcu_my_thread_group_empty(void)
112{
113 return thread_group_empty(current);
114}
115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
116#endif /* #ifdef CONFIG_PROVE_RCU */
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 9f6d9ff2572c..38729d3cd236 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -44,9 +44,9 @@ struct rcu_ctrlblk {
44}; 44};
45 45
46/* Definition for rcupdate control block. */ 46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = { 47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist, 48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist, 49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50}; 50};
51 51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = { 52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
@@ -54,6 +54,11 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
54 .curtail = &rcu_bh_ctrlblk.rcucblist, 54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55}; 55};
56 56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61
57#ifdef CONFIG_NO_HZ 62#ifdef CONFIG_NO_HZ
58 63
59static long rcu_dynticks_nesting = 1; 64static long rcu_dynticks_nesting = 1;
@@ -108,7 +113,8 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
108 */ 113 */
109void rcu_sched_qs(int cpu) 114void rcu_sched_qs(int cpu)
110{ 115{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) 116 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
117 rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ); 118 raise_softirq(RCU_SOFTIRQ);
113} 119}
114 120
@@ -173,7 +179,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
173 */ 179 */
174static void rcu_process_callbacks(struct softirq_action *unused) 180static void rcu_process_callbacks(struct softirq_action *unused)
175{ 181{
176 __rcu_process_callbacks(&rcu_ctrlblk); 182 __rcu_process_callbacks(&rcu_sched_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk); 183 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178} 184}
179 185
@@ -187,7 +193,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
187 * 193 *
188 * Cool, huh? (Due to Josh Triplett.) 194 * Cool, huh? (Due to Josh Triplett.)
189 * 195 *
190 * But we want to make this a static inline later. 196 * But we want to make this a static inline later. The cond_resched()
197 * currently makes this problematic.
191 */ 198 */
192void synchronize_sched(void) 199void synchronize_sched(void)
193{ 200{
@@ -195,12 +202,6 @@ void synchronize_sched(void)
195} 202}
196EXPORT_SYMBOL_GPL(synchronize_sched); 203EXPORT_SYMBOL_GPL(synchronize_sched);
197 204
198void synchronize_rcu_bh(void)
199{
200 synchronize_sched();
201}
202EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/* 205/*
205 * Helper function for call_rcu() and call_rcu_bh(). 206 * Helper function for call_rcu() and call_rcu_bh().
206 */ 207 */
@@ -226,7 +227,7 @@ static void __call_rcu(struct rcu_head *head,
226 */ 227 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 228void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{ 229{
229 __call_rcu(head, func, &rcu_ctrlblk); 230 __call_rcu(head, func, &rcu_sched_ctrlblk);
230} 231}
231EXPORT_SYMBOL_GPL(call_rcu); 232EXPORT_SYMBOL_GPL(call_rcu);
232 233
@@ -244,11 +245,13 @@ void rcu_barrier(void)
244{ 245{
245 struct rcu_synchronize rcu; 246 struct rcu_synchronize rcu;
246 247
248 init_rcu_head_on_stack(&rcu.head);
247 init_completion(&rcu.completion); 249 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */ 250 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu); 251 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */ 252 /* Wait for it. */
251 wait_for_completion(&rcu.completion); 253 wait_for_completion(&rcu.completion);
254 destroy_rcu_head_on_stack(&rcu.head);
252} 255}
253EXPORT_SYMBOL_GPL(rcu_barrier); 256EXPORT_SYMBOL_GPL(rcu_barrier);
254 257
@@ -256,11 +259,13 @@ void rcu_barrier_bh(void)
256{ 259{
257 struct rcu_synchronize rcu; 260 struct rcu_synchronize rcu;
258 261
262 init_rcu_head_on_stack(&rcu.head);
259 init_completion(&rcu.completion); 263 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */ 264 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu); 265 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */ 266 /* Wait for it. */
263 wait_for_completion(&rcu.completion); 267 wait_for_completion(&rcu.completion);
268 destroy_rcu_head_on_stack(&rcu.head);
264} 269}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh); 270EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266 271
@@ -268,11 +273,13 @@ void rcu_barrier_sched(void)
268{ 273{
269 struct rcu_synchronize rcu; 274 struct rcu_synchronize rcu;
270 275
276 init_rcu_head_on_stack(&rcu.head);
271 init_completion(&rcu.completion); 277 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */ 278 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu); 279 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */ 280 /* Wait for it. */
275 wait_for_completion(&rcu.completion); 281 wait_for_completion(&rcu.completion);
282 destroy_rcu_head_on_stack(&rcu.head);
276} 283}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched); 284EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278 285
@@ -280,3 +287,5 @@ void __init rcu_init(void)
280{ 287{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 288 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282} 289}
290
291#include "rcutiny_plugin.h"
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
new file mode 100644
index 000000000000..d223a92bc742
--- /dev/null
+++ b/kernel/rcutiny_plugin.h
@@ -0,0 +1,39 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tiny version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright IBM Corporation, 2009
21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#ifdef CONFIG_DEBUG_LOCK_ALLOC
26
27#include <linux/kernel_stat.h>
28
29/*
30 * During boot, we forgive RCU lockdep issues. After this function is
31 * invoked, we start taking RCU lockdep issues seriously.
32 */
33void rcu_scheduler_starting(void)
34{
35 WARN_ON(nr_context_switches() > 0);
36 rcu_scheduler_active = 1;
37}
38
39#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
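
The rcutiny hunks above bracket every on-stack rcu_head with init_rcu_head_on_stack()/destroy_rcu_head_on_stack() so the debug-objects machinery (CONFIG_DEBUG_OBJECTS_RCU_HEAD) knows the callback head lives on the stack rather than in static or heap storage. A minimal sketch of the same calling pattern, using only stock call_rcu()/completion APIs; struct on_stack_wait, on_stack_wait_cb() and wait_for_grace_period() are illustrative names, not kernel symbols:

#include <linux/kernel.h>
#include <linux/completion.h>
#include <linux/rcupdate.h>

/* Illustrative only: wait for one grace period using an rcu_head on the stack. */
struct on_stack_wait {
	struct rcu_head head;
	struct completion done;
};

static void on_stack_wait_cb(struct rcu_head *head)
{
	struct on_stack_wait *w = container_of(head, struct on_stack_wait, head);

	complete(&w->done);
}

static void wait_for_grace_period(void)
{
	struct on_stack_wait w;

	init_rcu_head_on_stack(&w.head);	/* announce the on-stack head to debug-objects */
	init_completion(&w.done);
	call_rcu(&w.head, on_stack_wait_cb);	/* fires after a full grace period */
	wait_for_completion(&w.done);		/* head cannot still be pending once we return */
	destroy_rcu_head_on_stack(&w.head);	/* must pair with the _on_stack init */
}

Waiting for the completion before destroy_rcu_head_on_stack() is what makes the stack allocation safe; the same ordering appears in each of the rcu_barrier*() hunks above.
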
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9bb52177af02..6535ac8bc6a5 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */
64static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 67static char *torture_type = "rcu"; /* What RCU implementation to torture. */
65 68
66module_param(nreaders, int, 0444); 69module_param(nreaders, int, 0444);
@@ -79,6 +82,12 @@ module_param(stutter, int, 0444);
79MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); 82MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
80module_param(irqreader, int, 0444); 83module_param(irqreader, int, 0444);
81MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 84MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
85module_param(fqs_duration, int, 0444);
86MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)");
87module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
82module_param(torture_type, charp, 0444); 91module_param(torture_type, charp, 0444);
83MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
84 93
@@ -99,6 +108,7 @@ static struct task_struct **reader_tasks;
99static struct task_struct *stats_task; 108static struct task_struct *stats_task;
100static struct task_struct *shuffler_task; 109static struct task_struct *shuffler_task;
101static struct task_struct *stutter_task; 110static struct task_struct *stutter_task;
111static struct task_struct *fqs_task;
102 112
103#define RCU_TORTURE_PIPE_LEN 10 113#define RCU_TORTURE_PIPE_LEN 10
104 114
@@ -263,6 +273,7 @@ struct rcu_torture_ops {
263 void (*deferred_free)(struct rcu_torture *p); 273 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 274 void (*sync)(void);
265 void (*cb_barrier)(void); 275 void (*cb_barrier)(void);
276 void (*fqs)(void);
266 int (*stats)(char *page); 277 int (*stats)(char *page);
267 int irq_capable; 278 int irq_capable;
268 char *name; 279 char *name;
@@ -347,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = {
347 .deferred_free = rcu_torture_deferred_free, 358 .deferred_free = rcu_torture_deferred_free,
348 .sync = synchronize_rcu, 359 .sync = synchronize_rcu,
349 .cb_barrier = rcu_barrier, 360 .cb_barrier = rcu_barrier,
361 .fqs = rcu_force_quiescent_state,
350 .stats = NULL, 362 .stats = NULL,
351 .irq_capable = 1, 363 .irq_capable = 1,
352 .name = "rcu" 364 .name = "rcu"
@@ -388,6 +400,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
388 .deferred_free = rcu_sync_torture_deferred_free, 400 .deferred_free = rcu_sync_torture_deferred_free,
389 .sync = synchronize_rcu, 401 .sync = synchronize_rcu,
390 .cb_barrier = NULL, 402 .cb_barrier = NULL,
403 .fqs = rcu_force_quiescent_state,
391 .stats = NULL, 404 .stats = NULL,
392 .irq_capable = 1, 405 .irq_capable = 1,
393 .name = "rcu_sync" 406 .name = "rcu_sync"
@@ -403,6 +416,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
403 .deferred_free = rcu_sync_torture_deferred_free, 416 .deferred_free = rcu_sync_torture_deferred_free,
404 .sync = synchronize_rcu_expedited, 417 .sync = synchronize_rcu_expedited,
405 .cb_barrier = NULL, 418 .cb_barrier = NULL,
419 .fqs = rcu_force_quiescent_state,
406 .stats = NULL, 420 .stats = NULL,
407 .irq_capable = 1, 421 .irq_capable = 1,
408 .name = "rcu_expedited" 422 .name = "rcu_expedited"
@@ -450,9 +464,11 @@ static void rcu_bh_torture_synchronize(void)
450{ 464{
451 struct rcu_bh_torture_synchronize rcu; 465 struct rcu_bh_torture_synchronize rcu;
452 466
467 init_rcu_head_on_stack(&rcu.head);
453 init_completion(&rcu.completion); 468 init_completion(&rcu.completion);
454 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); 469 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
455 wait_for_completion(&rcu.completion); 470 wait_for_completion(&rcu.completion);
471 destroy_rcu_head_on_stack(&rcu.head);
456} 472}
457 473
458static struct rcu_torture_ops rcu_bh_ops = { 474static struct rcu_torture_ops rcu_bh_ops = {
@@ -465,6 +481,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
465 .deferred_free = rcu_bh_torture_deferred_free, 481 .deferred_free = rcu_bh_torture_deferred_free,
466 .sync = rcu_bh_torture_synchronize, 482 .sync = rcu_bh_torture_synchronize,
467 .cb_barrier = rcu_barrier_bh, 483 .cb_barrier = rcu_barrier_bh,
484 .fqs = rcu_bh_force_quiescent_state,
468 .stats = NULL, 485 .stats = NULL,
469 .irq_capable = 1, 486 .irq_capable = 1,
470 .name = "rcu_bh" 487 .name = "rcu_bh"
@@ -480,6 +497,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
480 .deferred_free = rcu_sync_torture_deferred_free, 497 .deferred_free = rcu_sync_torture_deferred_free,
481 .sync = rcu_bh_torture_synchronize, 498 .sync = rcu_bh_torture_synchronize,
482 .cb_barrier = NULL, 499 .cb_barrier = NULL,
500 .fqs = rcu_bh_force_quiescent_state,
483 .stats = NULL, 501 .stats = NULL,
484 .irq_capable = 1, 502 .irq_capable = 1,
485 .name = "rcu_bh_sync" 503 .name = "rcu_bh_sync"
@@ -621,6 +639,7 @@ static struct rcu_torture_ops sched_ops = {
621 .deferred_free = rcu_sched_torture_deferred_free, 639 .deferred_free = rcu_sched_torture_deferred_free,
622 .sync = sched_torture_synchronize, 640 .sync = sched_torture_synchronize,
623 .cb_barrier = rcu_barrier_sched, 641 .cb_barrier = rcu_barrier_sched,
642 .fqs = rcu_sched_force_quiescent_state,
624 .stats = NULL, 643 .stats = NULL,
625 .irq_capable = 1, 644 .irq_capable = 1,
626 .name = "sched" 645 .name = "sched"
@@ -636,6 +655,7 @@ static struct rcu_torture_ops sched_sync_ops = {
636 .deferred_free = rcu_sync_torture_deferred_free, 655 .deferred_free = rcu_sync_torture_deferred_free,
637 .sync = sched_torture_synchronize, 656 .sync = sched_torture_synchronize,
638 .cb_barrier = NULL, 657 .cb_barrier = NULL,
658 .fqs = rcu_sched_force_quiescent_state,
639 .stats = NULL, 659 .stats = NULL,
640 .name = "sched_sync" 660 .name = "sched_sync"
641}; 661};
@@ -650,12 +670,45 @@ static struct rcu_torture_ops sched_expedited_ops = {
650 .deferred_free = rcu_sync_torture_deferred_free, 670 .deferred_free = rcu_sync_torture_deferred_free,
651 .sync = synchronize_sched_expedited, 671 .sync = synchronize_sched_expedited,
652 .cb_barrier = NULL, 672 .cb_barrier = NULL,
653 .stats = rcu_expedited_torture_stats, 673 .fqs = rcu_sched_force_quiescent_state,
674 .stats = NULL,
654 .irq_capable = 1, 675 .irq_capable = 1,
655 .name = "sched_expedited" 676 .name = "sched_expedited"
656}; 677};
657 678
658/* 679/*
680 * RCU torture force-quiescent-state kthread. Repeatedly induces
681 * bursts of calls to force_quiescent_state(), increasing the probability
682 * of occurrence of some important types of race conditions.
683 */
684static int
685rcu_torture_fqs(void *arg)
686{
687 unsigned long fqs_resume_time;
688 int fqs_burst_remaining;
689
690 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
691 do {
692 fqs_resume_time = jiffies + fqs_stutter * HZ;
693 while (jiffies - fqs_resume_time > LONG_MAX) {
694 schedule_timeout_interruptible(1);
695 }
696 fqs_burst_remaining = fqs_duration;
697 while (fqs_burst_remaining > 0) {
698 cur_ops->fqs();
699 udelay(fqs_holdoff);
700 fqs_burst_remaining -= fqs_holdoff;
701 }
702 rcu_stutter_wait("rcu_torture_fqs");
703 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
704 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
705 rcutorture_shutdown_absorb("rcu_torture_fqs");
706 while (!kthread_should_stop())
707 schedule_timeout_uninterruptible(1);
708 return 0;
709}
710
711/*
659 * RCU torture writer kthread. Repeatedly substitutes a new structure 712 * RCU torture writer kthread. Repeatedly substitutes a new structure
660 * for that pointed to by rcu_torture_current, freeing the old structure 713 * for that pointed to by rcu_torture_current, freeing the old structure
661 * after a series of grace periods (the "pipeline"). 714 * after a series of grace periods (the "pipeline").
@@ -745,7 +798,11 @@ static void rcu_torture_timer(unsigned long unused)
745 798
746 idx = cur_ops->readlock(); 799 idx = cur_ops->readlock();
747 completed = cur_ops->completed(); 800 completed = cur_ops->completed();
748 p = rcu_dereference(rcu_torture_current); 801 p = rcu_dereference_check(rcu_torture_current,
802 rcu_read_lock_held() ||
803 rcu_read_lock_bh_held() ||
804 rcu_read_lock_sched_held() ||
805 srcu_read_lock_held(&srcu_ctl));
749 if (p == NULL) { 806 if (p == NULL) {
750 /* Leave because rcu_torture_writer is not yet underway */ 807 /* Leave because rcu_torture_writer is not yet underway */
751 cur_ops->readunlock(idx); 808 cur_ops->readunlock(idx);
@@ -763,13 +820,13 @@ static void rcu_torture_timer(unsigned long unused)
763 /* Should not happen, but... */ 820 /* Should not happen, but... */
764 pipe_count = RCU_TORTURE_PIPE_LEN; 821 pipe_count = RCU_TORTURE_PIPE_LEN;
765 } 822 }
766 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 823 __this_cpu_inc(rcu_torture_count[pipe_count]);
767 completed = cur_ops->completed() - completed; 824 completed = cur_ops->completed() - completed;
768 if (completed > RCU_TORTURE_PIPE_LEN) { 825 if (completed > RCU_TORTURE_PIPE_LEN) {
769 /* Should not happen, but... */ 826 /* Should not happen, but... */
770 completed = RCU_TORTURE_PIPE_LEN; 827 completed = RCU_TORTURE_PIPE_LEN;
771 } 828 }
772 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 829 __this_cpu_inc(rcu_torture_batch[completed]);
773 preempt_enable(); 830 preempt_enable();
774 cur_ops->readunlock(idx); 831 cur_ops->readunlock(idx);
775} 832}
@@ -798,11 +855,15 @@ rcu_torture_reader(void *arg)
798 do { 855 do {
799 if (irqreader && cur_ops->irq_capable) { 856 if (irqreader && cur_ops->irq_capable) {
800 if (!timer_pending(&t)) 857 if (!timer_pending(&t))
801 mod_timer(&t, 1); 858 mod_timer(&t, jiffies + 1);
802 } 859 }
803 idx = cur_ops->readlock(); 860 idx = cur_ops->readlock();
804 completed = cur_ops->completed(); 861 completed = cur_ops->completed();
805 p = rcu_dereference(rcu_torture_current); 862 p = rcu_dereference_check(rcu_torture_current,
863 rcu_read_lock_held() ||
864 rcu_read_lock_bh_held() ||
865 rcu_read_lock_sched_held() ||
866 srcu_read_lock_held(&srcu_ctl));
806 if (p == NULL) { 867 if (p == NULL) {
807 /* Wait for rcu_torture_writer to get underway */ 868 /* Wait for rcu_torture_writer to get underway */
808 cur_ops->readunlock(idx); 869 cur_ops->readunlock(idx);
@@ -818,13 +879,13 @@ rcu_torture_reader(void *arg)
818 /* Should not happen, but... */ 879 /* Should not happen, but... */
819 pipe_count = RCU_TORTURE_PIPE_LEN; 880 pipe_count = RCU_TORTURE_PIPE_LEN;
820 } 881 }
821 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 882 __this_cpu_inc(rcu_torture_count[pipe_count]);
822 completed = cur_ops->completed() - completed; 883 completed = cur_ops->completed() - completed;
823 if (completed > RCU_TORTURE_PIPE_LEN) { 884 if (completed > RCU_TORTURE_PIPE_LEN) {
824 /* Should not happen, but... */ 885 /* Should not happen, but... */
825 completed = RCU_TORTURE_PIPE_LEN; 886 completed = RCU_TORTURE_PIPE_LEN;
826 } 887 }
827 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 888 __this_cpu_inc(rcu_torture_batch[completed]);
828 preempt_enable(); 889 preempt_enable();
829 cur_ops->readunlock(idx); 890 cur_ops->readunlock(idx);
830 schedule(); 891 schedule();
@@ -1030,10 +1091,11 @@ rcu_torture_print_module_parms(char *tag)
1030 printk(KERN_ALERT "%s" TORTURE_FLAG 1091 printk(KERN_ALERT "%s" TORTURE_FLAG
1031 "--- %s: nreaders=%d nfakewriters=%d " 1092 "--- %s: nreaders=%d nfakewriters=%d "
1032 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1093 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1033 "shuffle_interval=%d stutter=%d irqreader=%d\n", 1094 "shuffle_interval=%d stutter=%d irqreader=%d "
1095 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
1034 torture_type, tag, nrealreaders, nfakewriters, 1096 torture_type, tag, nrealreaders, nfakewriters,
1035 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1097 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1036 stutter, irqreader); 1098 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
1037} 1099}
1038 1100
1039static struct notifier_block rcutorture_nb = { 1101static struct notifier_block rcutorture_nb = {
@@ -1109,6 +1171,12 @@ rcu_torture_cleanup(void)
1109 } 1171 }
1110 stats_task = NULL; 1172 stats_task = NULL;
1111 1173
1174 if (fqs_task) {
1175 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
1176 kthread_stop(fqs_task);
1177 }
1178 fqs_task = NULL;
1179
1112 /* Wait for all RCU callbacks to fire. */ 1180 /* Wait for all RCU callbacks to fire. */
1113 1181
1114 if (cur_ops->cb_barrier != NULL) 1182 if (cur_ops->cb_barrier != NULL)
@@ -1154,6 +1222,11 @@ rcu_torture_init(void)
1154 mutex_unlock(&fullstop_mutex); 1222 mutex_unlock(&fullstop_mutex);
1155 return -EINVAL; 1223 return -EINVAL;
1156 } 1224 }
1225 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1226 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
1227 "fqs_duration, fqs disabled.\n");
1228 fqs_duration = 0;
1229 }
1157 if (cur_ops->init) 1230 if (cur_ops->init)
1158 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1231 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
1159 1232
@@ -1282,6 +1355,19 @@ rcu_torture_init(void)
1282 goto unwind; 1355 goto unwind;
1283 } 1356 }
1284 } 1357 }
1358 if (fqs_duration < 0)
1359 fqs_duration = 0;
1360 if (fqs_duration) {
1361 /* Create the fqs thread */
1362 fqs_task = kthread_run(rcu_torture_fqs, NULL,
1363 "rcu_torture_fqs");
1364 if (IS_ERR(fqs_task)) {
1365 firsterr = PTR_ERR(fqs_task);
1366 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
1367 fqs_task = NULL;
1368 goto unwind;
1369 }
1370 }
1285 register_reboot_notifier(&rcutorture_nb); 1371 register_reboot_notifier(&rcutorture_nb);
1286 mutex_unlock(&fullstop_mutex); 1372 mutex_unlock(&fullstop_mutex);
1287 return 0; 1373 return 0;
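
rcu_torture_fqs() above is a standard torture kthread: sleep for fqs_stutter seconds, call cur_ops->fqs() back to back for roughly fqs_duration microseconds with udelay(fqs_holdoff) between calls, and exit only once kthread_stop() is observed. A stripped-down sketch of that create/burst/stop life cycle as a standalone module; burst_thread(), do_burst() and the burst_* parameters are illustrative stand-ins for the rcutorture plumbing, not kernel symbols:

#include <linux/delay.h>
#include <linux/err.h>
#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>

static int burst_period = 3;	/* seconds between bursts (cf. fqs_stutter) */
static int burst_len = 100;	/* burst length in microseconds (cf. fqs_duration) */
static int burst_hold = 10;	/* microseconds between calls (cf. fqs_holdoff) */
module_param(burst_period, int, 0444);
module_param(burst_len, int, 0444);
module_param(burst_hold, int, 0444);

static struct task_struct *burst_task;

static void do_burst(void)	/* stand-in for cur_ops->fqs() */
{
}

static int burst_thread(void *unused)
{
	int hold = burst_hold > 0 ? burst_hold : 1;	/* avoid a zero-step loop */
	int remaining;

	do {
		/* Sleep until the next burst is due; kthread_stop() wakes us early. */
		schedule_timeout_interruptible(burst_period * HZ);
		/* Hammer the callback for roughly burst_len microseconds. */
		for (remaining = burst_len; remaining > 0; remaining -= hold) {
			do_burst();
			udelay(hold);
		}
	} while (!kthread_should_stop());
	return 0;
}

static int __init burst_init(void)
{
	burst_task = kthread_run(burst_thread, NULL, "burst_thread");
	return IS_ERR(burst_task) ? PTR_ERR(burst_task) : 0;
}

static void __exit burst_exit(void)
{
	kthread_stop(burst_task);	/* wakes the thread and waits for it to exit */
}

module_init(burst_init);
module_exit(burst_exit);
MODULE_LICENSE("GPL");
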
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 53ae9598f798..d4437345706f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -54,8 +54,8 @@
54 54
55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
56 56
57#define RCU_STATE_INITIALIZER(name) { \ 57#define RCU_STATE_INITIALIZER(structname) { \
58 .level = { &name.node[0] }, \ 58 .level = { &structname.node[0] }, \
59 .levelcnt = { \ 59 .levelcnt = { \
60 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 60 NUM_RCU_LVL_0, /* root of hierarchy. */ \
61 NUM_RCU_LVL_1, \ 61 NUM_RCU_LVL_1, \
@@ -66,13 +66,14 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
66 .signaled = RCU_GP_IDLE, \ 66 .signaled = RCU_GP_IDLE, \
67 .gpnum = -300, \ 67 .gpnum = -300, \
68 .completed = -300, \ 68 .completed = -300, \
69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
70 .orphan_cbs_list = NULL, \ 70 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &name.orphan_cbs_list, \ 71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
72 .orphan_qlen = 0, \ 72 .orphan_qlen = 0, \
73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ 73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
74 .n_force_qs = 0, \ 74 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
76 .name = #structname, \
76} 77}
77 78
78struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); 79struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
@@ -81,8 +82,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
83 84
84static int rcu_scheduler_active __read_mostly; 85int rcu_scheduler_active __read_mostly;
85 86EXPORT_SYMBOL_GPL(rcu_scheduler_active);
86 87
87/* 88/*
88 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 89 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
@@ -101,25 +102,32 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
101 */ 102 */
102void rcu_sched_qs(int cpu) 103void rcu_sched_qs(int cpu)
103{ 104{
104 struct rcu_data *rdp; 105 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
105 106
106 rdp = &per_cpu(rcu_sched_data, cpu);
107 rdp->passed_quiesc_completed = rdp->gpnum - 1; 107 rdp->passed_quiesc_completed = rdp->gpnum - 1;
108 barrier(); 108 barrier();
109 rdp->passed_quiesc = 1; 109 rdp->passed_quiesc = 1;
110 rcu_preempt_note_context_switch(cpu);
111} 110}
112 111
113void rcu_bh_qs(int cpu) 112void rcu_bh_qs(int cpu)
114{ 113{
115 struct rcu_data *rdp; 114 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
116 115
117 rdp = &per_cpu(rcu_bh_data, cpu);
118 rdp->passed_quiesc_completed = rdp->gpnum - 1; 116 rdp->passed_quiesc_completed = rdp->gpnum - 1;
119 barrier(); 117 barrier();
120 rdp->passed_quiesc = 1; 118 rdp->passed_quiesc = 1;
121} 119}
122 120
121/*
122 * Note a context switch. This is a quiescent state for RCU-sched,
123 * and requires special handling for preemptible RCU.
124 */
125void rcu_note_context_switch(int cpu)
126{
127 rcu_sched_qs(cpu);
128 rcu_preempt_note_context_switch(cpu);
129}
130
123#ifdef CONFIG_NO_HZ 131#ifdef CONFIG_NO_HZ
124DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 132DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
125 .dynticks_nesting = 1, 133 .dynticks_nesting = 1,
@@ -157,6 +165,24 @@ long rcu_batches_completed_bh(void)
157EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 165EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
158 166
159/* 167/*
168 * Force a quiescent state for RCU BH.
169 */
170void rcu_bh_force_quiescent_state(void)
171{
172 force_quiescent_state(&rcu_bh_state, 0);
173}
174EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
175
176/*
177 * Force a quiescent state for RCU-sched.
178 */
179void rcu_sched_force_quiescent_state(void)
180{
181 force_quiescent_state(&rcu_sched_state, 0);
182}
183EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
184
185/*
160 * Does the CPU have callbacks ready to be invoked? 186 * Does the CPU have callbacks ready to be invoked?
161 */ 187 */
162static int 188static int
@@ -424,6 +450,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
424 450
425#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
426 452
453int rcu_cpu_stall_panicking __read_mostly;
454
427static void record_gp_stall_check_time(struct rcu_state *rsp) 455static void record_gp_stall_check_time(struct rcu_state *rsp)
428{ 456{
429 rsp->gp_start = jiffies; 457 rsp->gp_start = jiffies;
@@ -439,10 +467,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
439 467
440 /* Only let one CPU complain about others per time interval. */ 468 /* Only let one CPU complain about others per time interval. */
441 469
442 spin_lock_irqsave(&rnp->lock, flags); 470 raw_spin_lock_irqsave(&rnp->lock, flags);
443 delta = jiffies - rsp->jiffies_stall; 471 delta = jiffies - rsp->jiffies_stall;
444 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 472 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
445 spin_unlock_irqrestore(&rnp->lock, flags); 473 raw_spin_unlock_irqrestore(&rnp->lock, flags);
446 return; 474 return;
447 } 475 }
448 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 476 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
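
Throughout this file the diff converts ->lock, ->onofflock and ->fqslock accesses from spin_lock_*() to raw_spin_lock_*(). The raw_ variants remain true busy-wait spinlocks even where the -rt patch set turns ordinary spinlocks into sleeping locks; in a mainline build the two behave identically. A minimal sketch of declaring and using such a lock; my_raw_lock and bump_counter() are illustrative names, not kernel symbols:

#include <linux/spinlock.h>

/* Illustrative only: a lock that must never become a sleeping lock. */
static DEFINE_RAW_SPINLOCK(my_raw_lock);
static unsigned long protected_counter;

static void bump_counter(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&my_raw_lock, flags);
	protected_counter++;			/* non-sleeping critical section */
	raw_spin_unlock_irqrestore(&my_raw_lock, flags);
}
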
@@ -452,23 +480,30 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
452 * due to CPU offlining. 480 * due to CPU offlining.
453 */ 481 */
454 rcu_print_task_stall(rnp); 482 rcu_print_task_stall(rnp);
455 spin_unlock_irqrestore(&rnp->lock, flags); 483 raw_spin_unlock_irqrestore(&rnp->lock, flags);
456 484
457 /* OK, time to rat on our buddy... */ 485 /* OK, time to rat on our buddy... */
458 486
459 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name);
460 rcu_for_each_leaf_node(rsp, rnp) { 489 rcu_for_each_leaf_node(rsp, rnp) {
490 raw_spin_lock_irqsave(&rnp->lock, flags);
461 rcu_print_task_stall(rnp); 491 rcu_print_task_stall(rnp);
492 raw_spin_unlock_irqrestore(&rnp->lock, flags);
462 if (rnp->qsmask == 0) 493 if (rnp->qsmask == 0)
463 continue; 494 continue;
464 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 495 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
465 if (rnp->qsmask & (1UL << cpu)) 496 if (rnp->qsmask & (1UL << cpu))
466 printk(" %d", rnp->grplo + cpu); 497 printk(" %d", rnp->grplo + cpu);
467 } 498 }
468 printk(" (detected by %d, t=%ld jiffies)\n", 499 printk("} (detected by %d, t=%ld jiffies)\n",
469 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 500 smp_processor_id(), (long)(jiffies - rsp->gp_start));
470 trigger_all_cpu_backtrace(); 501 trigger_all_cpu_backtrace();
471 502
503 /* If so configured, complain about tasks blocking the grace period. */
504
505 rcu_print_detail_task_stall(rsp);
506
472 force_quiescent_state(rsp, 0); /* Kick them all. */ 507 force_quiescent_state(rsp, 0); /* Kick them all. */
473} 508}
474 509
@@ -477,15 +512,15 @@ static void print_cpu_stall(struct rcu_state *rsp)
477 unsigned long flags; 512 unsigned long flags;
478 struct rcu_node *rnp = rcu_get_root(rsp); 513 struct rcu_node *rnp = rcu_get_root(rsp);
479 514
480 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", 515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
481 smp_processor_id(), jiffies - rsp->gp_start); 516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
482 trigger_all_cpu_backtrace(); 517 trigger_all_cpu_backtrace();
483 518
484 spin_lock_irqsave(&rnp->lock, flags); 519 raw_spin_lock_irqsave(&rnp->lock, flags);
485 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 520 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
486 rsp->jiffies_stall = 521 rsp->jiffies_stall =
487 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 522 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
488 spin_unlock_irqrestore(&rnp->lock, flags); 523 raw_spin_unlock_irqrestore(&rnp->lock, flags);
489 524
490 set_need_resched(); /* kick ourselves to get things going. */ 525 set_need_resched(); /* kick ourselves to get things going. */
491} 526}
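
Several hunks in this file also replace signed-difference time tests such as (long)(jiffies - rsp->jiffies_stall) >= 0 with ULONG_CMP_GE()/ULONG_CMP_LT(). Both forms compare free-running unsigned counters in a wraparound-safe way. The sketch below is plain user-space C; the macro definitions are written here to match the intent of the kernel helpers rather than copied from this diff:

#include <limits.h>
#include <stdio.h>

/*
 * Wraparound-safe comparisons for free-running unsigned counters:
 * "a >= b" holds when the forward distance from b to a is less than
 * half the counter space.
 */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long deadline = ULONG_MAX - 5;	/* counter about to wrap */
	unsigned long now = deadline + 10;	/* ten ticks later, wrapped around to 4 */

	printf("naive now >= deadline:       %d\n", now >= deadline);		/* 0: fooled by the wrap */
	printf("ULONG_CMP_GE(now, deadline): %d\n", ULONG_CMP_GE(now, deadline));	/* 1 */
	printf("ULONG_CMP_LT(deadline, now): %d\n", ULONG_CMP_LT(deadline, now));	/* 1 */
	return 0;
}
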
@@ -495,6 +530,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
495 long delta; 530 long delta;
496 struct rcu_node *rnp; 531 struct rcu_node *rnp;
497 532
533 if (rcu_cpu_stall_panicking)
534 return;
498 delta = jiffies - rsp->jiffies_stall; 535 delta = jiffies - rsp->jiffies_stall;
499 rnp = rdp->mynode; 536 rnp = rdp->mynode;
500 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
@@ -509,6 +546,21 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
509 } 546 }
510} 547}
511 548
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{
551 rcu_cpu_stall_panicking = 1;
552 return NOTIFY_DONE;
553}
554
555static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic,
557};
558
559static void __init check_cpu_stall_init(void)
560{
561 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
562}
563
512#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 564#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
513 565
514static void record_gp_stall_check_time(struct rcu_state *rsp) 566static void record_gp_stall_check_time(struct rcu_state *rsp)
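
check_cpu_stall_init() above hooks the panic notifier chain so that rcu_panic() can set rcu_cpu_stall_panicking and keep later stall warnings from scribbling over the panic output. The same chain is available to modules; a hedged sketch of the matching register/unregister pair (quiesce_on_panic() and the module wrapper are illustrative names, not kernel code):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

static int panicking;	/* checked by later diagnostics so they stay quiet */

static int quiesce_on_panic(struct notifier_block *nb, unsigned long ev, void *ptr)
{
	panicking = 1;
	return NOTIFY_DONE;
}

static struct notifier_block quiesce_nb = {
	.notifier_call = quiesce_on_panic,
};

static int __init quiesce_init(void)
{
	return atomic_notifier_chain_register(&panic_notifier_list, &quiesce_nb);
}

static void __exit quiesce_exit(void)
{
	atomic_notifier_chain_unregister(&panic_notifier_list, &quiesce_nb);
}

module_init(quiesce_init);
module_exit(quiesce_exit);
MODULE_LICENSE("GPL");

Built-in code such as the stall detector never unregisters, which is why the hunk above only needs the __init registration.
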
@@ -519,6 +571,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
519{ 571{
520} 572}
521 573
574static void __init check_cpu_stall_init(void)
575{
576}
577
522#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 578#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
523 579
524/* 580/*
@@ -545,12 +601,12 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
545 local_irq_save(flags); 601 local_irq_save(flags);
546 rnp = rdp->mynode; 602 rnp = rdp->mynode;
547 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ 603 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
548 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 604 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
549 local_irq_restore(flags); 605 local_irq_restore(flags);
550 return; 606 return;
551 } 607 }
552 __note_new_gpnum(rsp, rnp, rdp); 608 __note_new_gpnum(rsp, rnp, rdp);
553 spin_unlock_irqrestore(&rnp->lock, flags); 609 raw_spin_unlock_irqrestore(&rnp->lock, flags);
554} 610}
555 611
556/* 612/*
@@ -609,12 +665,12 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
609 local_irq_save(flags); 665 local_irq_save(flags);
610 rnp = rdp->mynode; 666 rnp = rdp->mynode;
611 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ 667 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
612 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 668 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
613 local_irq_restore(flags); 669 local_irq_restore(flags);
614 return; 670 return;
615 } 671 }
616 __rcu_process_gp_end(rsp, rnp, rdp); 672 __rcu_process_gp_end(rsp, rnp, rdp);
617 spin_unlock_irqrestore(&rnp->lock, flags); 673 raw_spin_unlock_irqrestore(&rnp->lock, flags);
618} 674}
619 675
620/* 676/*
@@ -659,12 +715,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
659 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 715 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
660 struct rcu_node *rnp = rcu_get_root(rsp); 716 struct rcu_node *rnp = rcu_get_root(rsp);
661 717
662 if (!cpu_needs_another_gp(rsp, rdp)) { 718 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
719 if (cpu_needs_another_gp(rsp, rdp))
720 rsp->fqs_need_gp = 1;
663 if (rnp->completed == rsp->completed) { 721 if (rnp->completed == rsp->completed) {
664 spin_unlock_irqrestore(&rnp->lock, flags); 722 raw_spin_unlock_irqrestore(&rnp->lock, flags);
665 return; 723 return;
666 } 724 }
667 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 725 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
668 726
669 /* 727 /*
670 * Propagate new ->completed value to rcu_node structures 728 * Propagate new ->completed value to rcu_node structures
@@ -672,9 +730,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
672 * of the next grace period to process their callbacks. 730 * of the next grace period to process their callbacks.
673 */ 731 */
674 rcu_for_each_node_breadth_first(rsp, rnp) { 732 rcu_for_each_node_breadth_first(rsp, rnp) {
675 spin_lock(&rnp->lock); /* irqs already disabled. */ 733 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
676 rnp->completed = rsp->completed; 734 rnp->completed = rsp->completed;
677 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 735 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
678 } 736 }
679 local_irq_restore(flags); 737 local_irq_restore(flags);
680 return; 738 return;
@@ -695,15 +753,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
695 rnp->completed = rsp->completed; 753 rnp->completed = rsp->completed;
696 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 754 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
697 rcu_start_gp_per_cpu(rsp, rnp, rdp); 755 rcu_start_gp_per_cpu(rsp, rnp, rdp);
698 spin_unlock_irqrestore(&rnp->lock, flags); 756 raw_spin_unlock_irqrestore(&rnp->lock, flags);
699 return; 757 return;
700 } 758 }
701 759
702 spin_unlock(&rnp->lock); /* leave irqs disabled. */ 760 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
703 761
704 762
705 /* Exclude any concurrent CPU-hotplug operations. */ 763 /* Exclude any concurrent CPU-hotplug operations. */
706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 764 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
707 765
708 /* 766 /*
709 * Set the quiescent-state-needed bits in all the rcu_node 767 * Set the quiescent-state-needed bits in all the rcu_node
@@ -723,21 +781,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
723 * irqs disabled. 781 * irqs disabled.
724 */ 782 */
725 rcu_for_each_node_breadth_first(rsp, rnp) { 783 rcu_for_each_node_breadth_first(rsp, rnp) {
726 spin_lock(&rnp->lock); /* irqs already disabled. */ 784 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
727 rcu_preempt_check_blocked_tasks(rnp); 785 rcu_preempt_check_blocked_tasks(rnp);
728 rnp->qsmask = rnp->qsmaskinit; 786 rnp->qsmask = rnp->qsmaskinit;
729 rnp->gpnum = rsp->gpnum; 787 rnp->gpnum = rsp->gpnum;
730 rnp->completed = rsp->completed; 788 rnp->completed = rsp->completed;
731 if (rnp == rdp->mynode) 789 if (rnp == rdp->mynode)
732 rcu_start_gp_per_cpu(rsp, rnp, rdp); 790 rcu_start_gp_per_cpu(rsp, rnp, rdp);
733 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 791 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
734 } 792 }
735 793
736 rnp = rcu_get_root(rsp); 794 rnp = rcu_get_root(rsp);
737 spin_lock(&rnp->lock); /* irqs already disabled. */ 795 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
738 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 796 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
739 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 797 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
740 spin_unlock_irqrestore(&rsp->onofflock, flags); 798 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
741} 799}
742 800
743/* 801/*
@@ -776,14 +834,14 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
776 if (!(rnp->qsmask & mask)) { 834 if (!(rnp->qsmask & mask)) {
777 835
778 /* Our bit has already been cleared, so done. */ 836 /* Our bit has already been cleared, so done. */
779 spin_unlock_irqrestore(&rnp->lock, flags); 837 raw_spin_unlock_irqrestore(&rnp->lock, flags);
780 return; 838 return;
781 } 839 }
782 rnp->qsmask &= ~mask; 840 rnp->qsmask &= ~mask;
783 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 841 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
784 842
785 /* Other bits still set at this level, so done. */ 843 /* Other bits still set at this level, so done. */
786 spin_unlock_irqrestore(&rnp->lock, flags); 844 raw_spin_unlock_irqrestore(&rnp->lock, flags);
787 return; 845 return;
788 } 846 }
789 mask = rnp->grpmask; 847 mask = rnp->grpmask;
@@ -793,10 +851,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
793 851
794 break; 852 break;
795 } 853 }
796 spin_unlock_irqrestore(&rnp->lock, flags); 854 raw_spin_unlock_irqrestore(&rnp->lock, flags);
797 rnp_c = rnp; 855 rnp_c = rnp;
798 rnp = rnp->parent; 856 rnp = rnp->parent;
799 spin_lock_irqsave(&rnp->lock, flags); 857 raw_spin_lock_irqsave(&rnp->lock, flags);
800 WARN_ON_ONCE(rnp_c->qsmask); 858 WARN_ON_ONCE(rnp_c->qsmask);
801 } 859 }
802 860
@@ -825,7 +883,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
825 struct rcu_node *rnp; 883 struct rcu_node *rnp;
826 884
827 rnp = rdp->mynode; 885 rnp = rdp->mynode;
828 spin_lock_irqsave(&rnp->lock, flags); 886 raw_spin_lock_irqsave(&rnp->lock, flags);
829 if (lastcomp != rnp->completed) { 887 if (lastcomp != rnp->completed) {
830 888
831 /* 889 /*
@@ -837,12 +895,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
837 * race occurred. 895 * race occurred.
838 */ 896 */
839 rdp->passed_quiesc = 0; /* try again later! */ 897 rdp->passed_quiesc = 0; /* try again later! */
840 spin_unlock_irqrestore(&rnp->lock, flags); 898 raw_spin_unlock_irqrestore(&rnp->lock, flags);
841 return; 899 return;
842 } 900 }
843 mask = rdp->grpmask; 901 mask = rdp->grpmask;
844 if ((rnp->qsmask & mask) == 0) { 902 if ((rnp->qsmask & mask) == 0) {
845 spin_unlock_irqrestore(&rnp->lock, flags); 903 raw_spin_unlock_irqrestore(&rnp->lock, flags);
846 } else { 904 } else {
847 rdp->qs_pending = 0; 905 rdp->qs_pending = 0;
848 906
@@ -906,7 +964,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
906 964
907 if (rdp->nxtlist == NULL) 965 if (rdp->nxtlist == NULL)
908 return; /* irqs disabled, so comparison is stable. */ 966 return; /* irqs disabled, so comparison is stable. */
909 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 967 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
910 *rsp->orphan_cbs_tail = rdp->nxtlist; 968 *rsp->orphan_cbs_tail = rdp->nxtlist;
911 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 969 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
912 rdp->nxtlist = NULL; 970 rdp->nxtlist = NULL;
@@ -914,7 +972,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
914 rdp->nxttail[i] = &rdp->nxtlist; 972 rdp->nxttail[i] = &rdp->nxtlist;
915 rsp->orphan_qlen += rdp->qlen; 973 rsp->orphan_qlen += rdp->qlen;
916 rdp->qlen = 0; 974 rdp->qlen = 0;
917 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 975 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
918} 976}
919 977
920/* 978/*
@@ -925,10 +983,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
925 unsigned long flags; 983 unsigned long flags;
926 struct rcu_data *rdp; 984 struct rcu_data *rdp;
927 985
928 spin_lock_irqsave(&rsp->onofflock, flags); 986 raw_spin_lock_irqsave(&rsp->onofflock, flags);
929 rdp = rsp->rda[smp_processor_id()]; 987 rdp = rsp->rda[smp_processor_id()];
930 if (rsp->orphan_cbs_list == NULL) { 988 if (rsp->orphan_cbs_list == NULL) {
931 spin_unlock_irqrestore(&rsp->onofflock, flags); 989 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
932 return; 990 return;
933 } 991 }
934 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; 992 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
@@ -937,7 +995,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
937 rsp->orphan_cbs_list = NULL; 995 rsp->orphan_cbs_list = NULL;
938 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; 996 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
939 rsp->orphan_qlen = 0; 997 rsp->orphan_qlen = 0;
940 spin_unlock_irqrestore(&rsp->onofflock, flags); 998 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
941} 999}
942 1000
943/* 1001/*
@@ -953,23 +1011,23 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
953 struct rcu_node *rnp; 1011 struct rcu_node *rnp;
954 1012
955 /* Exclude any attempts to start a new grace period. */ 1013 /* Exclude any attempts to start a new grace period. */
956 spin_lock_irqsave(&rsp->onofflock, flags); 1014 raw_spin_lock_irqsave(&rsp->onofflock, flags);
957 1015
958 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 1016 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
959 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ 1017 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
960 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1018 mask = rdp->grpmask; /* rnp->grplo is constant. */
961 do { 1019 do {
962 spin_lock(&rnp->lock); /* irqs already disabled. */ 1020 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
963 rnp->qsmaskinit &= ~mask; 1021 rnp->qsmaskinit &= ~mask;
964 if (rnp->qsmaskinit != 0) { 1022 if (rnp->qsmaskinit != 0) {
965 if (rnp != rdp->mynode) 1023 if (rnp != rdp->mynode)
966 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1024 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
967 break; 1025 break;
968 } 1026 }
969 if (rnp == rdp->mynode) 1027 if (rnp == rdp->mynode)
970 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 1028 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
971 else 1029 else
972 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1030 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
973 mask = rnp->grpmask; 1031 mask = rnp->grpmask;
974 rnp = rnp->parent; 1032 rnp = rnp->parent;
975 } while (rnp != NULL); 1033 } while (rnp != NULL);
@@ -980,12 +1038,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
980 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1038 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
981 * held leads to deadlock. 1039 * held leads to deadlock.
982 */ 1040 */
983 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1041 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
984 rnp = rdp->mynode; 1042 rnp = rdp->mynode;
985 if (need_report & RCU_OFL_TASKS_NORM_GP) 1043 if (need_report & RCU_OFL_TASKS_NORM_GP)
986 rcu_report_unblock_qs_rnp(rnp, flags); 1044 rcu_report_unblock_qs_rnp(rnp, flags);
987 else 1045 else
988 spin_unlock_irqrestore(&rnp->lock, flags); 1046 raw_spin_unlock_irqrestore(&rnp->lock, flags);
989 if (need_report & RCU_OFL_TASKS_EXP_GP) 1047 if (need_report & RCU_OFL_TASKS_EXP_GP)
990 rcu_report_exp_rnp(rsp, rnp); 1048 rcu_report_exp_rnp(rsp, rnp);
991 1049
@@ -1103,8 +1161,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1103 */ 1161 */
1104void rcu_check_callbacks(int cpu, int user) 1162void rcu_check_callbacks(int cpu, int user)
1105{ 1163{
1106 if (!rcu_pending(cpu))
1107 return; /* if nothing for RCU to do. */
1108 if (user || 1164 if (user ||
1109 (idle_cpu(cpu) && rcu_scheduler_active && 1165 (idle_cpu(cpu) && rcu_scheduler_active &&
1110 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1166 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1136,7 +1192,8 @@ void rcu_check_callbacks(int cpu, int user)
1136 rcu_bh_qs(cpu); 1192 rcu_bh_qs(cpu);
1137 } 1193 }
1138 rcu_preempt_check_callbacks(cpu); 1194 rcu_preempt_check_callbacks(cpu);
1139 raise_softirq(RCU_SOFTIRQ); 1195 if (rcu_pending(cpu))
1196 raise_softirq(RCU_SOFTIRQ);
1140} 1197}
1141 1198
1142#ifdef CONFIG_SMP 1199#ifdef CONFIG_SMP
@@ -1144,11 +1201,9 @@ void rcu_check_callbacks(int cpu, int user)
1144/* 1201/*
1145 * Scan the leaf rcu_node structures, processing dyntick state for any that 1202 * Scan the leaf rcu_node structures, processing dyntick state for any that
1146 * have not yet encountered a quiescent state, using the function specified. 1203 * have not yet encountered a quiescent state, using the function specified.
1147 * Returns 1 if the current grace period ends while scanning (possibly 1204 * The caller must have suppressed start of new grace periods.
1148 * because we made it end).
1149 */ 1205 */
1150static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, 1206static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1151 int (*f)(struct rcu_data *))
1152{ 1207{
1153 unsigned long bit; 1208 unsigned long bit;
1154 int cpu; 1209 int cpu;
@@ -1158,13 +1213,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1158 1213
1159 rcu_for_each_leaf_node(rsp, rnp) { 1214 rcu_for_each_leaf_node(rsp, rnp) {
1160 mask = 0; 1215 mask = 0;
1161 spin_lock_irqsave(&rnp->lock, flags); 1216 raw_spin_lock_irqsave(&rnp->lock, flags);
1162 if (rnp->completed != lastcomp) { 1217 if (!rcu_gp_in_progress(rsp)) {
1163 spin_unlock_irqrestore(&rnp->lock, flags); 1218 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1164 return 1; 1219 return;
1165 } 1220 }
1166 if (rnp->qsmask == 0) { 1221 if (rnp->qsmask == 0) {
1167 spin_unlock_irqrestore(&rnp->lock, flags); 1222 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1168 continue; 1223 continue;
1169 } 1224 }
1170 cpu = rnp->grplo; 1225 cpu = rnp->grplo;
@@ -1173,15 +1228,14 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1173 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1228 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1174 mask |= bit; 1229 mask |= bit;
1175 } 1230 }
1176 if (mask != 0 && rnp->completed == lastcomp) { 1231 if (mask != 0) {
1177 1232
1178 /* rcu_report_qs_rnp() releases rnp->lock. */ 1233 /* rcu_report_qs_rnp() releases rnp->lock. */
1179 rcu_report_qs_rnp(mask, rsp, rnp, flags); 1234 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1180 continue; 1235 continue;
1181 } 1236 }
1182 spin_unlock_irqrestore(&rnp->lock, flags); 1237 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1183 } 1238 }
1184 return 0;
1185} 1239}
1186 1240
1187/* 1241/*
@@ -1191,78 +1245,65 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1191static void force_quiescent_state(struct rcu_state *rsp, int relaxed) 1245static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1192{ 1246{
1193 unsigned long flags; 1247 unsigned long flags;
1194 long lastcomp;
1195 struct rcu_node *rnp = rcu_get_root(rsp); 1248 struct rcu_node *rnp = rcu_get_root(rsp);
1196 u8 signaled;
1197 u8 forcenow;
1198 1249
1199 if (!rcu_gp_in_progress(rsp)) 1250 if (!rcu_gp_in_progress(rsp))
1200 return; /* No grace period in progress, nothing to force. */ 1251 return; /* No grace period in progress, nothing to force. */
1201 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { 1252 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1202 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1253 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1203 return; /* Someone else is already on the job. */ 1254 return; /* Someone else is already on the job. */
1204 } 1255 }
1205 if (relaxed && 1256 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
1206 (long)(rsp->jiffies_force_qs - jiffies) >= 0) 1257 goto unlock_fqs_ret; /* no emergency and done recently. */
1207 goto unlock_ret; /* no emergency and done recently. */
1208 rsp->n_force_qs++; 1258 rsp->n_force_qs++;
1209 spin_lock(&rnp->lock); 1259 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1210 lastcomp = rsp->gpnum - 1;
1211 signaled = rsp->signaled;
1212 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1260 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1213 if (!rcu_gp_in_progress(rsp)) { 1261 if (!rcu_gp_in_progress(rsp)) {
1214 rsp->n_force_qs_ngp++; 1262 rsp->n_force_qs_ngp++;
1215 spin_unlock(&rnp->lock); 1263 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1216 goto unlock_ret; /* no GP in progress, time updated. */ 1264 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1217 } 1265 }
1218 spin_unlock(&rnp->lock); 1266 rsp->fqs_active = 1;
1219 switch (signaled) { 1267 switch (rsp->signaled) {
1220 case RCU_GP_IDLE: 1268 case RCU_GP_IDLE:
1221 case RCU_GP_INIT: 1269 case RCU_GP_INIT:
1222 1270
1223 break; /* grace period idle or initializing, ignore. */ 1271 break; /* grace period idle or initializing, ignore. */
1224 1272
1225 case RCU_SAVE_DYNTICK: 1273 case RCU_SAVE_DYNTICK:
1226
1227 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1274 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1228 break; /* So gcc recognizes the dead code. */ 1275 break; /* So gcc recognizes the dead code. */
1229 1276
1277 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1278
1230 /* Record dyntick-idle state. */ 1279 /* Record dyntick-idle state. */
1231 if (rcu_process_dyntick(rsp, lastcomp, 1280 force_qs_rnp(rsp, dyntick_save_progress_counter);
1232 dyntick_save_progress_counter)) 1281 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1233 goto unlock_ret; 1282 if (rcu_gp_in_progress(rsp))
1234 /* fall into next case. */
1235
1236 case RCU_SAVE_COMPLETED:
1237
1238 /* Update state, record completion counter. */
1239 forcenow = 0;
1240 spin_lock(&rnp->lock);
1241 if (lastcomp + 1 == rsp->gpnum &&
1242 lastcomp == rsp->completed &&
1243 rsp->signaled == signaled) {
1244 rsp->signaled = RCU_FORCE_QS; 1283 rsp->signaled = RCU_FORCE_QS;
1245 rsp->completed_fqs = lastcomp; 1284 break;
1246 forcenow = signaled == RCU_SAVE_COMPLETED;
1247 }
1248 spin_unlock(&rnp->lock);
1249 if (!forcenow)
1250 break;
1251 /* fall into next case. */
1252 1285
1253 case RCU_FORCE_QS: 1286 case RCU_FORCE_QS:
1254 1287
1255 /* Check dyntick-idle state, send IPI to laggarts. */ 1288 /* Check dyntick-idle state, send IPI to laggarts. */
1256 if (rcu_process_dyntick(rsp, rsp->completed_fqs, 1289 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1257 rcu_implicit_dynticks_qs)) 1290 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1258 goto unlock_ret;
1259 1291
1260 /* Leave state in case more forcing is required. */ 1292 /* Leave state in case more forcing is required. */
1261 1293
1294 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1262 break; 1295 break;
1263 } 1296 }
1264unlock_ret: 1297 rsp->fqs_active = 0;
1265 spin_unlock_irqrestore(&rsp->fqslock, flags); 1298 if (rsp->fqs_need_gp) {
1299 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1300 rsp->fqs_need_gp = 0;
1301 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1302 return;
1303 }
1304 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1305unlock_fqs_ret:
1306 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1266} 1307}
1267 1308
1268#else /* #ifdef CONFIG_SMP */ 1309#else /* #ifdef CONFIG_SMP */
@@ -1290,7 +1331,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1290 * If an RCU GP has gone long enough, go check for dyntick 1331 * If an RCU GP has gone long enough, go check for dyntick
1291 * idle CPUs and, if needed, send resched IPIs. 1332 * idle CPUs and, if needed, send resched IPIs.
1292 */ 1333 */
1293 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1334 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1294 force_quiescent_state(rsp, 1); 1335 force_quiescent_state(rsp, 1);
1295 1336
1296 /* 1337 /*
@@ -1304,7 +1345,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1304 1345
1305 /* Does this CPU require a not-yet-started grace period? */ 1346 /* Does this CPU require a not-yet-started grace period? */
1306 if (cpu_needs_another_gp(rsp, rdp)) { 1347 if (cpu_needs_another_gp(rsp, rdp)) {
1307 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); 1348 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1308 rcu_start_gp(rsp, flags); /* releases above lock */ 1349 rcu_start_gp(rsp, flags); /* releases above lock */
1309 } 1350 }
1310 1351
@@ -1335,6 +1376,9 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1335 * grace-period manipulations above. 1376 * grace-period manipulations above.
1336 */ 1377 */
1337 smp_mb(); /* See above block comment. */ 1378 smp_mb(); /* See above block comment. */
1379
1380 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1381 rcu_needs_cpu_flush();
1338} 1382}
1339 1383
1340static void 1384static void
@@ -1369,7 +1413,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1369 unsigned long nestflag; 1413 unsigned long nestflag;
1370 struct rcu_node *rnp_root = rcu_get_root(rsp); 1414 struct rcu_node *rnp_root = rcu_get_root(rsp);
1371 1415
1372 spin_lock_irqsave(&rnp_root->lock, nestflag); 1416 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1373 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ 1417 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1374 } 1418 }
1375 1419
@@ -1387,7 +1431,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1387 force_quiescent_state(rsp, 0); 1431 force_quiescent_state(rsp, 0);
1388 rdp->n_force_qs_snap = rsp->n_force_qs; 1432 rdp->n_force_qs_snap = rsp->n_force_qs;
1389 rdp->qlen_last_fqs_check = rdp->qlen; 1433 rdp->qlen_last_fqs_check = rdp->qlen;
1390 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1434 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1391 force_quiescent_state(rsp, 1); 1435 force_quiescent_state(rsp, 1);
1392 local_irq_restore(flags); 1436 local_irq_restore(flags);
1393} 1437}
@@ -1440,11 +1484,13 @@ void synchronize_sched(void)
1440 if (rcu_blocking_is_gp()) 1484 if (rcu_blocking_is_gp())
1441 return; 1485 return;
1442 1486
1487 init_rcu_head_on_stack(&rcu.head);
1443 init_completion(&rcu.completion); 1488 init_completion(&rcu.completion);
1444 /* Will wake me after RCU finished. */ 1489 /* Will wake me after RCU finished. */
1445 call_rcu_sched(&rcu.head, wakeme_after_rcu); 1490 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1446 /* Wait for it. */ 1491 /* Wait for it. */
1447 wait_for_completion(&rcu.completion); 1492 wait_for_completion(&rcu.completion);
1493 destroy_rcu_head_on_stack(&rcu.head);
1448} 1494}
1449EXPORT_SYMBOL_GPL(synchronize_sched); 1495EXPORT_SYMBOL_GPL(synchronize_sched);
1450 1496
@@ -1464,11 +1510,13 @@ void synchronize_rcu_bh(void)
1464 if (rcu_blocking_is_gp()) 1510 if (rcu_blocking_is_gp())
1465 return; 1511 return;
1466 1512
1513 init_rcu_head_on_stack(&rcu.head);
1467 init_completion(&rcu.completion); 1514 init_completion(&rcu.completion);
1468 /* Will wake me after RCU finished. */ 1515 /* Will wake me after RCU finished. */
1469 call_rcu_bh(&rcu.head, wakeme_after_rcu); 1516 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1470 /* Wait for it. */ 1517 /* Wait for it. */
1471 wait_for_completion(&rcu.completion); 1518 wait_for_completion(&rcu.completion);
1519 destroy_rcu_head_on_stack(&rcu.head);
1472} 1520}
1473EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1521EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1474 1522
@@ -1489,8 +1537,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1489 check_cpu_stall(rsp, rdp); 1537 check_cpu_stall(rsp, rdp);
1490 1538
1491 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1539 /* Is the RCU core waiting for a quiescent state from this CPU? */
1492 if (rdp->qs_pending) { 1540 if (rdp->qs_pending && !rdp->passed_quiesc) {
1541
1542 /*
1543 * If force_quiescent_state() coming soon and this CPU
1544 * needs a quiescent state, and this is either RCU-sched
1545 * or RCU-bh, force a local reschedule.
1546 */
1493 rdp->n_rp_qs_pending++; 1547 rdp->n_rp_qs_pending++;
1548 if (!rdp->preemptable &&
1549 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1550 jiffies))
1551 set_need_resched();
1552 } else if (rdp->qs_pending && rdp->passed_quiesc) {
1553 rdp->n_rp_report_qs++;
1494 return 1; 1554 return 1;
1495 } 1555 }
1496 1556
@@ -1520,7 +1580,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1520 1580
1521 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1581 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1522 if (rcu_gp_in_progress(rsp) && 1582 if (rcu_gp_in_progress(rsp) &&
1523 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { 1583 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
1524 rdp->n_rp_need_fqs++; 1584 rdp->n_rp_need_fqs++;
1525 return 1; 1585 return 1;
1526 } 1586 }
@@ -1545,10 +1605,9 @@ static int rcu_pending(int cpu)
1545/* 1605/*
1546 * Check to see if any future RCU-related work will need to be done 1606 * Check to see if any future RCU-related work will need to be done
1547 * by the current CPU, even if none need be done immediately, returning 1607 * by the current CPU, even if none need be done immediately, returning
1548 * 1 if so. This function is part of the RCU implementation; it is -not- 1608 * 1 if so.
1549 * an exported member of the RCU API.
1550 */ 1609 */
1551int rcu_needs_cpu(int cpu) 1610static int rcu_needs_cpu_quick_check(int cpu)
1552{ 1611{
1553 /* RCU callbacks either ready or pending? */ 1612 /* RCU callbacks either ready or pending? */
1554 return per_cpu(rcu_sched_data, cpu).nxtlist || 1613 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1556,21 +1615,6 @@ int rcu_needs_cpu(int cpu)
1556 rcu_preempt_needs_cpu(cpu); 1615 rcu_preempt_needs_cpu(cpu);
1557} 1616}
1558 1617
1559/*
1560 * This function is invoked towards the end of the scheduler's initialization
1561 * process. Before this is called, the idle task might contain
1562 * RCU read-side critical sections (during which time, this idle
1563 * task is booting the system). After this function is called, the
1564 * idle tasks are prohibited from containing RCU read-side critical
1565 * sections.
1566 */
1567void rcu_scheduler_starting(void)
1568{
1569 WARN_ON(num_online_cpus() != 1);
1570 WARN_ON(nr_context_switches() > 0);
1571 rcu_scheduler_active = 1;
1572}
1573
1574static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 1618static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1575static atomic_t rcu_barrier_cpu_count; 1619static atomic_t rcu_barrier_cpu_count;
1576static DEFINE_MUTEX(rcu_barrier_mutex); 1620static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -1659,7 +1703,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1659 struct rcu_node *rnp = rcu_get_root(rsp); 1703 struct rcu_node *rnp = rcu_get_root(rsp);
1660 1704
1661 /* Set up local state, ensuring consistent view of global state. */ 1705 /* Set up local state, ensuring consistent view of global state. */
1662 spin_lock_irqsave(&rnp->lock, flags); 1706 raw_spin_lock_irqsave(&rnp->lock, flags);
1663 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 1707 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1664 rdp->nxtlist = NULL; 1708 rdp->nxtlist = NULL;
1665 for (i = 0; i < RCU_NEXT_SIZE; i++) 1709 for (i = 0; i < RCU_NEXT_SIZE; i++)
@@ -1669,7 +1713,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1669 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 1713 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1670#endif /* #ifdef CONFIG_NO_HZ */ 1714#endif /* #ifdef CONFIG_NO_HZ */
1671 rdp->cpu = cpu; 1715 rdp->cpu = cpu;
1672 spin_unlock_irqrestore(&rnp->lock, flags); 1716 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1673} 1717}
1674 1718
1675/* 1719/*
@@ -1687,7 +1731,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1687 struct rcu_node *rnp = rcu_get_root(rsp); 1731 struct rcu_node *rnp = rcu_get_root(rsp);
1688 1732
1689 /* Set up local state, ensuring consistent view of global state. */ 1733 /* Set up local state, ensuring consistent view of global state. */
1690 spin_lock_irqsave(&rnp->lock, flags); 1734 raw_spin_lock_irqsave(&rnp->lock, flags);
1691 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1735 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1692 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1736 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1693 rdp->beenonline = 1; /* We have now been online. */ 1737 rdp->beenonline = 1; /* We have now been online. */
@@ -1695,7 +1739,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1695 rdp->qlen_last_fqs_check = 0; 1739 rdp->qlen_last_fqs_check = 0;
1696 rdp->n_force_qs_snap = rsp->n_force_qs; 1740 rdp->n_force_qs_snap = rsp->n_force_qs;
1697 rdp->blimit = blimit; 1741 rdp->blimit = blimit;
1698 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1742 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1699 1743
1700 /* 1744 /*
1701 * A new grace period might start here. If so, we won't be part 1745 * A new grace period might start here. If so, we won't be part
@@ -1703,14 +1747,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1703 */ 1747 */
1704 1748
1705 /* Exclude any attempts to start a new GP on large systems. */ 1749 /* Exclude any attempts to start a new GP on large systems. */
1706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1750 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1707 1751
1708 /* Add CPU to rcu_node bitmasks. */ 1752 /* Add CPU to rcu_node bitmasks. */
1709 rnp = rdp->mynode; 1753 rnp = rdp->mynode;
1710 mask = rdp->grpmask; 1754 mask = rdp->grpmask;
1711 do { 1755 do {
1712 /* Exclude any attempts to start a new GP on small systems. */ 1756 /* Exclude any attempts to start a new GP on small systems. */
1713 spin_lock(&rnp->lock); /* irqs already disabled. */ 1757 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1714 rnp->qsmaskinit |= mask; 1758 rnp->qsmaskinit |= mask;
1715 mask = rnp->grpmask; 1759 mask = rnp->grpmask;
1716 if (rnp == rdp->mynode) { 1760 if (rnp == rdp->mynode) {
@@ -1718,11 +1762,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1718 rdp->completed = rnp->completed; 1762 rdp->completed = rnp->completed;
1719 rdp->passed_quiesc_completed = rnp->completed - 1; 1763 rdp->passed_quiesc_completed = rnp->completed - 1;
1720 } 1764 }
1721 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1765 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
1722 rnp = rnp->parent; 1766 rnp = rnp->parent;
1723 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1767 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1724 1768
1725 spin_unlock_irqrestore(&rsp->onofflock, flags); 1769 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1726} 1770}
1727 1771
1728static void __cpuinit rcu_online_cpu(int cpu) 1772static void __cpuinit rcu_online_cpu(int cpu)
@@ -1774,6 +1818,21 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1774} 1818}
1775 1819
1776/* 1820/*
1821 * This function is invoked towards the end of the scheduler's initialization
1822 * process. Before this is called, the idle task might contain
1823 * RCU read-side critical sections (during which time, this idle
1824 * task is booting the system). After this function is called, the
1825 * idle tasks are prohibited from containing RCU read-side critical
1826 * sections. This function also enables RCU lockdep checking.
1827 */
1828void rcu_scheduler_starting(void)
1829{
1830 WARN_ON(num_online_cpus() != 1);
1831 WARN_ON(nr_context_switches() > 0);
1832 rcu_scheduler_active = 1;
1833}
1834
1835/*
1777 * Compute the per-level fanout, either using the exact fanout specified 1836 * Compute the per-level fanout, either using the exact fanout specified
1778 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 1837 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1779 */ 1838 */
@@ -1806,11 +1865,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1806 */ 1865 */
1807static void __init rcu_init_one(struct rcu_state *rsp) 1866static void __init rcu_init_one(struct rcu_state *rsp)
1808{ 1867{
1868 static char *buf[] = { "rcu_node_level_0",
1869 "rcu_node_level_1",
1870 "rcu_node_level_2",
1871 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */
1809 int cpustride = 1; 1872 int cpustride = 1;
1810 int i; 1873 int i;
1811 int j; 1874 int j;
1812 struct rcu_node *rnp; 1875 struct rcu_node *rnp;
1813 1876
1877 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
1878
1814 /* Initialize the level-tracking arrays. */ 1879 /* Initialize the level-tracking arrays. */
1815 1880
1816 for (i = 1; i < NUM_RCU_LVLS; i++) 1881 for (i = 1; i < NUM_RCU_LVLS; i++)
@@ -1823,8 +1888,9 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1823 cpustride *= rsp->levelspread[i]; 1888 cpustride *= rsp->levelspread[i];
1824 rnp = rsp->level[i]; 1889 rnp = rsp->level[i];
1825 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1890 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1826 spin_lock_init(&rnp->lock); 1891 raw_spin_lock_init(&rnp->lock);
1827 lockdep_set_class(&rnp->lock, &rcu_node_class[i]); 1892 lockdep_set_class_and_name(&rnp->lock,
1893 &rcu_node_class[i], buf[i]);
1828 rnp->gpnum = 0; 1894 rnp->gpnum = 0;
1829 rnp->qsmask = 0; 1895 rnp->qsmask = 0;
1830 rnp->qsmaskinit = 0; 1896 rnp->qsmaskinit = 0;
@@ -1849,6 +1915,14 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]); 1915 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1850 } 1916 }
1851 } 1917 }
1918
1919 rnp = rsp->level[NUM_RCU_LVLS - 1];
1920 for_each_possible_cpu(i) {
1921 while (i > rnp->grphi)
1922 rnp++;
1923 rsp->rda[i]->mynode = rnp;
1924 rcu_boot_init_percpu_data(i, rsp);
1925 }
1852} 1926}
1853 1927
1854/* 1928/*
@@ -1859,32 +1933,18 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1859#define RCU_INIT_FLAVOR(rsp, rcu_data) \ 1933#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1860do { \ 1934do { \
1861 int i; \ 1935 int i; \
1862 int j; \
1863 struct rcu_node *rnp; \
1864 \ 1936 \
1865 rcu_init_one(rsp); \
1866 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1867 j = 0; \
1868 for_each_possible_cpu(i) { \ 1937 for_each_possible_cpu(i) { \
1869 if (i > rnp[j].grphi) \
1870 j++; \
1871 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1872 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1938 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1873 rcu_boot_init_percpu_data(i, rsp); \
1874 } \ 1939 } \
1940 rcu_init_one(rsp); \
1875} while (0) 1941} while (0)
1876 1942
1877void __init rcu_init(void) 1943void __init rcu_init(void)
1878{ 1944{
1879 int i; 1945 int cpu;
1880 1946
1881 rcu_bootup_announce(); 1947 rcu_bootup_announce();
1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1948 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1949 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1890 __rcu_init_preempt(); 1950 __rcu_init_preempt();
@@ -1896,8 +1956,9 @@ void __init rcu_init(void)
1896 * or the scheduler are operational. 1956 * or the scheduler are operational.
1897 */ 1957 */
1898 cpu_notifier(rcu_cpu_notify, 0); 1958 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(i) 1959 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); 1960 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1961 check_cpu_stall_init();
1901} 1962}
1902 1963
1903#include "rcutree_plugin.h" 1964#include "rcutree_plugin.h"
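
The rcu_init_one() hunk above pairs the new per-level lock-class names with a BUILD_BUG_ON() so that buf[] cannot silently fall out of step with MAX_RCU_LVLS. The following standalone sketch shows the same compile-time-check idiom outside the kernel; ARRAY_SIZE and BUILD_BUG_ON are re-created locally, and MAX_LVLS and the level_name[] strings are made up for illustration.

#include <stdio.h>

#define ARRAY_SIZE(a)      (sizeof(a) / sizeof((a)[0]))
/* Negative-array-size trick: breaks the build when cond is true. */
#define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

#define MAX_LVLS 4

static const char *level_name[] = {
	"level_0", "level_1", "level_2", "level_3",
};

int main(void)
{
	int i;

	/* Refuses to compile if MAX_LVLS grows past the name table. */
	BUILD_BUG_ON(MAX_LVLS > ARRAY_SIZE(level_name));

	for (i = 0; i < MAX_LVLS; i++)
		printf("lock class %d named \"%s\"\n", i, level_name[i]);
	return 0;
}

In the hunk itself the corresponding check is BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)), which keeps the "rcu_node_level_N" lockdep names in step with the hierarchy depth.
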
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index d2a0046f63b2..14c040b18ed0 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -90,12 +90,12 @@ struct rcu_dynticks {
90 * Definition for node within the RCU grace-period-detection hierarchy. 90 * Definition for node within the RCU grace-period-detection hierarchy.
91 */ 91 */
92struct rcu_node { 92struct rcu_node {
93 spinlock_t lock; /* Root rcu_node's lock protects some */ 93 raw_spinlock_t lock; /* Root rcu_node's lock protects some */
94 /* rcu_state fields as well as following. */ 94 /* rcu_state fields as well as following. */
95 long gpnum; /* Current grace period for this node. */ 95 unsigned long gpnum; /* Current grace period for this node. */
96 /* This will either be equal to or one */ 96 /* This will either be equal to or one */
97 /* behind the root rcu_node's gpnum. */ 97 /* behind the root rcu_node's gpnum. */
98 long completed; /* Last grace period completed for this node. */ 98 unsigned long completed; /* Last GP completed for this node. */
99 /* This will either be equal to or one */ 99 /* This will either be equal to or one */
100 /* behind the root rcu_node's gpnum. */ 100 /* behind the root rcu_node's gpnum. */
101 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
@@ -161,11 +161,11 @@ struct rcu_node {
161/* Per-CPU data for read-copy update. */ 161/* Per-CPU data for read-copy update. */
162struct rcu_data { 162struct rcu_data {
163 /* 1) quiescent-state and grace-period handling : */ 163 /* 1) quiescent-state and grace-period handling : */
164 long completed; /* Track rsp->completed gp number */ 164 unsigned long completed; /* Track rsp->completed gp number */
165 /* in order to detect GP end. */ 165 /* in order to detect GP end. */
166 long gpnum; /* Highest gp number that this CPU */ 166 unsigned long gpnum; /* Highest gp number that this CPU */
167 /* is aware of having started. */ 167 /* is aware of having started. */
168 long passed_quiesc_completed; 168 unsigned long passed_quiesc_completed;
169 /* Value of completed at time of qs. */ 169 /* Value of completed at time of qs. */
170 bool passed_quiesc; /* User-mode/idle loop etc. */ 170 bool passed_quiesc; /* User-mode/idle loop etc. */
171 bool qs_pending; /* Core waits for quiesc state. */ 171 bool qs_pending; /* Core waits for quiesc state. */
@@ -221,14 +221,15 @@ struct rcu_data {
221 unsigned long resched_ipi; /* Sent a resched IPI. */ 221 unsigned long resched_ipi; /* Sent a resched IPI. */
222 222
223 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
224 long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
225 long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
226 long n_rp_cb_ready; 226 unsigned long n_rp_report_qs;
227 long n_rp_cpu_needs_gp; 227 unsigned long n_rp_cb_ready;
228 long n_rp_gp_completed; 228 unsigned long n_rp_cpu_needs_gp;
229 long n_rp_gp_started; 229 unsigned long n_rp_gp_completed;
230 long n_rp_need_fqs; 230 unsigned long n_rp_gp_started;
231 long n_rp_need_nothing; 231 unsigned long n_rp_need_fqs;
232 unsigned long n_rp_need_nothing;
232 233
233 int cpu; 234 int cpu;
234}; 235};
@@ -237,25 +238,36 @@ struct rcu_data {
237#define RCU_GP_IDLE 0 /* No grace period in progress. */ 238#define RCU_GP_IDLE 0 /* No grace period in progress. */
238#define RCU_GP_INIT 1 /* Grace period being initialized. */ 239#define RCU_GP_INIT 1 /* Grace period being initialized. */
239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 240#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
240#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ 241#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
241#define RCU_FORCE_QS 4 /* Need to force quiescent state. */
242#ifdef CONFIG_NO_HZ 242#ifdef CONFIG_NO_HZ
243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
244#else /* #ifdef CONFIG_NO_HZ */ 244#else /* #ifdef CONFIG_NO_HZ */
245#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED 245#define RCU_SIGNAL_INIT RCU_FORCE_QS
246#endif /* #else #ifdef CONFIG_NO_HZ */ 246#endif /* #else #ifdef CONFIG_NO_HZ */
247 247
248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
249#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 249#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
250#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ 250
251#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ 251#ifdef CONFIG_PROVE_RCU
252#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 252#define RCU_STALL_DELAY_DELTA (5 * HZ)
253 /* to take at least one */ 253#else
254 /* scheduling clock irq */ 254#define RCU_STALL_DELAY_DELTA 0
255 /* before ratting on them. */ 255#endif
256
257#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA)
258 /* for rsp->jiffies_stall */
259#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
260 /* for rsp->jiffies_stall */
261#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
262 /* to take at least one */
263 /* scheduling clock irq */
264 /* before ratting on them. */
256 265
257#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 266#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
258 267
268#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
269#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
270
259/* 271/*
260 * RCU global state, including node hierarchy. This hierarchy is 272 * RCU global state, including node hierarchy. This hierarchy is
261 * represented in "heap" form in a dense array. The root (first level) 273 * represented in "heap" form in a dense array. The root (first level)
@@ -277,12 +289,19 @@ struct rcu_state {
277 289
278 u8 signaled ____cacheline_internodealigned_in_smp; 290 u8 signaled ____cacheline_internodealigned_in_smp;
279 /* Force QS state. */ 291 /* Force QS state. */
280 long gpnum; /* Current gp number. */ 292 u8 fqs_active; /* force_quiescent_state() */
281 long completed; /* # of last completed gp. */ 293 /* is running. */
294 u8 fqs_need_gp; /* A CPU was prevented from */
295 /* starting a new grace */
296 /* period because */
297 /* force_quiescent_state() */
298 /* was running. */
299 unsigned long gpnum; /* Current gp number. */
300 unsigned long completed; /* # of last completed gp. */
282 301
283 /* End of fields guarded by root rcu_node's lock. */ 302 /* End of fields guarded by root rcu_node's lock. */
284 303
285 spinlock_t onofflock; /* exclude on/offline and */ 304 raw_spinlock_t onofflock; /* exclude on/offline and */
286 /* starting new GP. Also */ 305 /* starting new GP. Also */
287 /* protects the following */ 306 /* protects the following */
288 /* orphan_cbs fields. */ 307 /* orphan_cbs fields. */
@@ -292,10 +311,8 @@ struct rcu_state {
292 /* going offline. */ 311 /* going offline. */
293 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ 312 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
294 long orphan_qlen; /* Number of orphaned cbs. */ 313 long orphan_qlen; /* Number of orphaned cbs. */
295 spinlock_t fqslock; /* Only one task forcing */ 314 raw_spinlock_t fqslock; /* Only one task forcing */
296 /* quiescent states. */ 315 /* quiescent states. */
297 long completed_fqs; /* Value of completed @ snap. */
298 /* Protected by fqslock. */
299 unsigned long jiffies_force_qs; /* Time at which to invoke */ 316 unsigned long jiffies_force_qs; /* Time at which to invoke */
300 /* force_quiescent_state(). */ 317 /* force_quiescent_state(). */
301 unsigned long n_force_qs; /* Number of calls to */ 318 unsigned long n_force_qs; /* Number of calls to */
@@ -310,6 +327,7 @@ struct rcu_state {
310 unsigned long jiffies_stall; /* Time at which to check */ 327 unsigned long jiffies_stall; /* Time at which to check */
311 /* for CPU stalls. */ 328 /* for CPU stalls. */
312#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 329#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
330 char *name; /* Name of structure. */
313}; 331};
314 332
315/* Return values for rcu_preempt_offline_tasks(). */ 333/* Return values for rcu_preempt_offline_tasks(). */
@@ -319,8 +337,6 @@ struct rcu_state {
319#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ 337#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
320 /* GP were moved to root. */ 338 /* GP were moved to root. */
321 339
322#ifdef RCU_TREE_NONCORE
323
324/* 340/*
325 * RCU implementation internal declarations: 341 * RCU implementation internal declarations:
326 */ 342 */
@@ -335,7 +351,7 @@ extern struct rcu_state rcu_preempt_state;
335DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 351DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
336#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 352#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
337 353
338#else /* #ifdef RCU_TREE_NONCORE */ 354#ifndef RCU_TREE_NONCORE
339 355
340/* Forward declarations for rcutree_plugin.h */ 356/* Forward declarations for rcutree_plugin.h */
341static void rcu_bootup_announce(void); 357static void rcu_bootup_announce(void);
@@ -347,6 +363,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
347 unsigned long flags); 363 unsigned long flags);
348#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 364#endif /* #ifdef CONFIG_HOTPLUG_CPU */
349#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 365#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
366static void rcu_print_detail_task_stall(struct rcu_state *rsp);
350static void rcu_print_task_stall(struct rcu_node *rnp); 367static void rcu_print_task_stall(struct rcu_node *rnp);
351#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 368#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
352static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 369static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
@@ -367,5 +384,6 @@ static int rcu_preempt_needs_cpu(int cpu);
367static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 384static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
368static void rcu_preempt_send_cbs_to_orphanage(void); 385static void rcu_preempt_send_cbs_to_orphanage(void);
369static void __init __rcu_init_preempt(void); 386static void __init __rcu_init_preempt(void);
387static void rcu_needs_cpu_flush(void);
370 388
371#endif /* #else #ifdef RCU_TREE_NONCORE */ 389#endif /* #ifndef RCU_TREE_NONCORE */
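
The rcutree.h changes above turn the grace-period counters into unsigned long and add ULONG_CMP_GE()/ULONG_CMP_LT() so comparisons stay correct when a counter wraps. The small userspace program below copies the two macros from the hunk and uses made-up counter values to show that the modular comparison still orders a wrapped counter correctly where a plain < does not.

#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long old_gp = ULONG_MAX - 1;	/* counter just before wrap */
	unsigned long new_gp = old_gp + 3;	/* wraps around to 1 */

	/* Plain comparison is fooled by the wrap... */
	printf("new_gp < old_gp?         %d\n", new_gp < old_gp);		/* 1 */
	/* ...but the modular comparison still sees new_gp as "later". */
	printf("ULONG_CMP_LT(old, new)?  %d\n", ULONG_CMP_LT(old_gp, new_gp));	/* 1 */
	printf("ULONG_CMP_GE(new, old)?  %d\n", ULONG_CMP_GE(new_gp, old_gp));	/* 1 */
	return 0;
}
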
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 37fbccdf41d5..0e4f420245d9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -26,6 +26,45 @@
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28 28
29/*
30 * Check the RCU kernel configuration parameters and print informative
31 * messages about anything out of the ordinary. If you like #ifdef, you
32 * will love this function.
33 */
34static void __init rcu_bootup_announce_oddness(void)
35{
36#ifdef CONFIG_RCU_TRACE
37 printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
38#endif
39#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
40 printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
41 CONFIG_RCU_FANOUT);
42#endif
43#ifdef CONFIG_RCU_FANOUT_EXACT
44 printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
45#endif
46#ifdef CONFIG_RCU_FAST_NO_HZ
47 printk(KERN_INFO
48 "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
49#endif
50#ifdef CONFIG_PROVE_RCU
51 printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
52#endif
53#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
54 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
55#endif
56#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif
63#if NUM_RCU_LVL_4 != 0
64 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
65#endif
66}
67
29#ifdef CONFIG_TREE_PREEMPT_RCU 68#ifdef CONFIG_TREE_PREEMPT_RCU
30 69
31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 70struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
@@ -38,8 +77,8 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
38 */ 77 */
39static void __init rcu_bootup_announce(void) 78static void __init rcu_bootup_announce(void)
40{ 79{
41 printk(KERN_INFO 80 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n");
42 "Experimental preemptable hierarchical RCU implementation.\n"); 81 rcu_bootup_announce_oddness();
43} 82}
44 83
45/* 84/*
@@ -62,17 +101,32 @@ long rcu_batches_completed(void)
62EXPORT_SYMBOL_GPL(rcu_batches_completed); 101EXPORT_SYMBOL_GPL(rcu_batches_completed);
63 102
64/* 103/*
104 * Force a quiescent state for preemptible RCU.
105 */
106void rcu_force_quiescent_state(void)
107{
108 force_quiescent_state(&rcu_preempt_state, 0);
109}
110EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
111
112/*
65 * Record a preemptable-RCU quiescent state for the specified CPU. Note 113 * Record a preemptable-RCU quiescent state for the specified CPU. Note
66 * that this just means that the task currently running on the CPU is 114 * that this just means that the task currently running on the CPU is
67 * not in a quiescent state. There might be any number of tasks blocked 115 * not in a quiescent state. There might be any number of tasks blocked
68 * while in an RCU read-side critical section. 116 * while in an RCU read-side critical section.
117 *
118 * Unlike the other rcu_*_qs() functions, callers to this function
119 * must disable irqs in order to protect the assignment to
120 * ->rcu_read_unlock_special.
69 */ 121 */
70static void rcu_preempt_qs(int cpu) 122static void rcu_preempt_qs(int cpu)
71{ 123{
72 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 124 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
125
73 rdp->passed_quiesc_completed = rdp->gpnum - 1; 126 rdp->passed_quiesc_completed = rdp->gpnum - 1;
74 barrier(); 127 barrier();
75 rdp->passed_quiesc = 1; 128 rdp->passed_quiesc = 1;
129 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
76} 130}
77 131
78/* 132/*
@@ -102,7 +156,7 @@ static void rcu_preempt_note_context_switch(int cpu)
102 /* Possibly blocking in an RCU read-side critical section. */ 156 /* Possibly blocking in an RCU read-side critical section. */
103 rdp = rcu_preempt_state.rda[cpu]; 157 rdp = rcu_preempt_state.rda[cpu];
104 rnp = rdp->mynode; 158 rnp = rdp->mynode;
105 spin_lock_irqsave(&rnp->lock, flags); 159 raw_spin_lock_irqsave(&rnp->lock, flags);
106 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
107 t->rcu_blocked_node = rnp; 161 t->rcu_blocked_node = rnp;
108 162
@@ -123,7 +177,7 @@ static void rcu_preempt_note_context_switch(int cpu)
123 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 177 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
124 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 178 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
125 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 179 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
126 spin_unlock_irqrestore(&rnp->lock, flags); 180 raw_spin_unlock_irqrestore(&rnp->lock, flags);
127 } 181 }
128 182
129 /* 183 /*
@@ -135,9 +189,8 @@ static void rcu_preempt_note_context_switch(int cpu)
135 * grace period, then the fact that the task has been enqueued 189 * grace period, then the fact that the task has been enqueued
136 * means that we continue to block the current grace period. 190 * means that we continue to block the current grace period.
137 */ 191 */
138 rcu_preempt_qs(cpu);
139 local_irq_save(flags); 192 local_irq_save(flags);
140 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 193 rcu_preempt_qs(cpu);
141 local_irq_restore(flags); 194 local_irq_restore(flags);
142} 195}
143 196
@@ -180,7 +233,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
180 struct rcu_node *rnp_p; 233 struct rcu_node *rnp_p;
181 234
182 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 235 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
183 spin_unlock_irqrestore(&rnp->lock, flags); 236 raw_spin_unlock_irqrestore(&rnp->lock, flags);
184 return; /* Still need more quiescent states! */ 237 return; /* Still need more quiescent states! */
185 } 238 }
186 239
@@ -197,8 +250,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
197 250
198 /* Report up the rest of the hierarchy. */ 251 /* Report up the rest of the hierarchy. */
199 mask = rnp->grpmask; 252 mask = rnp->grpmask;
200 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 253 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
201 spin_lock(&rnp_p->lock); /* irqs already disabled. */ 254 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
202 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); 255 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
203} 256}
204 257
@@ -227,7 +280,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
227 */ 280 */
228 special = t->rcu_read_unlock_special; 281 special = t->rcu_read_unlock_special;
229 if (special & RCU_READ_UNLOCK_NEED_QS) { 282 if (special & RCU_READ_UNLOCK_NEED_QS) {
230 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
231 rcu_preempt_qs(smp_processor_id()); 283 rcu_preempt_qs(smp_processor_id());
232 } 284 }
233 285
@@ -248,10 +300,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
248 */ 300 */
249 for (;;) { 301 for (;;) {
250 rnp = t->rcu_blocked_node; 302 rnp = t->rcu_blocked_node;
251 spin_lock(&rnp->lock); /* irqs already disabled. */ 303 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
252 if (rnp == t->rcu_blocked_node) 304 if (rnp == t->rcu_blocked_node)
253 break; 305 break;
254 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 306 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
255 } 307 }
256 empty = !rcu_preempted_readers(rnp); 308 empty = !rcu_preempted_readers(rnp);
257 empty_exp = !rcu_preempted_readers_exp(rnp); 309 empty_exp = !rcu_preempted_readers_exp(rnp);
@@ -265,7 +317,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
265 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 317 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
266 */ 318 */
267 if (empty) 319 if (empty)
268 spin_unlock_irqrestore(&rnp->lock, flags); 320 raw_spin_unlock_irqrestore(&rnp->lock, flags);
269 else 321 else
270 rcu_report_unblock_qs_rnp(rnp, flags); 322 rcu_report_unblock_qs_rnp(rnp, flags);
271 323
@@ -295,29 +347,73 @@ void __rcu_read_unlock(void)
295 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 347 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
296 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 348 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
297 rcu_read_unlock_special(t); 349 rcu_read_unlock_special(t);
350#ifdef CONFIG_PROVE_LOCKING
351 WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0);
352#endif /* #ifdef CONFIG_PROVE_LOCKING */
298} 353}
299EXPORT_SYMBOL_GPL(__rcu_read_unlock); 354EXPORT_SYMBOL_GPL(__rcu_read_unlock);
300 355
301#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 356#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
302 357
358#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
359
360/*
361 * Dump detailed information for all tasks blocking the current RCU
362 * grace period on the specified rcu_node structure.
363 */
364static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
365{
366 unsigned long flags;
367 struct list_head *lp;
368 int phase;
369 struct task_struct *t;
370
371 if (rcu_preempted_readers(rnp)) {
372 raw_spin_lock_irqsave(&rnp->lock, flags);
373 phase = rnp->gpnum & 0x1;
374 lp = &rnp->blocked_tasks[phase];
375 list_for_each_entry(t, lp, rcu_node_entry)
376 sched_show_task(t);
377 raw_spin_unlock_irqrestore(&rnp->lock, flags);
378 }
379}
380
381/*
382 * Dump detailed information for all tasks blocking the current RCU
383 * grace period.
384 */
385static void rcu_print_detail_task_stall(struct rcu_state *rsp)
386{
387 struct rcu_node *rnp = rcu_get_root(rsp);
388
389 rcu_print_detail_task_stall_rnp(rnp);
390 rcu_for_each_leaf_node(rsp, rnp)
391 rcu_print_detail_task_stall_rnp(rnp);
392}
393
394#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
395
396static void rcu_print_detail_task_stall(struct rcu_state *rsp)
397{
398}
399
400#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
401
303/* 402/*
304 * Scan the current list of tasks blocked within RCU read-side critical 403 * Scan the current list of tasks blocked within RCU read-side critical
305 * sections, printing out the tid of each. 404 * sections, printing out the tid of each.
306 */ 405 */
307static void rcu_print_task_stall(struct rcu_node *rnp) 406static void rcu_print_task_stall(struct rcu_node *rnp)
308{ 407{
309 unsigned long flags;
310 struct list_head *lp; 408 struct list_head *lp;
311 int phase; 409 int phase;
312 struct task_struct *t; 410 struct task_struct *t;
313 411
314 if (rcu_preempted_readers(rnp)) { 412 if (rcu_preempted_readers(rnp)) {
315 spin_lock_irqsave(&rnp->lock, flags);
316 phase = rnp->gpnum & 0x1; 413 phase = rnp->gpnum & 0x1;
317 lp = &rnp->blocked_tasks[phase]; 414 lp = &rnp->blocked_tasks[phase];
318 list_for_each_entry(t, lp, rcu_node_entry) 415 list_for_each_entry(t, lp, rcu_node_entry)
319 printk(" P%d", t->pid); 416 printk(" P%d", t->pid);
320 spin_unlock_irqrestore(&rnp->lock, flags);
321 } 417 }
322} 418}
323 419
@@ -388,11 +484,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
388 lp_root = &rnp_root->blocked_tasks[i]; 484 lp_root = &rnp_root->blocked_tasks[i];
389 while (!list_empty(lp)) { 485 while (!list_empty(lp)) {
390 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 486 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
391 spin_lock(&rnp_root->lock); /* irqs already disabled */ 487 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
392 list_del(&tp->rcu_node_entry); 488 list_del(&tp->rcu_node_entry);
393 tp->rcu_blocked_node = rnp_root; 489 tp->rcu_blocked_node = rnp_root;
394 list_add(&tp->rcu_node_entry, lp_root); 490 list_add(&tp->rcu_node_entry, lp_root);
395 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 491 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */
396 } 492 }
397 } 493 }
398 return retval; 494 return retval;
@@ -420,7 +516,6 @@ static void rcu_preempt_check_callbacks(int cpu)
420 struct task_struct *t = current; 516 struct task_struct *t = current;
421 517
422 if (t->rcu_read_lock_nesting == 0) { 518 if (t->rcu_read_lock_nesting == 0) {
423 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
424 rcu_preempt_qs(cpu); 519 rcu_preempt_qs(cpu);
425 return; 520 return;
426 } 521 }
@@ -462,11 +557,13 @@ void synchronize_rcu(void)
462 if (!rcu_scheduler_active) 557 if (!rcu_scheduler_active)
463 return; 558 return;
464 559
560 init_rcu_head_on_stack(&rcu.head);
465 init_completion(&rcu.completion); 561 init_completion(&rcu.completion);
466 /* Will wake me after RCU finished. */ 562 /* Will wake me after RCU finished. */
467 call_rcu(&rcu.head, wakeme_after_rcu); 563 call_rcu(&rcu.head, wakeme_after_rcu);
468 /* Wait for it. */ 564 /* Wait for it. */
469 wait_for_completion(&rcu.completion); 565 wait_for_completion(&rcu.completion);
566 destroy_rcu_head_on_stack(&rcu.head);
470} 567}
471EXPORT_SYMBOL_GPL(synchronize_rcu); 568EXPORT_SYMBOL_GPL(synchronize_rcu);
472 569
@@ -516,7 +613,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
516 unsigned long flags; 613 unsigned long flags;
517 unsigned long mask; 614 unsigned long mask;
518 615
519 spin_lock_irqsave(&rnp->lock, flags); 616 raw_spin_lock_irqsave(&rnp->lock, flags);
520 for (;;) { 617 for (;;) {
521 if (!sync_rcu_preempt_exp_done(rnp)) 618 if (!sync_rcu_preempt_exp_done(rnp))
522 break; 619 break;
@@ -525,12 +622,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
525 break; 622 break;
526 } 623 }
527 mask = rnp->grpmask; 624 mask = rnp->grpmask;
528 spin_unlock(&rnp->lock); /* irqs remain disabled */ 625 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
529 rnp = rnp->parent; 626 rnp = rnp->parent;
530 spin_lock(&rnp->lock); /* irqs already disabled */ 627 raw_spin_lock(&rnp->lock); /* irqs already disabled */
531 rnp->expmask &= ~mask; 628 rnp->expmask &= ~mask;
532 } 629 }
533 spin_unlock_irqrestore(&rnp->lock, flags); 630 raw_spin_unlock_irqrestore(&rnp->lock, flags);
534} 631}
535 632
536/* 633/*
@@ -545,11 +642,11 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
545{ 642{
546 int must_wait; 643 int must_wait;
547 644
548 spin_lock(&rnp->lock); /* irqs already disabled */ 645 raw_spin_lock(&rnp->lock); /* irqs already disabled */
549 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 646 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
550 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 647 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
551 must_wait = rcu_preempted_readers_exp(rnp); 648 must_wait = rcu_preempted_readers_exp(rnp);
552 spin_unlock(&rnp->lock); /* irqs remain disabled */ 649 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
553 if (!must_wait) 650 if (!must_wait)
554 rcu_report_exp_rnp(rsp, rnp); 651 rcu_report_exp_rnp(rsp, rnp);
555} 652}
@@ -594,13 +691,13 @@ void synchronize_rcu_expedited(void)
594 /* force all RCU readers onto blocked_tasks[]. */ 691 /* force all RCU readers onto blocked_tasks[]. */
595 synchronize_sched_expedited(); 692 synchronize_sched_expedited();
596 693
597 spin_lock_irqsave(&rsp->onofflock, flags); 694 raw_spin_lock_irqsave(&rsp->onofflock, flags);
598 695
599 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 696 /* Initialize ->expmask for all non-leaf rcu_node structures. */
600 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 697 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
601 spin_lock(&rnp->lock); /* irqs already disabled. */ 698 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
602 rnp->expmask = rnp->qsmaskinit; 699 rnp->expmask = rnp->qsmaskinit;
603 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 700 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
604 } 701 }
605 702
606 /* Snapshot current state of ->blocked_tasks[] lists. */ 703 /* Snapshot current state of ->blocked_tasks[] lists. */
@@ -609,7 +706,7 @@ void synchronize_rcu_expedited(void)
609 if (NUM_RCU_NODES > 1) 706 if (NUM_RCU_NODES > 1)
610 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); 707 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
611 708
612 spin_unlock_irqrestore(&rsp->onofflock, flags); 709 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
613 710
614 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 711 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
615 rnp = rcu_get_root(rsp); 712 rnp = rcu_get_root(rsp);
@@ -701,6 +798,7 @@ void exit_rcu(void)
701static void __init rcu_bootup_announce(void) 798static void __init rcu_bootup_announce(void)
702{ 799{
703 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 800 printk(KERN_INFO "Hierarchical RCU implementation.\n");
801 rcu_bootup_announce_oddness();
704} 802}
705 803
706/* 804/*
@@ -713,6 +811,16 @@ long rcu_batches_completed(void)
713EXPORT_SYMBOL_GPL(rcu_batches_completed); 811EXPORT_SYMBOL_GPL(rcu_batches_completed);
714 812
715/* 813/*
814 * Force a quiescent state for RCU, which, because there is no preemptible
815 * RCU, becomes the same as rcu-sched.
816 */
817void rcu_force_quiescent_state(void)
818{
819 rcu_sched_force_quiescent_state();
820}
821EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
822
823/*
716 * Because preemptable RCU does not exist, we never have to check for 824 * Because preemptable RCU does not exist, we never have to check for
717 * CPUs being in quiescent states. 825 * CPUs being in quiescent states.
718 */ 826 */
@@ -734,7 +842,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
734/* Because preemptible RCU does not exist, no quieting of tasks. */ 842/* Because preemptible RCU does not exist, no quieting of tasks. */
735static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 843static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
736{ 844{
737 spin_unlock_irqrestore(&rnp->lock, flags); 845 raw_spin_unlock_irqrestore(&rnp->lock, flags);
738} 846}
739 847
740#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 848#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -745,6 +853,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
745 * Because preemptable RCU does not exist, we never have to check for 853 * Because preemptable RCU does not exist, we never have to check for
746 * tasks blocked within RCU read-side critical sections. 854 * tasks blocked within RCU read-side critical sections.
747 */ 855 */
856static void rcu_print_detail_task_stall(struct rcu_state *rsp)
857{
858}
859
860/*
861 * Because preemptable RCU does not exist, we never have to check for
862 * tasks blocked within RCU read-side critical sections.
863 */
748static void rcu_print_task_stall(struct rcu_node *rnp) 864static void rcu_print_task_stall(struct rcu_node *rnp)
749{ 865{
750} 866}
@@ -884,3 +1000,123 @@ static void __init __rcu_init_preempt(void)
884} 1000}
885 1001
886#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1002#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1003
1004#if !defined(CONFIG_RCU_FAST_NO_HZ)
1005
1006/*
1007 * Check to see if any future RCU-related work will need to be done
1008 * by the current CPU, even if none need be done immediately, returning
1009 * 1 if so. This function is part of the RCU implementation; it is -not-
1010 * an exported member of the RCU API.
1011 *
1012 * Because we have preemptible RCU, just check whether this CPU needs
1013 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption
1014 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
1015 */
1016int rcu_needs_cpu(int cpu)
1017{
1018 return rcu_needs_cpu_quick_check(cpu);
1019}
1020
1021/*
1022 * Check to see if we need to continue a callback-flush operation to
1023 * allow the last CPU to enter dyntick-idle mode. Because fast dyntick-idle
1024 * entry is not configured, we never need to.
1025 */
1026static void rcu_needs_cpu_flush(void)
1027{
1028}
1029
1030#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1031
1032#define RCU_NEEDS_CPU_FLUSHES 5
1033static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1034static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1035
1036/*
1037 * Check to see if any future RCU-related work will need to be done
1038 * by the current CPU, even if none need be done immediately, returning
1039 * 1 if so. This function is part of the RCU implementation; it is -not-
1040 * an exported member of the RCU API.
1041 *
1042 * Because we are not supporting preemptible RCU, attempt to accelerate
1043 * any current grace periods so that RCU no longer needs this CPU, but
1044 * only if all other CPUs are already in dynticks-idle mode. This will
1045 * allow the CPU cores to be powered down immediately, as opposed to after
1046 * waiting many milliseconds for grace periods to elapse.
1047 *
1048 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1049 * disabled, we do one pass of force_quiescent_state(), then do a
1050 * raise_softirq() to cause rcu_process_callbacks() to be invoked later.
1051 * The per-cpu rcu_dyntick_drain variable controls the sequencing.
1052 */
1053int rcu_needs_cpu(int cpu)
1054{
1055 int c = 0;
1056 int snap;
1057 int snap_nmi;
1058 int thatcpu;
1059
1060 /* Check for being in the holdoff period. */
1061 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
1062 return rcu_needs_cpu_quick_check(cpu);
1063
1064 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1065 for_each_online_cpu(thatcpu) {
1066 if (thatcpu == cpu)
1067 continue;
1068 snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
1069 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
1070 smp_mb(); /* Order sampling of snap with end of grace period. */
1071 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
1072 per_cpu(rcu_dyntick_drain, cpu) = 0;
1073 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1074 return rcu_needs_cpu_quick_check(cpu);
1075 }
1076 }
1077
1078 /* Check and update the rcu_dyntick_drain sequencing. */
1079 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1080 /* First time through, initialize the counter. */
1081 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
1082 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1083 /* We have hit the limit, so time to give up. */
1084 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1085 return rcu_needs_cpu_quick_check(cpu);
1086 }
1087
1088 /* Do one step pushing remaining RCU callbacks through. */
1089 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1090 rcu_sched_qs(cpu);
1091 force_quiescent_state(&rcu_sched_state, 0);
1092 c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
1093 }
1094 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1095 rcu_bh_qs(cpu);
1096 force_quiescent_state(&rcu_bh_state, 0);
1097 c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
1098 }
1099
1100 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1101 if (c)
1102 raise_softirq(RCU_SOFTIRQ);
1103 return c;
1104}
1105
1106/*
1107 * Check to see if we need to continue a callback-flush operation to
1108 * allow the last CPU to enter dyntick-idle mode.
1109 */
1110static void rcu_needs_cpu_flush(void)
1111{
1112 int cpu = smp_processor_id();
1113 unsigned long flags;
1114
1115 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
1116 return;
1117 local_irq_save(flags);
1118 (void)rcu_needs_cpu(cpu);
1119 local_irq_restore(flags);
1120}
1121
1122#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
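
The CONFIG_RCU_FAST_NO_HZ version of rcu_needs_cpu() above sequences its work with a per-CPU drain counter and a jiffies-based holdoff: a few flush attempts per tick, then back off until the tick advances. The sketch below models only that sequencing in plain C; jiffies, FLUSHES_PER_TICK, work_left() and flush_one() are stand-ins invented for the example, not kernel interfaces.

#include <stdio.h>
#include <stdbool.h>

#define FLUSHES_PER_TICK 5

static unsigned long jiffies;		/* pretend tick counter */
static int drain;			/* flush attempts left this tick */
static unsigned long holdoff = -1UL;	/* tick on which we gave up */
static int pending = 12;		/* pretend callbacks outstanding */

static bool work_left(void) { return pending > 0; }
static void flush_one(void) { if (pending) pending--; }

static bool needs_cpu(void)
{
	if (holdoff == jiffies)			/* already gave up this tick */
		return work_left();
	if (drain <= 0) {
		drain = FLUSHES_PER_TICK;	/* first call this round */
	} else if (--drain <= 0) {
		holdoff = jiffies;		/* hit the limit: hold off */
		return work_left();
	}
	flush_one();				/* one step of pushing work */
	return work_left();
}

int main(void)
{
	int i;

	for (jiffies = 0; jiffies < 2; jiffies++) {
		for (i = 0; i < 8; i++) {
			int need = needs_cpu();

			printf("tick %lu try %d -> needs_cpu=%d pending=%d\n",
			       jiffies, i, need, pending);
		}
	}
	return 0;
}
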
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9d2c88423b31..36c95b45738e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -50,7 +50,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 50{
51 if (!rdp->beenonline) 51 if (!rdp->beenonline)
52 return; 52 return;
53 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", 53 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d",
54 rdp->cpu, 54 rdp->cpu,
55 cpu_is_offline(rdp->cpu) ? '!' : ' ', 55 cpu_is_offline(rdp->cpu) ? '!' : ' ',
56 rdp->completed, rdp->gpnum, 56 rdp->completed, rdp->gpnum,
@@ -105,7 +105,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
105{ 105{
106 if (!rdp->beenonline) 106 if (!rdp->beenonline)
107 return; 107 return;
108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d",
109 rdp->cpu, 109 rdp->cpu,
110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
111 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
@@ -155,13 +155,13 @@ static const struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 long gpnum; 158 unsigned long gpnum;
159 int level = 0; 159 int level = 0;
160 int phase; 160 int phase;
161 struct rcu_node *rnp; 161 struct rcu_node *rnp;
162 162
163 gpnum = rsp->gpnum; 163 gpnum = rsp->gpnum;
164 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 164 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
166 rsp->completed, gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
167 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
@@ -215,12 +215,12 @@ static const struct file_operations rcuhier_fops = {
215static int show_rcugp(struct seq_file *m, void *unused) 215static int show_rcugp(struct seq_file *m, void *unused)
216{ 216{
217#ifdef CONFIG_TREE_PREEMPT_RCU 217#ifdef CONFIG_TREE_PREEMPT_RCU
218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", 218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n",
219 rcu_preempt_state.completed, rcu_preempt_state.gpnum); 219 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", 221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n",
222 rcu_sched_state.completed, rcu_sched_state.gpnum); 222 rcu_sched_state.completed, rcu_sched_state.gpnum);
223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
224 rcu_bh_state.completed, rcu_bh_state.gpnum); 224 rcu_bh_state.completed, rcu_bh_state.gpnum);
225 return 0; 225 return 0;
226} 226}
@@ -241,11 +241,13 @@ static const struct file_operations rcugp_fops = {
241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
242{ 242{
243 seq_printf(m, "%3d%cnp=%ld " 243 seq_printf(m, "%3d%cnp=%ld "
244 "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", 244 "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
245 "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
245 rdp->cpu, 246 rdp->cpu,
246 cpu_is_offline(rdp->cpu) ? '!' : ' ', 247 cpu_is_offline(rdp->cpu) ? '!' : ' ',
247 rdp->n_rcu_pending, 248 rdp->n_rcu_pending,
248 rdp->n_rp_qs_pending, 249 rdp->n_rp_qs_pending,
250 rdp->n_rp_report_qs,
249 rdp->n_rp_cb_ready, 251 rdp->n_rp_cb_ready,
250 rdp->n_rp_cpu_needs_gp, 252 rdp->n_rp_cpu_needs_gp,
251 rdp->n_rp_gp_completed, 253 rdp->n_rp_gp_completed,
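
The rcutree_trace.c hunks switch the affected format specifiers from %ld to %lu to match the counters' new unsigned long type. The snippet below shows what the old specifier would print once a counter enters the upper half of its range; the mismatched printf() is deliberate, since that mismatch is exactly what the change removes.

#include <limits.h>
#include <stdio.h>

int main(void)
{
	unsigned long gpnum = ULONG_MAX - 41;	/* arbitrary large counter */

	printf("as %%ld: %ld\n", gpnum);	/* deliberate mismatch: prints -42 */
	printf("as %%lu: %lu\n", gpnum);	/* the actual counter value */
	return 0;
}
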
diff --git a/kernel/relay.c b/kernel/relay.c
index c705a41b4ba3..c7cf397fb929 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -539,7 +539,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
539 "relay_hotcpu_callback: cpu %d buffer " 539 "relay_hotcpu_callback: cpu %d buffer "
540 "creation failed\n", hotcpu); 540 "creation failed\n", hotcpu);
541 mutex_unlock(&relay_channels_mutex); 541 mutex_unlock(&relay_channels_mutex);
542 return NOTIFY_BAD; 542 return notifier_from_errno(-ENOMEM);
543 } 543 }
544 } 544 }
545 mutex_unlock(&relay_channels_mutex); 545 mutex_unlock(&relay_channels_mutex);
@@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
1215/* 1215/*
1216 * subbuf_splice_actor - splice up to one subbuf's worth of data 1216 * subbuf_splice_actor - splice up to one subbuf's worth of data
1217 */ 1217 */
1218static int subbuf_splice_actor(struct file *in, 1218static ssize_t subbuf_splice_actor(struct file *in,
1219 loff_t *ppos, 1219 loff_t *ppos,
1220 struct pipe_inode_info *pipe, 1220 struct pipe_inode_info *pipe,
1221 size_t len, 1221 size_t len,
1222 unsigned int flags, 1222 unsigned int flags,
1223 int *nonpad_ret) 1223 int *nonpad_ret)
1224{ 1224{
1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; 1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
1226 struct rchan_buf *rbuf = in->private_data; 1226 struct rchan_buf *rbuf = in->private_data;
1227 unsigned int subbuf_size = rbuf->chan->subbuf_size; 1227 unsigned int subbuf_size = rbuf->chan->subbuf_size;
1228 uint64_t pos = (uint64_t) *ppos; 1228 uint64_t pos = (uint64_t) *ppos;
@@ -1231,8 +1231,8 @@ static int subbuf_splice_actor(struct file *in,
1231 size_t read_subbuf = read_start / subbuf_size; 1231 size_t read_subbuf = read_start / subbuf_size;
1232 size_t padding = rbuf->padding[read_subbuf]; 1232 size_t padding = rbuf->padding[read_subbuf];
1233 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; 1233 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
1234 struct page *pages[PIPE_BUFFERS]; 1234 struct page *pages[PIPE_DEF_BUFFERS];
1235 struct partial_page partial[PIPE_BUFFERS]; 1235 struct partial_page partial[PIPE_DEF_BUFFERS];
1236 struct splice_pipe_desc spd = { 1236 struct splice_pipe_desc spd = {
1237 .pages = pages, 1237 .pages = pages,
1238 .nr_pages = 0, 1238 .nr_pages = 0,
@@ -1241,9 +1241,12 @@ static int subbuf_splice_actor(struct file *in,
1241 .ops = &relay_pipe_buf_ops, 1241 .ops = &relay_pipe_buf_ops,
1242 .spd_release = relay_page_release, 1242 .spd_release = relay_page_release,
1243 }; 1243 };
1244 ssize_t ret;
1244 1245
1245 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1246 return 0; 1247 return 0;
1248 if (splice_grow_spd(pipe, &spd))
1249 return -ENOMEM;
1247 1250
1248 /* 1251 /*
1249 * Adjust read len, if longer than what is available 1252 * Adjust read len, if longer than what is available
@@ -1254,7 +1257,7 @@ static int subbuf_splice_actor(struct file *in,
1254 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; 1257 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
1255 pidx = (read_start / PAGE_SIZE) % subbuf_pages; 1258 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
1256 poff = read_start & ~PAGE_MASK; 1259 poff = read_start & ~PAGE_MASK;
1257 nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); 1260 nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers);
1258 1261
1259 for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { 1262 for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
1260 unsigned int this_len, this_end, private; 1263 unsigned int this_len, this_end, private;
@@ -1288,16 +1291,19 @@ static int subbuf_splice_actor(struct file *in,
1288 } 1291 }
1289 } 1292 }
1290 1293
1294 ret = 0;
1291 if (!spd.nr_pages) 1295 if (!spd.nr_pages)
1292 return 0; 1296 goto out;
1293 1297
1294 ret = *nonpad_ret = splice_to_pipe(pipe, &spd); 1298 ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
1295 if (ret < 0 || ret < total_len) 1299 if (ret < 0 || ret < total_len)
1296 return ret; 1300 goto out;
1297 1301
1298 if (read_start + ret == nonpad_end) 1302 if (read_start + ret == nonpad_end)
1299 ret += padding; 1303 ret += padding;
1300 1304
1305out:
1306 splice_shrink_spd(pipe, &spd);
1301 return ret; 1307 return ret;
1302} 1308}
1303 1309
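
Once subbuf_splice_actor() calls splice_grow_spd(), every exit path has to reach the matching splice_shrink_spd(), which is why the early returns above become goto out. The generic sketch below shows that single-exit cleanup shape with stand-in grow()/shrink() helpers; it is a pattern illustration, not the relay code itself.

#include <stdio.h>
#include <stdlib.h>

struct state { int *pages; };

static int grow(struct state *s)
{
	s->pages = calloc(16, sizeof(*s->pages));
	return s->pages ? 0 : -1;
}

static void shrink(struct state *s)
{
	free(s->pages);
	s->pages = NULL;
}

static int actor(struct state *s, int n)
{
	int ret;

	if (grow(s))
		return -1;		/* nothing allocated yet */

	ret = 0;
	if (n == 0)
		goto out;		/* was: return 0 */

	ret = n;			/* stand-in for the real work */
	if (ret < 0)
		goto out;		/* was: return ret */

	ret += 1;			/* success-only post-processing */
out:
	shrink(s);			/* released on every path */
	return ret;
}

int main(void)
{
	struct state s = { 0 };

	printf("%d %d\n", actor(&s, 0), actor(&s, 3));	/* prints: 0 4 */
	return 0;
}
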
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bcdabf37c40b..c7eaa37a768b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -10,7 +10,6 @@
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/parser.h> 11#include <linux/parser.h>
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/res_counter.h> 13#include <linux/res_counter.h>
15#include <linux/uaccess.h> 14#include <linux/uaccess.h>
16#include <linux/mm.h> 15#include <linux/mm.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index af96c1e4b54b..7b36976e5dea 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -15,6 +15,7 @@
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/sched.h>
18#include <linux/seq_file.h> 19#include <linux/seq_file.h>
19#include <linux/device.h> 20#include <linux/device.h>
20#include <linux/pfn.h> 21#include <linux/pfn.h>
@@ -188,20 +189,65 @@ static int __release_resource(struct resource *old)
188 return -EINVAL; 189 return -EINVAL;
189} 190}
190 191
192static void __release_child_resources(struct resource *r)
193{
194 struct resource *tmp, *p;
195 resource_size_t size;
196
197 p = r->child;
198 r->child = NULL;
199 while (p) {
200 tmp = p;
201 p = p->sibling;
202
203 tmp->parent = NULL;
204 tmp->sibling = NULL;
205 __release_child_resources(tmp);
206
207 printk(KERN_DEBUG "release child resource %pR\n", tmp);
208 /* need to restore size, and keep flags */
209 size = resource_size(tmp);
210 tmp->start = 0;
211 tmp->end = size - 1;
212 }
213}
214
215void release_child_resources(struct resource *r)
216{
217 write_lock(&resource_lock);
218 __release_child_resources(r);
219 write_unlock(&resource_lock);
220}
221
191/** 222/**
192 * request_resource - request and reserve an I/O or memory resource 223 * request_resource_conflict - request and reserve an I/O or memory resource
193 * @root: root resource descriptor 224 * @root: root resource descriptor
194 * @new: resource descriptor desired by caller 225 * @new: resource descriptor desired by caller
195 * 226 *
196 * Returns 0 for success, negative error code on error. 227 * Returns 0 for success, conflict resource on error.
197 */ 228 */
198int request_resource(struct resource *root, struct resource *new) 229struct resource *request_resource_conflict(struct resource *root, struct resource *new)
199{ 230{
200 struct resource *conflict; 231 struct resource *conflict;
201 232
202 write_lock(&resource_lock); 233 write_lock(&resource_lock);
203 conflict = __request_resource(root, new); 234 conflict = __request_resource(root, new);
204 write_unlock(&resource_lock); 235 write_unlock(&resource_lock);
236 return conflict;
237}
238
239/**
240 * request_resource - request and reserve an I/O or memory resource
241 * @root: root resource descriptor
242 * @new: resource descriptor desired by caller
243 *
244 * Returns 0 for success, negative error code on error.
245 */
246int request_resource(struct resource *root, struct resource *new)
247{
248 struct resource *conflict;
249
250 conflict = request_resource_conflict(root, new);
205 return conflict ? -EBUSY : 0; 251 return conflict ? -EBUSY : 0;
206} 252}
207 253
@@ -274,7 +320,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
274 void *arg, int (*func)(unsigned long, unsigned long, void *)) 320 void *arg, int (*func)(unsigned long, unsigned long, void *))
275{ 321{
276 struct resource res; 322 struct resource res;
277 unsigned long pfn, len; 323 unsigned long pfn, end_pfn;
278 u64 orig_end; 324 u64 orig_end;
279 int ret = -1; 325 int ret = -1;
280 326
@@ -284,9 +330,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
284 orig_end = res.end; 330 orig_end = res.end;
285 while ((res.start < res.end) && 331 while ((res.start < res.end) &&
286 (find_next_system_ram(&res, "System RAM") >= 0)) { 332 (find_next_system_ram(&res, "System RAM") >= 0)) {
287 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 333 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
288 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); 334 end_pfn = (res.end + 1) >> PAGE_SHIFT;
289 ret = (*func)(pfn, len, arg); 335 if (end_pfn > pfn)
336 ret = (*func)(pfn, end_pfn - pfn, arg);
290 if (ret) 337 if (ret)
291 break; 338 break;
292 res.start = res.end + 1; 339 res.start = res.end + 1;
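
The walk_system_ram_range() hunk rounds the region start up to a whole page and derives an exclusive end_pfn by rounding down, so partial pages at either edge are no longer passed to the callback. The standalone example below reproduces that arithmetic with an arbitrary, unaligned region and prints the old length-based result for comparison; PAGE_SHIFT and the addresses are chosen only for illustration.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long long start = 0x1800;	/* region not page aligned */
	unsigned long long end   = 0x57ff;	/* inclusive last byte */

	unsigned long pfn     = (start + PAGE_SIZE - 1) >> PAGE_SHIFT;	/* 2 */
	unsigned long end_pfn = (end + 1) >> PAGE_SHIFT;		/* 5 */

	if (end_pfn > pfn)
		printf("new: whole pages pfn %lu..%lu (%lu pages)\n",
		       pfn, end_pfn - 1, end_pfn - pfn);

	/* Old computation for comparison: starts inside the partial page. */
	printf("old: pfn %llu len %llu\n",
	       start >> PAGE_SHIFT, (end + 1 - start) >> PAGE_SHIFT);
	return 0;
}
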
@@ -297,14 +344,29 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
297 344
298#endif 345#endif
299 346
347static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
348{
349 return 1;
350}
351/*
352 * This generic page_is_ram() returns true if specified address is
353 * registered as "System RAM" in iomem_resource list.
354 */
355int __weak page_is_ram(unsigned long pfn)
356{
357 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
358}
359
300/* 360/*
301 * Find empty slot in the resource tree given range and alignment. 361 * Find empty slot in the resource tree given range and alignment.
302 */ 362 */
303static int find_resource(struct resource *root, struct resource *new, 363static int find_resource(struct resource *root, struct resource *new,
304 resource_size_t size, resource_size_t min, 364 resource_size_t size, resource_size_t min,
305 resource_size_t max, resource_size_t align, 365 resource_size_t max, resource_size_t align,
306 void (*alignf)(void *, struct resource *, 366 resource_size_t (*alignf)(void *,
307 resource_size_t, resource_size_t), 367 const struct resource *,
368 resource_size_t,
369 resource_size_t),
308 void *alignf_data) 370 void *alignf_data)
309{ 371{
310 struct resource *this = root->child; 372 struct resource *this = root->child;
@@ -330,7 +392,7 @@ static int find_resource(struct resource *root, struct resource *new,
330 tmp.end = max; 392 tmp.end = max;
331 tmp.start = ALIGN(tmp.start, align); 393 tmp.start = ALIGN(tmp.start, align);
332 if (alignf) 394 if (alignf)
333 alignf(alignf_data, &tmp, size, align); 395 tmp.start = alignf(alignf_data, &tmp, size, align);
334 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 396 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
335 new->start = tmp.start; 397 new->start = tmp.start;
336 new->end = tmp.start + size - 1; 398 new->end = tmp.start + size - 1;
@@ -358,8 +420,10 @@ static int find_resource(struct resource *root, struct resource *new,
358int allocate_resource(struct resource *root, struct resource *new, 420int allocate_resource(struct resource *root, struct resource *new,
359 resource_size_t size, resource_size_t min, 421 resource_size_t size, resource_size_t min,
360 resource_size_t max, resource_size_t align, 422 resource_size_t max, resource_size_t align,
361 void (*alignf)(void *, struct resource *, 423 resource_size_t (*alignf)(void *,
362 resource_size_t, resource_size_t), 424 const struct resource *,
425 resource_size_t,
426 resource_size_t),
363 void *alignf_data) 427 void *alignf_data)
364{ 428{
365 int err; 429 int err;
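With the new prototype the alignment callback no longer adjusts the candidate window in place: it returns the start address to try, and find_resource() assigns that to tmp.start. A hedged sketch of a callback written against the new signature (the name and the 64 KiB policy are invented for illustration):

#include <linux/ioport.h>
#include <linux/kernel.h>

/* Matches the new alignf prototype: return the start the allocator should try. */
static resource_size_t align_to_64k(void *data, const struct resource *avail,
				    resource_size_t size, resource_size_t align)
{
	return ALIGN(avail->start, 0x10000);
}

It would be passed as the alignf argument of allocate_resource(), with alignf_data carrying any per-caller state.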
@@ -426,25 +490,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
426} 490}
427 491
428/** 492/**
429 * insert_resource - Inserts a resource in the resource tree 493 * insert_resource_conflict - Inserts resource in the resource tree
430 * @parent: parent of the new resource 494 * @parent: parent of the new resource
431 * @new: new resource to insert 495 * @new: new resource to insert
432 * 496 *
433 * Returns 0 on success, -EBUSY if the resource can't be inserted. 497 * Returns 0 on success, conflict resource if the resource can't be inserted.
434 * 498 *
435 * This function is equivalent to request_resource when no conflict 499 * This function is equivalent to request_resource_conflict when no conflict
436 * happens. If a conflict happens, and the conflicting resources 500 * happens. If a conflict happens, and the conflicting resources
437 * entirely fit within the range of the new resource, then the new 501 * entirely fit within the range of the new resource, then the new
438 * resource is inserted and the conflicting resources become children of 502 * resource is inserted and the conflicting resources become children of
439 * the new resource. 503 * the new resource.
440 */ 504 */
441int insert_resource(struct resource *parent, struct resource *new) 505struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
442{ 506{
443 struct resource *conflict; 507 struct resource *conflict;
444 508
445 write_lock(&resource_lock); 509 write_lock(&resource_lock);
446 conflict = __insert_resource(parent, new); 510 conflict = __insert_resource(parent, new);
447 write_unlock(&resource_lock); 511 write_unlock(&resource_lock);
512 return conflict;
513}
514
515/**
516 * insert_resource - Inserts a resource in the resource tree
517 * @parent: parent of the new resource
518 * @new: new resource to insert
519 *
520 * Returns 0 on success, -EBUSY if the resource can't be inserted.
521 */
522int insert_resource(struct resource *parent, struct resource *new)
523{
524 struct resource *conflict;
525
526 conflict = insert_resource_conflict(parent, new);
448 return conflict ? -EBUSY : 0; 527 return conflict ? -EBUSY : 0;
449} 528}
450 529
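insert_resource() keeps its old 0 / -EBUSY contract, while the new insert_resource_conflict() additionally hands back the resource it collided with. A minimal sketch of a caller that wants to report the conflict (the wrapper name and message are illustrative):

#include <linux/ioport.h>
#include <linux/kernel.h>

static int try_insert(struct resource *parent, struct resource *new)
{
	struct resource *conflict = insert_resource_conflict(parent, new);

	if (!conflict)
		return 0;

	printk(KERN_WARNING "%s: conflicts with %s [0x%llx-0x%llx]\n",
	       new->name, conflict->name,
	       (unsigned long long)conflict->start,
	       (unsigned long long)conflict->end);
	return -EBUSY;
}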
@@ -603,6 +682,8 @@ resource_size_t resource_alignment(struct resource *res)
603 * release_region releases a matching busy region. 682 * release_region releases a matching busy region.
604 */ 683 */
605 684
685static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
686
606/** 687/**
607 * __request_region - create a new busy resource region 688 * __request_region - create a new busy resource region
608 * @parent: parent resource descriptor 689 * @parent: parent resource descriptor
@@ -615,6 +696,7 @@ struct resource * __request_region(struct resource *parent,
615 resource_size_t start, resource_size_t n, 696 resource_size_t start, resource_size_t n,
616 const char *name, int flags) 697 const char *name, int flags)
617{ 698{
699 DECLARE_WAITQUEUE(wait, current);
618 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 700 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
619 701
620 if (!res) 702 if (!res)
@@ -639,7 +721,15 @@ struct resource * __request_region(struct resource *parent,
639 if (!(conflict->flags & IORESOURCE_BUSY)) 721 if (!(conflict->flags & IORESOURCE_BUSY))
640 continue; 722 continue;
641 } 723 }
642 724 if (conflict->flags & flags & IORESOURCE_MUXED) {
725 add_wait_queue(&muxed_resource_wait, &wait);
726 write_unlock(&resource_lock);
727 set_current_state(TASK_UNINTERRUPTIBLE);
728 schedule();
729 remove_wait_queue(&muxed_resource_wait, &wait);
730 write_lock(&resource_lock);
731 continue;
732 }
643 /* Uhhuh, that didn't work out.. */ 733 /* Uhhuh, that didn't work out.. */
644 kfree(res); 734 kfree(res);
645 res = NULL; 735 res = NULL;
@@ -713,6 +803,8 @@ void __release_region(struct resource *parent, resource_size_t start,
713 break; 803 break;
714 *p = res->sibling; 804 *p = res->sibling;
715 write_unlock(&resource_lock); 805 write_unlock(&resource_lock);
806 if (res->flags & IORESOURCE_MUXED)
807 wake_up(&muxed_resource_wait);
716 kfree(res); 808 kfree(res);
717 return; 809 return;
718 } 810 }
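The IORESOURCE_MUXED handling turns a busy conflict into an uninterruptible sleep on muxed_resource_wait rather than a failure, and __release_region() wakes the queue whenever a muxed region is dropped, so two drivers can time-share the same port window as long as both mark their requests muxed. A rough sketch of that usage from a driver's point of view (the driver name is invented; kernel context assumed):

#include <linux/ioport.h>

static int poke_shared_ports(resource_size_t base, resource_size_t len)
{
	struct resource *res;

	/* Sleeps until any other IORESOURCE_MUXED holder of an
	 * overlapping range calls __release_region(). */
	res = __request_region(&ioport_resource, base, len,
			       "example-driver", IORESOURCE_MUXED);
	if (!res)
		return -EBUSY;

	/* ... access the multiplexed ports ... */

	__release_region(&ioport_resource, base, len);	/* wakes other waiters */
	return 0;
}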
diff --git a/kernel/sched.c b/kernel/sched.c
index 4508fe7048be..f52a8801b7a2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/cpuset.h> 56#include <linux/cpuset.h>
57#include <linux/percpu.h> 57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h> 58#include <linux/proc_fs.h>
60#include <linux/seq_file.h> 59#include <linux/seq_file.h>
60#include <linux/stop_machine.h>
61#include <linux/sysctl.h> 61#include <linux/sysctl.h>
62#include <linux/syscalls.h> 62#include <linux/syscalls.h>
63#include <linux/times.h> 63#include <linux/times.h>
@@ -71,6 +71,7 @@
71#include <linux/debugfs.h> 71#include <linux/debugfs.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/ftrace.h> 73#include <linux/ftrace.h>
74#include <linux/slab.h>
74 75
75#include <asm/tlb.h> 76#include <asm/tlb.h>
76#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
@@ -233,7 +234,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 234 */
234static DEFINE_MUTEX(sched_domains_mutex); 235static DEFINE_MUTEX(sched_domains_mutex);
235 236
236#ifdef CONFIG_GROUP_SCHED 237#ifdef CONFIG_CGROUP_SCHED
237 238
238#include <linux/cgroup.h> 239#include <linux/cgroup.h>
239 240
@@ -243,13 +244,7 @@ static LIST_HEAD(task_groups);
243 244
244/* task group related information */ 245/* task group related information */
245struct task_group { 246struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 247 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 248
254#ifdef CONFIG_FAIR_GROUP_SCHED 249#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 250 /* schedulable entities of this group on each cpu */
@@ -274,35 +269,7 @@ struct task_group {
274 struct list_head children; 269 struct list_head children;
275}; 270};
276 271
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 272#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 273
307/* task_group_lock serializes add/remove of task groups and also changes to 274/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 275 * a task group's cpu shares.
@@ -318,11 +285,7 @@ static int root_task_group_empty(void)
318} 285}
319#endif 286#endif
320 287
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 288# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 289
327/* 290/*
328 * A weight of 0 or 1 can cause arithmetics problems. 291 * A weight of 0 or 1 can cause arithmetics problems.
@@ -343,47 +306,7 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD;
343 */ 306 */
344struct task_group init_task_group; 307struct task_group init_task_group;
345 308
346/* return group to which a task belongs */ 309#endif /* CONFIG_CGROUP_SCHED */
347static inline struct task_group *task_group(struct task_struct *p)
348{
349 struct task_group *tg;
350
351#ifdef CONFIG_USER_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css);
358#else
359 tg = &init_task_group;
360#endif
361 return tg;
362}
363
364/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
365static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
366{
367#ifdef CONFIG_FAIR_GROUP_SCHED
368 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
369 p->se.parent = task_group(p)->se[cpu];
370#endif
371
372#ifdef CONFIG_RT_GROUP_SCHED
373 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
374 p->rt.parent = task_group(p)->rt_se[cpu];
375#endif
376}
377
378#else
379
380static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
381static inline struct task_group *task_group(struct task_struct *p)
382{
383 return NULL;
384}
385
386#endif /* CONFIG_GROUP_SCHED */
387 310
388/* CFS-related fields in a runqueue */ 311/* CFS-related fields in a runqueue */
389struct cfs_rq { 312struct cfs_rq {
@@ -478,7 +401,6 @@ struct rt_rq {
478 struct rq *rq; 401 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 402 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 403 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 404#endif
483}; 405};
484 406
@@ -535,8 +457,11 @@ struct rq {
535 #define CPU_LOAD_IDX_MAX 5 457 #define CPU_LOAD_IDX_MAX 5
536 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 458 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
537#ifdef CONFIG_NO_HZ 459#ifdef CONFIG_NO_HZ
460 u64 nohz_stamp;
538 unsigned char in_nohz_recently; 461 unsigned char in_nohz_recently;
539#endif 462#endif
463 unsigned int skip_clock_update;
464
540 /* capture load from *all* tasks on this cpu: */ 465 /* capture load from *all* tasks on this cpu: */
541 struct load_weight load; 466 struct load_weight load;
542 unsigned long nr_load_updates; 467 unsigned long nr_load_updates;
@@ -573,20 +498,20 @@ struct rq {
573 struct root_domain *rd; 498 struct root_domain *rd;
574 struct sched_domain *sd; 499 struct sched_domain *sd;
575 500
501 unsigned long cpu_power;
502
576 unsigned char idle_at_tick; 503 unsigned char idle_at_tick;
577 /* For active balancing */ 504 /* For active balancing */
578 int post_schedule; 505 int post_schedule;
579 int active_balance; 506 int active_balance;
580 int push_cpu; 507 int push_cpu;
508 struct cpu_stop_work active_balance_work;
581 /* cpu of this runqueue: */ 509 /* cpu of this runqueue: */
582 int cpu; 510 int cpu;
583 int online; 511 int online;
584 512
585 unsigned long avg_load_per_task; 513 unsigned long avg_load_per_task;
586 514
587 struct task_struct *migration_thread;
588 struct list_head migration_queue;
589
590 u64 rt_avg; 515 u64 rt_avg;
591 u64 age_stamp; 516 u64 age_stamp;
592 u64 idle_stamp; 517 u64 idle_stamp;
@@ -634,6 +559,13 @@ static inline
634void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 559void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
635{ 560{
636 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 561 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
562
563 /*
564 * A queue event has occurred, and we're going to schedule. In
565 * this case, we can save a useless back to back clock update.
566 */
567 if (test_tsk_need_resched(p))
568 rq->skip_clock_update = 1;
637} 569}
638 570
639static inline int cpu_of(struct rq *rq) 571static inline int cpu_of(struct rq *rq)
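The skip_clock_update flag set here pairs with the update_rq_clock() change later in this file: when the wakeup has already left the current task marked for rescheduling, refreshing rq->clock on the queueing path and again on the imminent reschedule would be back-to-back duplicate work. Example sequence: a wakeup preempts the running task, check_preempt_curr() sees TIF_NEED_RESCHED already set and sets skip_clock_update, the next update_rq_clock() call leaves rq->clock untouched, and the following schedule() performs the single real update (the place where the flag is cleared again is outside the hunks shown here).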
@@ -645,6 +577,11 @@ static inline int cpu_of(struct rq *rq)
645#endif 577#endif
646} 578}
647 579
580#define rcu_dereference_check_sched_domain(p) \
581 rcu_dereference_check((p), \
582 rcu_read_lock_sched_held() || \
583 lockdep_is_held(&sched_domains_mutex))
584
648/* 585/*
649 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 586 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
650 * See detach_destroy_domains: synchronize_sched for details. 587 * See detach_destroy_domains: synchronize_sched for details.
@@ -653,7 +590,7 @@ static inline int cpu_of(struct rq *rq)
653 * preempt-disabled sections. 590 * preempt-disabled sections.
654 */ 591 */
655#define for_each_domain(cpu, __sd) \ 592#define for_each_domain(cpu, __sd) \
656 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 593 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
657 594
658#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 595#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
659#define this_rq() (&__get_cpu_var(runqueues)) 596#define this_rq() (&__get_cpu_var(runqueues))
@@ -661,9 +598,53 @@ static inline int cpu_of(struct rq *rq)
661#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 598#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
662#define raw_rq() (&__raw_get_cpu_var(runqueues)) 599#define raw_rq() (&__raw_get_cpu_var(runqueues))
663 600
601#ifdef CONFIG_CGROUP_SCHED
602
603/*
604 * Return the group to which this task belongs.
605 *
606 * We use task_subsys_state_check() and extend the RCU verification
607 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
608 * holds that lock for each task it moves into the cgroup. Therefore
609 * by holding that lock, we pin the task to the current cgroup.
610 */
611static inline struct task_group *task_group(struct task_struct *p)
612{
613 struct cgroup_subsys_state *css;
614
615 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
616 lockdep_is_held(&task_rq(p)->lock));
617 return container_of(css, struct task_group, css);
618}
619
620/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
621static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
622{
623#ifdef CONFIG_FAIR_GROUP_SCHED
624 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
625 p->se.parent = task_group(p)->se[cpu];
626#endif
627
628#ifdef CONFIG_RT_GROUP_SCHED
629 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
630 p->rt.parent = task_group(p)->rt_se[cpu];
631#endif
632}
633
634#else /* CONFIG_CGROUP_SCHED */
635
636static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
637static inline struct task_group *task_group(struct task_struct *p)
638{
639 return NULL;
640}
641
642#endif /* CONFIG_CGROUP_SCHED */
643
664inline void update_rq_clock(struct rq *rq) 644inline void update_rq_clock(struct rq *rq)
665{ 645{
666 rq->clock = sched_clock_cpu(cpu_of(rq)); 646 if (!rq->skip_clock_update)
647 rq->clock = sched_clock_cpu(cpu_of(rq));
667} 648}
668 649
669/* 650/*
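rcu_dereference_check_sched_domain() and the new task_group() both use the same idiom: an RCU-protected pointer that may legitimately be read either inside an RCU read-side critical section or while holding a particular lock, with that extra condition handed to the RCU/lockdep checker so PROVE_RCU stays quiet for the lock-holding readers. A generic sketch of the idiom outside the scheduler (all names are illustrative):

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct foo {
	int val;
};

static struct foo *global_foo;
static DEFINE_SPINLOCK(foo_lock);

/* Caller must be in an RCU read-side section or hold foo_lock. */
static int read_foo_val(void)
{
	struct foo *f;

	f = rcu_dereference_check(global_foo,
				  rcu_read_lock_held() ||
				  lockdep_is_held(&foo_lock));
	return f ? f->val : -1;
}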
@@ -941,14 +922,25 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
941#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 922#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
942 923
943/* 924/*
925 * Check whether the task is waking; we use this to synchronize ->cpus_allowed
926 * against ttwu().
927 */
928static inline int task_is_waking(struct task_struct *p)
929{
930 return unlikely(p->state == TASK_WAKING);
931}
932
933/*
944 * __task_rq_lock - lock the runqueue a given task resides on. 934 * __task_rq_lock - lock the runqueue a given task resides on.
945 * Must be called with interrupts disabled. 935
946 */ 936 */
947static inline struct rq *__task_rq_lock(struct task_struct *p) 937static inline struct rq *__task_rq_lock(struct task_struct *p)
948 __acquires(rq->lock) 938 __acquires(rq->lock)
949{ 939{
940 struct rq *rq;
941
950 for (;;) { 942 for (;;) {
951 struct rq *rq = task_rq(p); 943 rq = task_rq(p);
952 raw_spin_lock(&rq->lock); 944 raw_spin_lock(&rq->lock);
953 if (likely(rq == task_rq(p))) 945 if (likely(rq == task_rq(p)))
954 return rq; 946 return rq;
@@ -976,14 +968,6 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
976 } 968 }
977} 969}
978 970
979void task_rq_unlock_wait(struct task_struct *p)
980{
981 struct rq *rq = task_rq(p);
982
983 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
984 raw_spin_unlock_wait(&rq->lock);
985}
986
987static void __task_rq_unlock(struct rq *rq) 971static void __task_rq_unlock(struct rq *rq)
988 __releases(rq->lock) 972 __releases(rq->lock)
989{ 973{
@@ -1247,6 +1231,17 @@ void wake_up_idle_cpu(int cpu)
1247 if (!tsk_is_polling(rq->idle)) 1231 if (!tsk_is_polling(rq->idle))
1248 smp_send_reschedule(cpu); 1232 smp_send_reschedule(cpu);
1249} 1233}
1234
1235int nohz_ratelimit(int cpu)
1236{
1237 struct rq *rq = cpu_rq(cpu);
1238 u64 diff = rq->clock - rq->nohz_stamp;
1239
1240 rq->nohz_stamp = rq->clock;
1241
1242 return diff < (NSEC_PER_SEC / HZ) >> 1;
1243}
1244
1250#endif /* CONFIG_NO_HZ */ 1245#endif /* CONFIG_NO_HZ */
1251 1246
1252static u64 sched_avg_period(void) 1247static u64 sched_avg_period(void)
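nohz_ratelimit(), added just above, returns true when less than half a tick has elapsed since the previous call on that CPU (and refreshes nohz_stamp each time), which the nohz path can use to decline stopping the tick for a CPU that keeps waking straight back up. Worked example: with HZ=1000, (NSEC_PER_SEC / HZ) >> 1 is 500,000 ns, so a CPU checked again 0.3 ms after the previous check is rate-limited, while one checked 2 ms later is not.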
@@ -1259,6 +1254,12 @@ static void sched_avg_update(struct rq *rq)
1259 s64 period = sched_avg_period(); 1254 s64 period = sched_avg_period();
1260 1255
1261 while ((s64)(rq->clock - rq->age_stamp) > period) { 1256 while ((s64)(rq->clock - rq->age_stamp) > period) {
1257 /*
1258 * Inline assembly required to prevent the compiler
1259 * optimising this loop into a divmod call.
1260 * See __iter_div_u64_rem() for another example of this.
1261 */
1262 asm("" : "+rm" (rq->age_stamp));
1262 rq->age_stamp += period; 1263 rq->age_stamp += period;
1263 rq->rt_avg /= 2; 1264 rq->rt_avg /= 2;
1264 } 1265 }
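The empty asm with a "+rm" constraint makes rq->age_stamp look as if it were modified by something the compiler cannot see, so GCC keeps the subtract-per-iteration loop instead of strength-reducing it into a 64-bit divide/modulo (expensive, and a libgcc call on some 32-bit targets). The same trick in isolation, mirroring __iter_div_u64_rem (names are illustrative):

#include <linux/types.h>

/* Advance *stamp toward 'now' in whole periods without emitting a u64 divide. */
static unsigned long consume_periods(u64 *stamp, u64 now, u64 period)
{
	unsigned long n = 0;

	while ((s64)(now - *stamp) > (s64)period) {
		/* Opaque no-op: keeps the compiler from rewriting the
		 * loop as a divmod, exactly as in the hunk above. */
		asm("" : "+rm" (*stamp));
		*stamp += period;
		n++;
	}
	return n;
}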
@@ -1390,32 +1391,6 @@ static const u32 prio_to_wmult[40] = {
1390 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1391 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1391}; 1392};
1392 1393
1393static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1394
1395/*
1396 * runqueue iterator, to support SMP load-balancing between different
1397 * scheduling classes, without having to expose their internal data
1398 * structures to the load-balancing proper:
1399 */
1400struct rq_iterator {
1401 void *arg;
1402 struct task_struct *(*start)(void *);
1403 struct task_struct *(*next)(void *);
1404};
1405
1406#ifdef CONFIG_SMP
1407static unsigned long
1408balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1409 unsigned long max_load_move, struct sched_domain *sd,
1410 enum cpu_idle_type idle, int *all_pinned,
1411 int *this_best_prio, struct rq_iterator *iterator);
1412
1413static int
1414iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1415 struct sched_domain *sd, enum cpu_idle_type idle,
1416 struct rq_iterator *iterator);
1417#endif
1418
1419/* Time spent by the tasks of the cpu accounting group executing in ... */ 1394/* Time spent by the tasks of the cpu accounting group executing in ... */
1420enum cpuacct_stat_index { 1395enum cpuacct_stat_index {
1421 CPUACCT_STAT_USER, /* ... user mode */ 1396 CPUACCT_STAT_USER, /* ... user mode */
@@ -1529,24 +1504,9 @@ static unsigned long target_load(int cpu, int type)
1529 return max(rq->cpu_load[type-1], total); 1504 return max(rq->cpu_load[type-1], total);
1530} 1505}
1531 1506
1532static struct sched_group *group_of(int cpu)
1533{
1534 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1535
1536 if (!sd)
1537 return NULL;
1538
1539 return sd->groups;
1540}
1541
1542static unsigned long power_of(int cpu) 1507static unsigned long power_of(int cpu)
1543{ 1508{
1544 struct sched_group *group = group_of(cpu); 1509 return cpu_rq(cpu)->cpu_power;
1545
1546 if (!group)
1547 return SCHED_LOAD_SCALE;
1548
1549 return group->cpu_power;
1550} 1510}
1551 1511
1552static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1512static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
@@ -1566,7 +1526,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1566 1526
1567#ifdef CONFIG_FAIR_GROUP_SCHED 1527#ifdef CONFIG_FAIR_GROUP_SCHED
1568 1528
1569static __read_mostly unsigned long *update_shares_data; 1529static __read_mostly unsigned long __percpu *update_shares_data;
1570 1530
1571static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1531static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1572 1532
@@ -1701,21 +1661,8 @@ static void update_shares(struct sched_domain *sd)
1701 } 1661 }
1702} 1662}
1703 1663
1704static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1705{
1706 if (root_task_group_empty())
1707 return;
1708
1709 raw_spin_unlock(&rq->lock);
1710 update_shares(sd);
1711 raw_spin_lock(&rq->lock);
1712}
1713
1714static void update_h_load(long cpu) 1664static void update_h_load(long cpu)
1715{ 1665{
1716 if (root_task_group_empty())
1717 return;
1718
1719 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1666 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1720} 1667}
1721 1668
@@ -1725,10 +1672,6 @@ static inline void update_shares(struct sched_domain *sd)
1725{ 1672{
1726} 1673}
1727 1674
1728static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1729{
1730}
1731
1732#endif 1675#endif
1733 1676
1734#ifdef CONFIG_PREEMPT 1677#ifdef CONFIG_PREEMPT
@@ -1805,6 +1748,49 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 raw_spin_unlock(&busiest->lock); 1748 raw_spin_unlock(&busiest->lock);
1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1749 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1807} 1750}
1751
1752/*
1753 * double_rq_lock - safely lock two runqueues
1754 *
1755 * Note this does not disable interrupts like task_rq_lock,
1756 * you need to do so manually before calling.
1757 */
1758static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1759 __acquires(rq1->lock)
1760 __acquires(rq2->lock)
1761{
1762 BUG_ON(!irqs_disabled());
1763 if (rq1 == rq2) {
1764 raw_spin_lock(&rq1->lock);
1765 __acquire(rq2->lock); /* Fake it out ;) */
1766 } else {
1767 if (rq1 < rq2) {
1768 raw_spin_lock(&rq1->lock);
1769 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1770 } else {
1771 raw_spin_lock(&rq2->lock);
1772 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1773 }
1774 }
1775}
1776
1777/*
1778 * double_rq_unlock - safely unlock two runqueues
1779 *
1780 * Note this does not restore interrupts like task_rq_unlock,
1781 * you need to do so manually after calling.
1782 */
1783static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1784 __releases(rq1->lock)
1785 __releases(rq2->lock)
1786{
1787 raw_spin_unlock(&rq1->lock);
1788 if (rq1 != rq2)
1789 raw_spin_unlock(&rq2->lock);
1790 else
1791 __release(rq2->lock);
1792}
1793
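double_rq_lock()/double_rq_unlock() are moved up here from the load-balancing code removed later in this diff (the old copies, deleted further down, additionally refreshed both rq clocks). The key property is the address-based ordering: whenever two distinct runqueues are needed, the lower-addressed lock is taken first, so every path follows one global order and ABBA deadlock is impossible. The same discipline on ordinary spinlocks, as a standalone sketch (names are illustrative):

#include <linux/spinlock.h>

/* Lock two objects in a globally consistent (address) order. */
static void lock_pair(spinlock_t *a, spinlock_t *b)
{
	if (a == b) {
		spin_lock(a);
	} else if (a < b) {
		spin_lock(a);
		spin_lock_nested(b, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(b);
		spin_lock_nested(a, SINGLE_DEPTH_NESTING);
	}
}

static void unlock_pair(spinlock_t *a, spinlock_t *b)
{
	spin_unlock(a);
	if (a != b)
		spin_unlock(b);
}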
1808#endif 1794#endif
1809 1795
1810#ifdef CONFIG_FAIR_GROUP_SCHED 1796#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1816,7 +1802,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1816} 1802}
1817#endif 1803#endif
1818 1804
1819static void calc_load_account_active(struct rq *this_rq); 1805static void calc_load_account_idle(struct rq *this_rq);
1820static void update_sysctl(void); 1806static void update_sysctl(void);
1821static int get_update_sysctl_factor(void); 1807static int get_update_sysctl_factor(void);
1822 1808
@@ -1834,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1834#endif 1820#endif
1835} 1821}
1836 1822
1837#include "sched_stats.h" 1823static const struct sched_class rt_sched_class;
1838#include "sched_idletask.c"
1839#include "sched_fair.c"
1840#include "sched_rt.c"
1841#ifdef CONFIG_SCHED_DEBUG
1842# include "sched_debug.c"
1843#endif
1844 1824
1845#define sched_class_highest (&rt_sched_class) 1825#define sched_class_highest (&rt_sched_class)
1846#define for_each_class(class) \ 1826#define for_each_class(class) \
1847 for (class = sched_class_highest; class; class = class->next) 1827 for (class = sched_class_highest; class; class = class->next)
1848 1828
1829#include "sched_stats.h"
1830
1849static void inc_nr_running(struct rq *rq) 1831static void inc_nr_running(struct rq *rq)
1850{ 1832{
1851 rq->nr_running++; 1833 rq->nr_running++;
@@ -1859,8 +1841,8 @@ static void dec_nr_running(struct rq *rq)
1859static void set_load_weight(struct task_struct *p) 1841static void set_load_weight(struct task_struct *p)
1860{ 1842{
1861 if (task_has_rt_policy(p)) { 1843 if (task_has_rt_policy(p)) {
1862 p->se.load.weight = prio_to_weight[0] * 2; 1844 p->se.load.weight = 0;
1863 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 1845 p->se.load.inv_weight = WMULT_CONST;
1864 return; 1846 return;
1865 } 1847 }
1866 1848
@@ -1877,40 +1859,53 @@ static void set_load_weight(struct task_struct *p)
1877 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1859 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1878} 1860}
1879 1861
1880static void update_avg(u64 *avg, u64 sample) 1862static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1881{ 1863{
1882 s64 diff = sample - *avg; 1864 update_rq_clock(rq);
1883 *avg += diff >> 3; 1865 sched_info_queued(p);
1866 p->sched_class->enqueue_task(rq, p, flags);
1867 p->se.on_rq = 1;
1884} 1868}
1885 1869
1886static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1870static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1887{ 1871{
1888 if (wakeup) 1872 update_rq_clock(rq);
1889 p->se.start_runtime = p->se.sum_exec_runtime; 1873 sched_info_dequeued(p);
1874 p->sched_class->dequeue_task(rq, p, flags);
1875 p->se.on_rq = 0;
1876}
1890 1877
1891 sched_info_queued(p); 1878/*
1892 p->sched_class->enqueue_task(rq, p, wakeup); 1879 * activate_task - move a task to the runqueue.
1893 p->se.on_rq = 1; 1880 */
1881static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1882{
1883 if (task_contributes_to_load(p))
1884 rq->nr_uninterruptible--;
1885
1886 enqueue_task(rq, p, flags);
1887 inc_nr_running(rq);
1894} 1888}
1895 1889
1896static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1890/*
1891 * deactivate_task - remove a task from the runqueue.
1892 */
1893static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1897{ 1894{
1898 if (sleep) { 1895 if (task_contributes_to_load(p))
1899 if (p->se.last_wakeup) { 1896 rq->nr_uninterruptible++;
1900 update_avg(&p->se.avg_overlap,
1901 p->se.sum_exec_runtime - p->se.last_wakeup);
1902 p->se.last_wakeup = 0;
1903 } else {
1904 update_avg(&p->se.avg_wakeup,
1905 sysctl_sched_wakeup_granularity);
1906 }
1907 }
1908 1897
1909 sched_info_dequeued(p); 1898 dequeue_task(rq, p, flags);
1910 p->sched_class->dequeue_task(rq, p, sleep); 1899 dec_nr_running(rq);
1911 p->se.on_rq = 0;
1912} 1900}
1913 1901
1902#include "sched_idletask.c"
1903#include "sched_fair.c"
1904#include "sched_rt.c"
1905#ifdef CONFIG_SCHED_DEBUG
1906# include "sched_debug.c"
1907#endif
1908
1914/* 1909/*
1915 * __normal_prio - return the priority that is based on the static prio 1910 * __normal_prio - return the priority that is based on the static prio
1916 */ 1911 */
@@ -1957,30 +1952,6 @@ static int effective_prio(struct task_struct *p)
1957 return p->prio; 1952 return p->prio;
1958} 1953}
1959 1954
1960/*
1961 * activate_task - move a task to the runqueue.
1962 */
1963static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1964{
1965 if (task_contributes_to_load(p))
1966 rq->nr_uninterruptible--;
1967
1968 enqueue_task(rq, p, wakeup);
1969 inc_nr_running(rq);
1970}
1971
1972/*
1973 * deactivate_task - remove a task from the runqueue.
1974 */
1975static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1976{
1977 if (task_contributes_to_load(p))
1978 rq->nr_uninterruptible++;
1979
1980 dequeue_task(rq, p, sleep);
1981 dec_nr_running(rq);
1982}
1983
1984/** 1955/**
1985 * task_curr - is this task currently executing on a CPU? 1956 * task_curr - is this task currently executing on a CPU?
1986 * @p: the task in question. 1957 * @p: the task in question.
@@ -2053,21 +2024,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2053 __set_task_cpu(p, new_cpu); 2024 __set_task_cpu(p, new_cpu);
2054} 2025}
2055 2026
2056struct migration_req { 2027struct migration_arg {
2057 struct list_head list;
2058
2059 struct task_struct *task; 2028 struct task_struct *task;
2060 int dest_cpu; 2029 int dest_cpu;
2061
2062 struct completion done;
2063}; 2030};
2064 2031
2032static int migration_cpu_stop(void *data);
2033
2065/* 2034/*
2066 * The task's runqueue lock must be held. 2035 * The task's runqueue lock must be held.
2067 * Returns true if you have to wait for migration thread. 2036 * Returns true if you have to wait for migration thread.
2068 */ 2037 */
2069static int 2038static bool migrate_task(struct task_struct *p, int dest_cpu)
2070migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2071{ 2039{
2072 struct rq *rq = task_rq(p); 2040 struct rq *rq = task_rq(p);
2073 2041
@@ -2075,58 +2043,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2075 * If the task is not on a runqueue (and not running), then 2043 * If the task is not on a runqueue (and not running), then
2076 * the next wake-up will properly place the task. 2044 * the next wake-up will properly place the task.
2077 */ 2045 */
2078 if (!p->se.on_rq && !task_running(rq, p)) 2046 return p->se.on_rq || task_running(rq, p);
2079 return 0;
2080
2081 init_completion(&req->done);
2082 req->task = p;
2083 req->dest_cpu = dest_cpu;
2084 list_add(&req->list, &rq->migration_queue);
2085
2086 return 1;
2087}
2088
2089/*
2090 * wait_task_context_switch - wait for a thread to complete at least one
2091 * context switch.
2092 *
2093 * @p must not be current.
2094 */
2095void wait_task_context_switch(struct task_struct *p)
2096{
2097 unsigned long nvcsw, nivcsw, flags;
2098 int running;
2099 struct rq *rq;
2100
2101 nvcsw = p->nvcsw;
2102 nivcsw = p->nivcsw;
2103 for (;;) {
2104 /*
2105 * The runqueue is assigned before the actual context
2106 * switch. We need to take the runqueue lock.
2107 *
2108 * We could check initially without the lock but it is
2109 * very likely that we need to take the lock in every
2110 * iteration.
2111 */
2112 rq = task_rq_lock(p, &flags);
2113 running = task_running(rq, p);
2114 task_rq_unlock(rq, &flags);
2115
2116 if (likely(!running))
2117 break;
2118 /*
2119 * The switch count is incremented before the actual
2120 * context switch. We thus wait for two switches to be
2121 * sure at least one completed.
2122 */
2123 if ((p->nvcsw - nvcsw) > 1)
2124 break;
2125 if ((p->nivcsw - nivcsw) > 1)
2126 break;
2127
2128 cpu_relax();
2129 }
2130} 2047}
2131 2048
2132/* 2049/*
@@ -2184,7 +2101,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2184 * just go back and repeat. 2101 * just go back and repeat.
2185 */ 2102 */
2186 rq = task_rq_lock(p, &flags); 2103 rq = task_rq_lock(p, &flags);
2187 trace_sched_wait_task(rq, p); 2104 trace_sched_wait_task(p);
2188 running = task_running(rq, p); 2105 running = task_running(rq, p);
2189 on_rq = p->se.on_rq; 2106 on_rq = p->se.on_rq;
2190 ncsw = 0; 2107 ncsw = 0;
@@ -2282,6 +2199,9 @@ void task_oncpu_function_call(struct task_struct *p,
2282} 2199}
2283 2200
2284#ifdef CONFIG_SMP 2201#ifdef CONFIG_SMP
2202/*
2203 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
2204 */
2285static int select_fallback_rq(int cpu, struct task_struct *p) 2205static int select_fallback_rq(int cpu, struct task_struct *p)
2286{ 2206{
2287 int dest_cpu; 2207 int dest_cpu;
@@ -2298,12 +2218,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2298 return dest_cpu; 2218 return dest_cpu;
2299 2219
2300 /* No more Mr. Nice Guy. */ 2220 /* No more Mr. Nice Guy. */
2301 if (dest_cpu >= nr_cpu_ids) { 2221 if (unlikely(dest_cpu >= nr_cpu_ids)) {
2302 rcu_read_lock(); 2222 dest_cpu = cpuset_cpus_allowed_fallback(p);
2303 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2304 rcu_read_unlock();
2305 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2306
2307 /* 2223 /*
2308 * Don't tell them about moving exiting tasks or 2224 * Don't tell them about moving exiting tasks or
2309 * kernel threads (both mm NULL), since they never 2225 * kernel threads (both mm NULL), since they never
@@ -2320,19 +2236,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2320} 2236}
2321 2237
2322/* 2238/*
2323 * Called from: 2239 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
2324 *
2325 * - fork, @p is stable because it isn't on the tasklist yet
2326 *
2327 * - exec, @p is unstable, retry loop
2328 *
2329 * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so
2330 * we should be good.
2331 */ 2240 */
2332static inline 2241static inline
2333int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2242int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2334{ 2243{
2335 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2244 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2336 2245
2337 /* 2246 /*
2338 * In order not to call set_task_cpu() on a blocking task we need 2247 * In order not to call set_task_cpu() on a blocking task we need
@@ -2350,6 +2259,12 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2350 2259
2351 return cpu; 2260 return cpu;
2352} 2261}
2262
2263static void update_avg(u64 *avg, u64 sample)
2264{
2265 s64 diff = sample - *avg;
2266 *avg += diff >> 3;
2267}
2353#endif 2268#endif
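update_avg() keeps an exponentially weighted moving average with a weight of 1/8: each sample pulls the average one eighth of the way toward itself, avg_new = avg + (sample - avg) / 8. Worked example: with avg = 800 and sample = 1600 the new average is 800 + (1600 - 800)/8 = 900; a following sample of 400 gives 900 + ((400 - 900) >> 3) = 900 - 63 = 837, since the signed shift rounds toward negative infinity.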
2354 2269
2355/*** 2270/***
@@ -2371,16 +2286,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2371{ 2286{
2372 int cpu, orig_cpu, this_cpu, success = 0; 2287 int cpu, orig_cpu, this_cpu, success = 0;
2373 unsigned long flags; 2288 unsigned long flags;
2374 struct rq *rq, *orig_rq; 2289 unsigned long en_flags = ENQUEUE_WAKEUP;
2375 2290 struct rq *rq;
2376 if (!sched_feat(SYNC_WAKEUPS))
2377 wake_flags &= ~WF_SYNC;
2378 2291
2379 this_cpu = get_cpu(); 2292 this_cpu = get_cpu();
2380 2293
2381 smp_wmb(); 2294 smp_wmb();
2382 rq = orig_rq = task_rq_lock(p, &flags); 2295 rq = task_rq_lock(p, &flags);
2383 update_rq_clock(rq);
2384 if (!(p->state & state)) 2296 if (!(p->state & state))
2385 goto out; 2297 goto out;
2386 2298
@@ -2400,24 +2312,35 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2400 * 2312 *
2401 * First fix up the nr_uninterruptible count: 2313 * First fix up the nr_uninterruptible count:
2402 */ 2314 */
2403 if (task_contributes_to_load(p)) 2315 if (task_contributes_to_load(p)) {
2404 rq->nr_uninterruptible--; 2316 if (likely(cpu_online(orig_cpu)))
2317 rq->nr_uninterruptible--;
2318 else
2319 this_rq()->nr_uninterruptible--;
2320 }
2405 p->state = TASK_WAKING; 2321 p->state = TASK_WAKING;
2406 2322
2407 if (p->sched_class->task_waking) 2323 if (p->sched_class->task_waking) {
2408 p->sched_class->task_waking(rq, p); 2324 p->sched_class->task_waking(rq, p);
2325 en_flags |= ENQUEUE_WAKING;
2326 }
2409 2327
2410 __task_rq_unlock(rq); 2328 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2411
2412 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2413 if (cpu != orig_cpu) 2329 if (cpu != orig_cpu)
2414 set_task_cpu(p, cpu); 2330 set_task_cpu(p, cpu);
2331 __task_rq_unlock(rq);
2415 2332
2416 rq = __task_rq_lock(p); 2333 rq = cpu_rq(cpu);
2417 update_rq_clock(rq); 2334 raw_spin_lock(&rq->lock);
2418 2335
2336 /*
2337 * We migrated the task without holding either rq->lock, however
2338 * since the task is not on the task list itself, nobody else
2339 * will try and migrate the task, hence the rq should match the
2340 * cpu we just moved it to.
2341 */
2342 WARN_ON(task_cpu(p) != cpu);
2419 WARN_ON(p->state != TASK_WAKING); 2343 WARN_ON(p->state != TASK_WAKING);
2420 cpu = task_cpu(p);
2421 2344
2422#ifdef CONFIG_SCHEDSTATS 2345#ifdef CONFIG_SCHEDSTATS
2423 schedstat_inc(rq, ttwu_count); 2346 schedstat_inc(rq, ttwu_count);
@@ -2436,36 +2359,20 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2436 2359
2437out_activate: 2360out_activate:
2438#endif /* CONFIG_SMP */ 2361#endif /* CONFIG_SMP */
2439 schedstat_inc(p, se.nr_wakeups); 2362 schedstat_inc(p, se.statistics.nr_wakeups);
2440 if (wake_flags & WF_SYNC) 2363 if (wake_flags & WF_SYNC)
2441 schedstat_inc(p, se.nr_wakeups_sync); 2364 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2442 if (orig_cpu != cpu) 2365 if (orig_cpu != cpu)
2443 schedstat_inc(p, se.nr_wakeups_migrate); 2366 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2444 if (cpu == this_cpu) 2367 if (cpu == this_cpu)
2445 schedstat_inc(p, se.nr_wakeups_local); 2368 schedstat_inc(p, se.statistics.nr_wakeups_local);
2446 else 2369 else
2447 schedstat_inc(p, se.nr_wakeups_remote); 2370 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2448 activate_task(rq, p, 1); 2371 activate_task(rq, p, en_flags);
2449 success = 1; 2372 success = 1;
2450 2373
2451 /*
2452 * Only attribute actual wakeups done by this task.
2453 */
2454 if (!in_interrupt()) {
2455 struct sched_entity *se = &current->se;
2456 u64 sample = se->sum_exec_runtime;
2457
2458 if (se->last_wakeup)
2459 sample -= se->last_wakeup;
2460 else
2461 sample -= se->start_runtime;
2462 update_avg(&se->avg_wakeup, sample);
2463
2464 se->last_wakeup = se->sum_exec_runtime;
2465 }
2466
2467out_running: 2374out_running:
2468 trace_sched_wakeup(rq, p, success); 2375 trace_sched_wakeup(p, success);
2469 check_preempt_curr(rq, p, wake_flags); 2376 check_preempt_curr(rq, p, wake_flags);
2470 2377
2471 p->state = TASK_RUNNING; 2378 p->state = TASK_RUNNING;
@@ -2525,42 +2432,9 @@ static void __sched_fork(struct task_struct *p)
2525 p->se.sum_exec_runtime = 0; 2432 p->se.sum_exec_runtime = 0;
2526 p->se.prev_sum_exec_runtime = 0; 2433 p->se.prev_sum_exec_runtime = 0;
2527 p->se.nr_migrations = 0; 2434 p->se.nr_migrations = 0;
2528 p->se.last_wakeup = 0;
2529 p->se.avg_overlap = 0;
2530 p->se.start_runtime = 0;
2531 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2532 2435
2533#ifdef CONFIG_SCHEDSTATS 2436#ifdef CONFIG_SCHEDSTATS
2534 p->se.wait_start = 0; 2437 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2535 p->se.wait_max = 0;
2536 p->se.wait_count = 0;
2537 p->se.wait_sum = 0;
2538
2539 p->se.sleep_start = 0;
2540 p->se.sleep_max = 0;
2541 p->se.sum_sleep_runtime = 0;
2542
2543 p->se.block_start = 0;
2544 p->se.block_max = 0;
2545 p->se.exec_max = 0;
2546 p->se.slice_max = 0;
2547
2548 p->se.nr_migrations_cold = 0;
2549 p->se.nr_failed_migrations_affine = 0;
2550 p->se.nr_failed_migrations_running = 0;
2551 p->se.nr_failed_migrations_hot = 0;
2552 p->se.nr_forced_migrations = 0;
2553
2554 p->se.nr_wakeups = 0;
2555 p->se.nr_wakeups_sync = 0;
2556 p->se.nr_wakeups_migrate = 0;
2557 p->se.nr_wakeups_local = 0;
2558 p->se.nr_wakeups_remote = 0;
2559 p->se.nr_wakeups_affine = 0;
2560 p->se.nr_wakeups_affine_attempts = 0;
2561 p->se.nr_wakeups_passive = 0;
2562 p->se.nr_wakeups_idle = 0;
2563
2564#endif 2438#endif
2565 2439
2566 INIT_LIST_HEAD(&p->rt.run_list); 2440 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2581,11 +2455,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2581 2455
2582 __sched_fork(p); 2456 __sched_fork(p);
2583 /* 2457 /*
2584 * We mark the process as waking here. This guarantees that 2458 * We mark the process as running here. This guarantees that
2585 * nobody will actually run it, and a signal or other external 2459 * nobody will actually run it, and a signal or other external
2586 * event cannot wake it up and insert it on the runqueue either. 2460 * event cannot wake it up and insert it on the runqueue either.
2587 */ 2461 */
2588 p->state = TASK_WAKING; 2462 p->state = TASK_RUNNING;
2589 2463
2590 /* 2464 /*
2591 * Revert to default priority/policy on fork if requested. 2465 * Revert to default priority/policy on fork if requested.
@@ -2620,10 +2494,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
2620 if (p->sched_class->task_fork) 2494 if (p->sched_class->task_fork)
2621 p->sched_class->task_fork(p); 2495 p->sched_class->task_fork(p);
2622 2496
2623#ifdef CONFIG_SMP 2497 /*
2624 cpu = select_task_rq(p, SD_BALANCE_FORK, 0); 2498 * The child is not yet in the pid-hash so no cgroup attach races,
2625#endif 2499 * and the cgroup is pinned to this child due to cgroup_fork()
2500 * is ran before sched_fork().
2501 *
2502 * Silence PROVE_RCU.
2503 */
2504 rcu_read_lock();
2626 set_task_cpu(p, cpu); 2505 set_task_cpu(p, cpu);
2506 rcu_read_unlock();
2627 2507
2628#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2508#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2629 if (likely(sched_info_on())) 2509 if (likely(sched_info_on()))
@@ -2652,19 +2532,37 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2652{ 2532{
2653 unsigned long flags; 2533 unsigned long flags;
2654 struct rq *rq; 2534 struct rq *rq;
2535 int cpu __maybe_unused = get_cpu();
2655 2536
2537#ifdef CONFIG_SMP
2656 rq = task_rq_lock(p, &flags); 2538 rq = task_rq_lock(p, &flags);
2657 BUG_ON(p->state != TASK_WAKING); 2539 p->state = TASK_WAKING;
2540
2541 /*
2542 * Fork balancing, do it here and not earlier because:
2543 * - cpus_allowed can change in the fork path
2544 * - any previously selected cpu might disappear through hotplug
2545 *
2546 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2547 * without people poking at ->cpus_allowed.
2548 */
2549 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2550 set_task_cpu(p, cpu);
2551
2658 p->state = TASK_RUNNING; 2552 p->state = TASK_RUNNING;
2659 update_rq_clock(rq); 2553 task_rq_unlock(rq, &flags);
2554#endif
2555
2556 rq = task_rq_lock(p, &flags);
2660 activate_task(rq, p, 0); 2557 activate_task(rq, p, 0);
2661 trace_sched_wakeup_new(rq, p, 1); 2558 trace_sched_wakeup_new(p, 1);
2662 check_preempt_curr(rq, p, WF_FORK); 2559 check_preempt_curr(rq, p, WF_FORK);
2663#ifdef CONFIG_SMP 2560#ifdef CONFIG_SMP
2664 if (p->sched_class->task_woken) 2561 if (p->sched_class->task_woken)
2665 p->sched_class->task_woken(rq, p); 2562 p->sched_class->task_woken(rq, p);
2666#endif 2563#endif
2667 task_rq_unlock(rq, &flags); 2564 task_rq_unlock(rq, &flags);
2565 put_cpu();
2668} 2566}
2669 2567
2670#ifdef CONFIG_PREEMPT_NOTIFIERS 2568#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2783,7 +2681,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2783 */ 2681 */
2784 prev_state = prev->state; 2682 prev_state = prev->state;
2785 finish_arch_switch(prev); 2683 finish_arch_switch(prev);
2786 perf_event_task_sched_in(current, cpu_of(rq)); 2684#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2685 local_irq_disable();
2686#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2687 perf_event_task_sched_in(current);
2688#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2689 local_irq_enable();
2690#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2787 finish_lock_switch(rq, prev); 2691 finish_lock_switch(rq, prev);
2788 2692
2789 fire_sched_in_preempt_notifiers(current); 2693 fire_sched_in_preempt_notifiers(current);
@@ -2871,7 +2775,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2871 struct mm_struct *mm, *oldmm; 2775 struct mm_struct *mm, *oldmm;
2872 2776
2873 prepare_task_switch(rq, prev, next); 2777 prepare_task_switch(rq, prev, next);
2874 trace_sched_switch(rq, prev, next); 2778 trace_sched_switch(prev, next);
2875 mm = next->mm; 2779 mm = next->mm;
2876 oldmm = prev->active_mm; 2780 oldmm = prev->active_mm;
2877 /* 2781 /*
@@ -2969,9 +2873,9 @@ unsigned long nr_iowait(void)
2969 return sum; 2873 return sum;
2970} 2874}
2971 2875
2972unsigned long nr_iowait_cpu(void) 2876unsigned long nr_iowait_cpu(int cpu)
2973{ 2877{
2974 struct rq *this = this_rq(); 2878 struct rq *this = cpu_rq(cpu);
2975 return atomic_read(&this->nr_iowait); 2879 return atomic_read(&this->nr_iowait);
2976} 2880}
2977 2881
@@ -2988,6 +2892,61 @@ static unsigned long calc_load_update;
2988unsigned long avenrun[3]; 2892unsigned long avenrun[3];
2989EXPORT_SYMBOL(avenrun); 2893EXPORT_SYMBOL(avenrun);
2990 2894
2895static long calc_load_fold_active(struct rq *this_rq)
2896{
2897 long nr_active, delta = 0;
2898
2899 nr_active = this_rq->nr_running;
2900 nr_active += (long) this_rq->nr_uninterruptible;
2901
2902 if (nr_active != this_rq->calc_load_active) {
2903 delta = nr_active - this_rq->calc_load_active;
2904 this_rq->calc_load_active = nr_active;
2905 }
2906
2907 return delta;
2908}
2909
2910#ifdef CONFIG_NO_HZ
2911/*
2912 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
2913 *
2914 * When making the ILB scale, we should try to pull this in as well.
2915 */
2916static atomic_long_t calc_load_tasks_idle;
2917
2918static void calc_load_account_idle(struct rq *this_rq)
2919{
2920 long delta;
2921
2922 delta = calc_load_fold_active(this_rq);
2923 if (delta)
2924 atomic_long_add(delta, &calc_load_tasks_idle);
2925}
2926
2927static long calc_load_fold_idle(void)
2928{
2929 long delta = 0;
2930
2931 /*
2932 * It's got a race; we don't care...
2933 */
2934 if (atomic_long_read(&calc_load_tasks_idle))
2935 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
2936
2937 return delta;
2938}
2939#else
2940static void calc_load_account_idle(struct rq *this_rq)
2941{
2942}
2943
2944static inline long calc_load_fold_idle(void)
2945{
2946 return 0;
2947}
2948#endif
2949
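The NO_HZ variant parks a CPU's load delta instead of applying it right away: calc_load_account_idle() is meant to run as a CPU goes idle (its call site, in the idle scheduling class, is outside the hunks shown here; note the earlier rename of the forward declaration from calc_load_account_active to calc_load_account_idle), and any change relative to that CPU's last snapshot is accumulated in calc_load_tasks_idle. Worked example: a CPU whose last snapshot was 3 contributing tasks goes idle with 0, so it adds -3 to calc_load_tasks_idle and stops ticking; at the next LOAD_FREQ boundary a busy CPU's calc_load_account_active() (reworked just below) xchg's that -3 out through calc_load_fold_idle() and folds it, together with its own delta, into the global calc_load_tasks that calc_global_load() samples.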
2991/** 2950/**
2992 * get_avenrun - get the load average array 2951 * get_avenrun - get the load average array
2993 * @loads: pointer to dest load array 2952 * @loads: pointer to dest load array
@@ -3034,20 +2993,22 @@ void calc_global_load(void)
3034} 2993}
3035 2994
3036/* 2995/*
3037 * Either called from update_cpu_load() or from a cpu going idle 2996 * Called from update_cpu_load() to periodically update this CPU's
2997 * active count.
3038 */ 2998 */
3039static void calc_load_account_active(struct rq *this_rq) 2999static void calc_load_account_active(struct rq *this_rq)
3040{ 3000{
3041 long nr_active, delta; 3001 long delta;
3042 3002
3043 nr_active = this_rq->nr_running; 3003 if (time_before(jiffies, this_rq->calc_load_update))
3044 nr_active += (long) this_rq->nr_uninterruptible; 3004 return;
3045 3005
3046 if (nr_active != this_rq->calc_load_active) { 3006 delta = calc_load_fold_active(this_rq);
3047 delta = nr_active - this_rq->calc_load_active; 3007 delta += calc_load_fold_idle();
3048 this_rq->calc_load_active = nr_active; 3008 if (delta)
3049 atomic_long_add(delta, &calc_load_tasks); 3009 atomic_long_add(delta, &calc_load_tasks);
3050 } 3010
3011 this_rq->calc_load_update += LOAD_FREQ;
3051} 3012}
3052 3013
3053/* 3014/*
@@ -3079,1871 +3040,42 @@ static void update_cpu_load(struct rq *this_rq)
3079 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3040 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3080 } 3041 }
3081 3042
3082 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3043 calc_load_account_active(this_rq);
3083 this_rq->calc_load_update += LOAD_FREQ;
3084 calc_load_account_active(this_rq);
3085 }
3086} 3044}
3087 3045
3088#ifdef CONFIG_SMP 3046#ifdef CONFIG_SMP
3089 3047
3090/* 3048/*
3091 * double_rq_lock - safely lock two runqueues
3092 *
3093 * Note this does not disable interrupts like task_rq_lock,
3094 * you need to do so manually before calling.
3095 */
3096static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3097 __acquires(rq1->lock)
3098 __acquires(rq2->lock)
3099{
3100 BUG_ON(!irqs_disabled());
3101 if (rq1 == rq2) {
3102 raw_spin_lock(&rq1->lock);
3103 __acquire(rq2->lock); /* Fake it out ;) */
3104 } else {
3105 if (rq1 < rq2) {
3106 raw_spin_lock(&rq1->lock);
3107 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3108 } else {
3109 raw_spin_lock(&rq2->lock);
3110 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3111 }
3112 }
3113 update_rq_clock(rq1);
3114 update_rq_clock(rq2);
3115}
3116
3117/*
3118 * double_rq_unlock - safely unlock two runqueues
3119 *
3120 * Note this does not restore interrupts like task_rq_unlock,
3121 * you need to do so manually after calling.
3122 */
3123static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3124 __releases(rq1->lock)
3125 __releases(rq2->lock)
3126{
3127 raw_spin_unlock(&rq1->lock);
3128 if (rq1 != rq2)
3129 raw_spin_unlock(&rq2->lock);
3130 else
3131 __release(rq2->lock);
3132}
3133
3134/*
3135 * sched_exec - execve() is a valuable balancing opportunity, because at 3049 * sched_exec - execve() is a valuable balancing opportunity, because at
3136 * this point the task has the smallest effective memory and cache footprint. 3050 * this point the task has the smallest effective memory and cache footprint.
3137 */ 3051 */
3138void sched_exec(void) 3052void sched_exec(void)
3139{ 3053{
3140 struct task_struct *p = current; 3054 struct task_struct *p = current;
3141 struct migration_req req;
3142 int dest_cpu, this_cpu;
3143 unsigned long flags; 3055 unsigned long flags;
3144 struct rq *rq; 3056 struct rq *rq;
3145 3057 int dest_cpu;
3146again:
3147 this_cpu = get_cpu();
3148 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3149 if (dest_cpu == this_cpu) {
3150 put_cpu();
3151 return;
3152 }
3153 3058
3154 rq = task_rq_lock(p, &flags); 3059 rq = task_rq_lock(p, &flags);
3155 put_cpu(); 3060 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3061 if (dest_cpu == smp_processor_id())
3062 goto unlock;
3156 3063
3157 /* 3064 /*
3158 * select_task_rq() can race against ->cpus_allowed 3065 * select_task_rq() can race against ->cpus_allowed
3159 */ 3066 */
3160 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3067 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3161 || unlikely(!cpu_active(dest_cpu))) { 3068 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3162 task_rq_unlock(rq, &flags); 3069 struct migration_arg arg = { p, dest_cpu };
3163 goto again;
3164 }
3165 3070
3166 /* force the process onto the specified CPU */
3167 if (migrate_task(p, dest_cpu, &req)) {
3168 /* Need to wait for migration thread (might exit: take ref). */
3169 struct task_struct *mt = rq->migration_thread;
3170
3171 get_task_struct(mt);
3172 task_rq_unlock(rq, &flags); 3071 task_rq_unlock(rq, &flags);
3173 wake_up_process(mt); 3072 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3174 put_task_struct(mt);
3175 wait_for_completion(&req.done);
3176
3177 return; 3073 return;
3178 } 3074 }
3075unlock:
3179 task_rq_unlock(rq, &flags); 3076 task_rq_unlock(rq, &flags);
3180} 3077}
3181 3078
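sched_exec() no longer queues a migration_req and wakes a per-runqueue migration thread; it fills in a migration_arg and has the generic cpu-stop machinery run migration_cpu_stop() on the source CPU, with stop_one_cpu() waiting for it to finish, which is why the retry loop and the completion bookkeeping disappear. The same primitive can be used directly elsewhere; a tiny hedged sketch (callback name and message are invented):

#include <linux/stop_machine.h>
#include <linux/kernel.h>
#include <linux/smp.h>

/* Runs in the per-cpu stopper context on the target CPU. */
static int say_hello(void *arg)
{
	printk(KERN_INFO "hello from CPU %d\n", smp_processor_id());
	return 0;
}

static void demo_stop_one_cpu(unsigned int cpu)
{
	/* Blocks until say_hello() has run on @cpu (its result is ignored here). */
	stop_one_cpu(cpu, say_hello, NULL);
}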
3182/*
3183 * pull_task - move a task from a remote runqueue to the local runqueue.
3184 * Both runqueues must be locked.
3185 */
3186static void pull_task(struct rq *src_rq, struct task_struct *p,
3187 struct rq *this_rq, int this_cpu)
3188{
3189 deactivate_task(src_rq, p, 0);
3190 set_task_cpu(p, this_cpu);
3191 activate_task(this_rq, p, 0);
3192 check_preempt_curr(this_rq, p, 0);
3193}
3194
3195/*
3196 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3197 */
3198static
3199int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3200 struct sched_domain *sd, enum cpu_idle_type idle,
3201 int *all_pinned)
3202{
3203 int tsk_cache_hot = 0;
3204 /*
3205 * We do not migrate tasks that are:
3206 * 1) running (obviously), or
3207 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3208 * 3) are cache-hot on their current CPU.
3209 */
3210 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3211 schedstat_inc(p, se.nr_failed_migrations_affine);
3212 return 0;
3213 }
3214 *all_pinned = 0;
3215
3216 if (task_running(rq, p)) {
3217 schedstat_inc(p, se.nr_failed_migrations_running);
3218 return 0;
3219 }
3220
3221 /*
3222 * Aggressive migration if:
3223 * 1) task is cache cold, or
3224 * 2) too many balance attempts have failed.
3225 */
3226
3227 tsk_cache_hot = task_hot(p, rq->clock, sd);
3228 if (!tsk_cache_hot ||
3229 sd->nr_balance_failed > sd->cache_nice_tries) {
3230#ifdef CONFIG_SCHEDSTATS
3231 if (tsk_cache_hot) {
3232 schedstat_inc(sd, lb_hot_gained[idle]);
3233 schedstat_inc(p, se.nr_forced_migrations);
3234 }
3235#endif
3236 return 1;
3237 }
3238
3239 if (tsk_cache_hot) {
3240 schedstat_inc(p, se.nr_failed_migrations_hot);
3241 return 0;
3242 }
3243 return 1;
3244}
3245
3246static unsigned long
3247balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3248 unsigned long max_load_move, struct sched_domain *sd,
3249 enum cpu_idle_type idle, int *all_pinned,
3250 int *this_best_prio, struct rq_iterator *iterator)
3251{
3252 int loops = 0, pulled = 0, pinned = 0;
3253 struct task_struct *p;
3254 long rem_load_move = max_load_move;
3255
3256 if (max_load_move == 0)
3257 goto out;
3258
3259 pinned = 1;
3260
3261 /*
3262 * Start the load-balancing iterator:
3263 */
3264 p = iterator->start(iterator->arg);
3265next:
3266 if (!p || loops++ > sysctl_sched_nr_migrate)
3267 goto out;
3268
3269 if ((p->se.load.weight >> 1) > rem_load_move ||
3270 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3271 p = iterator->next(iterator->arg);
3272 goto next;
3273 }
3274
3275 pull_task(busiest, p, this_rq, this_cpu);
3276 pulled++;
3277 rem_load_move -= p->se.load.weight;
3278
3279#ifdef CONFIG_PREEMPT
3280 /*
3281 * NEWIDLE balancing is a source of latency, so preemptible kernels
3282 * will stop after the first task is pulled to minimize the critical
3283 * section.
3284 */
3285 if (idle == CPU_NEWLY_IDLE)
3286 goto out;
3287#endif
3288
3289 /*
3290 * We only want to steal up to the prescribed amount of weighted load.
3291 */
3292 if (rem_load_move > 0) {
3293 if (p->prio < *this_best_prio)
3294 *this_best_prio = p->prio;
3295 p = iterator->next(iterator->arg);
3296 goto next;
3297 }
3298out:
3299 /*
3300 * Right now, this is one of only two places pull_task() is called,
3301 * so we can safely collect pull_task() stats here rather than
3302 * inside pull_task().
3303 */
3304 schedstat_add(sd, lb_gained[idle], pulled);
3305
3306 if (all_pinned)
3307 *all_pinned = pinned;
3308
3309 return max_load_move - rem_load_move;
3310}
3311
3312/*
3313 * move_tasks tries to move up to max_load_move weighted load from busiest to
3314 * this_rq, as part of a balancing operation within domain "sd".
3315 * Returns 1 if successful and 0 otherwise.
3316 *
3317 * Called with both runqueues locked.
3318 */
3319static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3320 unsigned long max_load_move,
3321 struct sched_domain *sd, enum cpu_idle_type idle,
3322 int *all_pinned)
3323{
3324 const struct sched_class *class = sched_class_highest;
3325 unsigned long total_load_moved = 0;
3326 int this_best_prio = this_rq->curr->prio;
3327
3328 do {
3329 total_load_moved +=
3330 class->load_balance(this_rq, this_cpu, busiest,
3331 max_load_move - total_load_moved,
3332 sd, idle, all_pinned, &this_best_prio);
3333 class = class->next;
3334
3335#ifdef CONFIG_PREEMPT
3336 /*
3337 * NEWIDLE balancing is a source of latency, so preemptible
3338 * kernels will stop after the first task is pulled to minimize
3339 * the critical section.
3340 */
3341 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3342 break;
3343#endif
3344 } while (class && max_load_move > total_load_moved);
3345
3346 return total_load_moved > 0;
3347}
3348
3349static int
3350iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3351 struct sched_domain *sd, enum cpu_idle_type idle,
3352 struct rq_iterator *iterator)
3353{
3354 struct task_struct *p = iterator->start(iterator->arg);
3355 int pinned = 0;
3356
3357 while (p) {
3358 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3359 pull_task(busiest, p, this_rq, this_cpu);
3360 /*
3361 * Right now, this is only the second place pull_task()
3362 * is called, so we can safely collect pull_task()
3363 * stats here rather than inside pull_task().
3364 */
3365 schedstat_inc(sd, lb_gained[idle]);
3366
3367 return 1;
3368 }
3369 p = iterator->next(iterator->arg);
3370 }
3371
3372 return 0;
3373}
3374
3375/*
3376 * move_one_task tries to move exactly one task from busiest to this_rq, as
3377 * part of active balancing operations within "domain".
3378 * Returns 1 if successful and 0 otherwise.
3379 *
3380 * Called with both runqueues locked.
3381 */
3382static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3383 struct sched_domain *sd, enum cpu_idle_type idle)
3384{
3385 const struct sched_class *class;
3386
3387 for_each_class(class) {
3388 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3389 return 1;
3390 }
3391
3392 return 0;
3393}
3394/********** Helpers for find_busiest_group ************************/
3395/*
3396 * sd_lb_stats - Structure to store the statistics of a sched_domain
3397 * during load balancing.
3398 */
3399struct sd_lb_stats {
3400 struct sched_group *busiest; /* Busiest group in this sd */
3401 struct sched_group *this; /* Local group in this sd */
3402 unsigned long total_load; /* Total load of all groups in sd */
3403 unsigned long total_pwr; /* Total power of all groups in sd */
3404 unsigned long avg_load; /* Average load across all groups in sd */
3405
3406 /** Statistics of this group */
3407 unsigned long this_load;
3408 unsigned long this_load_per_task;
3409 unsigned long this_nr_running;
3410
3411 /* Statistics of the busiest group */
3412 unsigned long max_load;
3413 unsigned long busiest_load_per_task;
3414 unsigned long busiest_nr_running;
3415
3416 int group_imb; /* Is there imbalance in this sd */
3417#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3418 int power_savings_balance; /* Is powersave balance needed for this sd */
3419 struct sched_group *group_min; /* Least loaded group in sd */
3420 struct sched_group *group_leader; /* Group which relieves group_min */
3421 unsigned long min_load_per_task; /* load_per_task in group_min */
3422 unsigned long leader_nr_running; /* Nr running of group_leader */
3423 unsigned long min_nr_running; /* Nr running of group_min */
3424#endif
3425};
3426
3427/*
3428 * sg_lb_stats - stats of a sched_group required for load_balancing
3429 */
3430struct sg_lb_stats {
3431 unsigned long avg_load; /* Avg load across the CPUs of the group */
3432 unsigned long group_load; /* Total load over the CPUs of the group */
3433 unsigned long sum_nr_running; /* Nr tasks running in the group */
3434 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3435 unsigned long group_capacity;
3436 int group_imb; /* Is there an imbalance in the group ? */
3437};
3438
3439/**
3440 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3441 * @group: The group whose first cpu is to be returned.
3442 */
3443static inline unsigned int group_first_cpu(struct sched_group *group)
3444{
3445 return cpumask_first(sched_group_cpus(group));
3446}
3447
3448/**
3449 * get_sd_load_idx - Obtain the load index for a given sched domain.
3450 * @sd: The sched_domain whose load_idx is to be obtained.
3451 * @idle: The idle status of the CPU whose sched_domain's load_idx is obtained.
3452 */
3453static inline int get_sd_load_idx(struct sched_domain *sd,
3454 enum cpu_idle_type idle)
3455{
3456 int load_idx;
3457
3458 switch (idle) {
3459 case CPU_NOT_IDLE:
3460 load_idx = sd->busy_idx;
3461 break;
3462
3463 case CPU_NEWLY_IDLE:
3464 load_idx = sd->newidle_idx;
3465 break;
3466 default:
3467 load_idx = sd->idle_idx;
3468 break;
3469 }
3470
3471 return load_idx;
3472}
3473
3474
3475#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3476/**
3477 * init_sd_power_savings_stats - Initialize power savings statistics for
3478 * the given sched_domain, during load balancing.
3479 *
3480 * @sd: Sched domain whose power-savings statistics are to be initialized.
3481 * @sds: Variable containing the statistics for sd.
3482 * @idle: Idle status of the CPU at which we're performing load-balancing.
3483 */
3484static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3485 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3486{
3487 /*
3488 * Busy processors will not participate in power savings
3489 * balance.
3490 */
3491 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3492 sds->power_savings_balance = 0;
3493 else {
3494 sds->power_savings_balance = 1;
3495 sds->min_nr_running = ULONG_MAX;
3496 sds->leader_nr_running = 0;
3497 }
3498}
3499
3500/**
3501 * update_sd_power_savings_stats - Update the power saving stats for a
3502 * sched_domain while performing load balancing.
3503 *
3504 * @group: sched_group belonging to the sched_domain under consideration.
3505 * @sds: Variable containing the statistics of the sched_domain
3506 * @local_group: Does group contain the CPU for which we're performing
3507 * load balancing ?
3508 * @sgs: Variable containing the statistics of the group.
3509 */
3510static inline void update_sd_power_savings_stats(struct sched_group *group,
3511 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3512{
3513
3514 if (!sds->power_savings_balance)
3515 return;
3516
3517 /*
3518 * If the local group is idle or completely loaded
3519 * no need to do power savings balance at this domain
3520 */
3521 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3522 !sds->this_nr_running))
3523 sds->power_savings_balance = 0;
3524
3525 /*
3526 * If a group is already running at full capacity or idle,
3527 * don't include that group in power savings calculations
3528 */
3529 if (!sds->power_savings_balance ||
3530 sgs->sum_nr_running >= sgs->group_capacity ||
3531 !sgs->sum_nr_running)
3532 return;
3533
3534 /*
3535 * Calculate the group which has the least non-idle load.
3536 * This is the group from where we need to pick up the load
3537 * for saving power
3538 */
3539 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3540 (sgs->sum_nr_running == sds->min_nr_running &&
3541 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3542 sds->group_min = group;
3543 sds->min_nr_running = sgs->sum_nr_running;
3544 sds->min_load_per_task = sgs->sum_weighted_load /
3545 sgs->sum_nr_running;
3546 }
3547
3548 /*
3549 * Calculate the group which is nearly at its
3550 * capacity but still has some room to pick up load
3551 * from another group and save more power
3552 */
3553 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3554 return;
3555
3556 if (sgs->sum_nr_running > sds->leader_nr_running ||
3557 (sgs->sum_nr_running == sds->leader_nr_running &&
3558 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3559 sds->group_leader = group;
3560 sds->leader_nr_running = sgs->sum_nr_running;
3561 }
3562}
3563
3564/**
3565 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3566 * @sds: Variable containing the statistics of the sched_domain
3567 * under consideration.
3568 * @this_cpu: Cpu at which we're currently performing load-balancing.
3569 * @imbalance: Variable to store the imbalance.
3570 *
3571 * Description:
3572 * Check if we have potential to perform some power-savings balance.
3573 * If yes, set the busiest group to be the least loaded group in the
3574 * sched_domain, so that its CPUs can be put to idle.
3575 *
3576 * Returns 1 if there is potential to perform power-savings balance.
3577 * Else returns 0.
3578 */
3579static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3580 int this_cpu, unsigned long *imbalance)
3581{
3582 if (!sds->power_savings_balance)
3583 return 0;
3584
3585 if (sds->this != sds->group_leader ||
3586 sds->group_leader == sds->group_min)
3587 return 0;
3588
3589 *imbalance = sds->min_load_per_task;
3590 sds->busiest = sds->group_min;
3591
3592 return 1;
3593
3594}
3595#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3596static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3597 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3598{
3599 return;
3600}
3601
3602static inline void update_sd_power_savings_stats(struct sched_group *group,
3603 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3604{
3605 return;
3606}
3607
3608static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3609 int this_cpu, unsigned long *imbalance)
3610{
3611 return 0;
3612}
3613#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3614
3615
3616unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3617{
3618 return SCHED_LOAD_SCALE;
3619}
3620
3621unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3622{
3623 return default_scale_freq_power(sd, cpu);
3624}
3625
3626unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3627{
3628 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3629 unsigned long smt_gain = sd->smt_gain;
3630
3631 smt_gain /= weight;
3632
3633 return smt_gain;
3634}
3635
3636unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3637{
3638 return default_scale_smt_power(sd, cpu);
3639}
3640
3641unsigned long scale_rt_power(int cpu)
3642{
3643 struct rq *rq = cpu_rq(cpu);
3644 u64 total, available;
3645
3646 sched_avg_update(rq);
3647
3648 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3649 available = total - rq->rt_avg;
3650
3651 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3652 total = SCHED_LOAD_SCALE;
3653
3654 total >>= SCHED_LOAD_SHIFT;
3655
3656 return div_u64(available, total);
3657}
3658
3659static void update_cpu_power(struct sched_domain *sd, int cpu)
3660{
3661 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3662 unsigned long power = SCHED_LOAD_SCALE;
3663 struct sched_group *sdg = sd->groups;
3664
3665 if (sched_feat(ARCH_POWER))
3666 power *= arch_scale_freq_power(sd, cpu);
3667 else
3668 power *= default_scale_freq_power(sd, cpu);
3669
3670 power >>= SCHED_LOAD_SHIFT;
3671
3672 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3673 if (sched_feat(ARCH_POWER))
3674 power *= arch_scale_smt_power(sd, cpu);
3675 else
3676 power *= default_scale_smt_power(sd, cpu);
3677
3678 power >>= SCHED_LOAD_SHIFT;
3679 }
3680
3681 power *= scale_rt_power(cpu);
3682 power >>= SCHED_LOAD_SHIFT;
3683
3684 if (!power)
3685 power = 1;
3686
3687 sdg->cpu_power = power;
3688}
3689
3690static void update_group_power(struct sched_domain *sd, int cpu)
3691{
3692 struct sched_domain *child = sd->child;
3693 struct sched_group *group, *sdg = sd->groups;
3694 unsigned long power;
3695
3696 if (!child) {
3697 update_cpu_power(sd, cpu);
3698 return;
3699 }
3700
3701 power = 0;
3702
3703 group = child->groups;
3704 do {
3705 power += group->cpu_power;
3706 group = group->next;
3707 } while (group != child->groups);
3708
3709 sdg->cpu_power = power;
3710}
3711
3712/**
3713 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3714 * @sd: The sched_domain whose statistics are to be updated.
3715 * @group: sched_group whose statistics are to be updated.
3716 * @this_cpu: Cpu for which load balance is currently performed.
3717 * @idle: Idle status of this_cpu
3718 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3719 * @sd_idle: Idle status of the sched_domain containing group.
3720 * @local_group: Does group contain this_cpu.
3721 * @cpus: Set of cpus considered for load balancing.
3722 * @balance: Should we balance.
3723 * @sgs: variable to hold the statistics for this group.
3724 */
3725static inline void update_sg_lb_stats(struct sched_domain *sd,
3726 struct sched_group *group, int this_cpu,
3727 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3728 int local_group, const struct cpumask *cpus,
3729 int *balance, struct sg_lb_stats *sgs)
3730{
3731 unsigned long load, max_cpu_load, min_cpu_load;
3732 int i;
3733 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3734 unsigned long sum_avg_load_per_task;
3735 unsigned long avg_load_per_task;
3736
3737 if (local_group) {
3738 balance_cpu = group_first_cpu(group);
3739 if (balance_cpu == this_cpu)
3740 update_group_power(sd, this_cpu);
3741 }
3742
3743 /* Tally up the load of all CPUs in the group */
3744 sum_avg_load_per_task = avg_load_per_task = 0;
3745 max_cpu_load = 0;
3746 min_cpu_load = ~0UL;
3747
3748 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3749 struct rq *rq = cpu_rq(i);
3750
3751 if (*sd_idle && rq->nr_running)
3752 *sd_idle = 0;
3753
3754 /* Bias balancing toward cpus of our domain */
3755 if (local_group) {
3756 if (idle_cpu(i) && !first_idle_cpu) {
3757 first_idle_cpu = 1;
3758 balance_cpu = i;
3759 }
3760
3761 load = target_load(i, load_idx);
3762 } else {
3763 load = source_load(i, load_idx);
3764 if (load > max_cpu_load)
3765 max_cpu_load = load;
3766 if (min_cpu_load > load)
3767 min_cpu_load = load;
3768 }
3769
3770 sgs->group_load += load;
3771 sgs->sum_nr_running += rq->nr_running;
3772 sgs->sum_weighted_load += weighted_cpuload(i);
3773
3774 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3775 }
3776
3777 /*
3778 * The first idle cpu or the first cpu (busiest) in this sched group
3779 * is eligible for doing load balancing at this level and above.
3780 * In the newly idle case, we allow all the cpus to do the
3781 * newly idle load balance.
3782 */
3783 if (idle != CPU_NEWLY_IDLE && local_group &&
3784 balance_cpu != this_cpu && balance) {
3785 *balance = 0;
3786 return;
3787 }
3788
3789 /* Adjust by relative CPU power of the group */
3790 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3791
3792
3793 /*
3794 * Consider the group unbalanced when the imbalance is larger
3795 * than the average weight of two tasks.
3796 *
3797 * APZ: with cgroup the avg task weight can vary wildly and
3798 * might not be a suitable number - should we keep a
3799 * normalized nr_running number somewhere that negates
3800 * the hierarchy?
3801 */
3802 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3803 group->cpu_power;
3804
3805 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3806 sgs->group_imb = 1;
3807
3808 sgs->group_capacity =
3809 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3810}
3811
3812/**
3813 * update_sd_lb_stats - Update the sched_domain's statistics for load balancing.
3814 * @sd: sched_domain whose statistics are to be updated.
3815 * @this_cpu: Cpu for which load balance is currently performed.
3816 * @idle: Idle status of this_cpu
3817 * @sd_idle: Idle status of the sched_domain containing group.
3818 * @cpus: Set of cpus considered for load balancing.
3819 * @balance: Should we balance.
3820 * @sds: variable to hold the statistics for this sched_domain.
3821 */
3822static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3823 enum cpu_idle_type idle, int *sd_idle,
3824 const struct cpumask *cpus, int *balance,
3825 struct sd_lb_stats *sds)
3826{
3827 struct sched_domain *child = sd->child;
3828 struct sched_group *group = sd->groups;
3829 struct sg_lb_stats sgs;
3830 int load_idx, prefer_sibling = 0;
3831
3832 if (child && child->flags & SD_PREFER_SIBLING)
3833 prefer_sibling = 1;
3834
3835 init_sd_power_savings_stats(sd, sds, idle);
3836 load_idx = get_sd_load_idx(sd, idle);
3837
3838 do {
3839 int local_group;
3840
3841 local_group = cpumask_test_cpu(this_cpu,
3842 sched_group_cpus(group));
3843 memset(&sgs, 0, sizeof(sgs));
3844 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3845 local_group, cpus, balance, &sgs);
3846
3847 if (local_group && balance && !(*balance))
3848 return;
3849
3850 sds->total_load += sgs.group_load;
3851 sds->total_pwr += group->cpu_power;
3852
3853 /*
3854 * In case the child domain prefers tasks go to siblings
3855 * first, lower the group capacity to one so that we'll try
3856 * and move all the excess tasks away.
3857 */
3858 if (prefer_sibling)
3859 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3860
3861 if (local_group) {
3862 sds->this_load = sgs.avg_load;
3863 sds->this = group;
3864 sds->this_nr_running = sgs.sum_nr_running;
3865 sds->this_load_per_task = sgs.sum_weighted_load;
3866 } else if (sgs.avg_load > sds->max_load &&
3867 (sgs.sum_nr_running > sgs.group_capacity ||
3868 sgs.group_imb)) {
3869 sds->max_load = sgs.avg_load;
3870 sds->busiest = group;
3871 sds->busiest_nr_running = sgs.sum_nr_running;
3872 sds->busiest_load_per_task = sgs.sum_weighted_load;
3873 sds->group_imb = sgs.group_imb;
3874 }
3875
3876 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3877 group = group->next;
3878 } while (group != sd->groups);
3879}
3880
3881/**
3882 * fix_small_imbalance - Calculate the minor imbalance that exists
3883 * amongst the groups of a sched_domain, during
3884 * load balancing.
3885 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3886 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3887 * @imbalance: Variable to store the imbalance.
3888 */
3889static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3890 int this_cpu, unsigned long *imbalance)
3891{
3892 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3893 unsigned int imbn = 2;
3894
3895 if (sds->this_nr_running) {
3896 sds->this_load_per_task /= sds->this_nr_running;
3897 if (sds->busiest_load_per_task >
3898 sds->this_load_per_task)
3899 imbn = 1;
3900 } else
3901 sds->this_load_per_task =
3902 cpu_avg_load_per_task(this_cpu);
3903
3904 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3905 sds->busiest_load_per_task * imbn) {
3906 *imbalance = sds->busiest_load_per_task;
3907 return;
3908 }
3909
3910 /*
3911 * OK, we don't have enough imbalance to justify moving tasks,
3912 * however we may be able to increase total CPU power used by
3913 * moving them.
3914 */
3915
3916 pwr_now += sds->busiest->cpu_power *
3917 min(sds->busiest_load_per_task, sds->max_load);
3918 pwr_now += sds->this->cpu_power *
3919 min(sds->this_load_per_task, sds->this_load);
3920 pwr_now /= SCHED_LOAD_SCALE;
3921
3922 /* Amount of load we'd subtract */
3923 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3924 sds->busiest->cpu_power;
3925 if (sds->max_load > tmp)
3926 pwr_move += sds->busiest->cpu_power *
3927 min(sds->busiest_load_per_task, sds->max_load - tmp);
3928
3929 /* Amount of load we'd add */
3930 if (sds->max_load * sds->busiest->cpu_power <
3931 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3932 tmp = (sds->max_load * sds->busiest->cpu_power) /
3933 sds->this->cpu_power;
3934 else
3935 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3936 sds->this->cpu_power;
3937 pwr_move += sds->this->cpu_power *
3938 min(sds->this_load_per_task, sds->this_load + tmp);
3939 pwr_move /= SCHED_LOAD_SCALE;
3940
3941 /* Move if we gain throughput */
3942 if (pwr_move > pwr_now)
3943 *imbalance = sds->busiest_load_per_task;
3944}
3945
3946/**
3947 * calculate_imbalance - Calculate the amount of imbalance present within the
3948 * groups of a given sched_domain during load balance.
3949 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3950 * @this_cpu: Cpu for which currently load balance is being performed.
3951 * @imbalance: The variable to store the imbalance.
3952 */
3953static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3954 unsigned long *imbalance)
3955{
3956 unsigned long max_pull;
3957 /*
3958 * In the presence of smp nice balancing, certain scenarios can have
3959 * max load less than avg load (as we skip the groups at or below
3960 * their cpu_power while calculating max_load)
3961 */
3962 if (sds->max_load < sds->avg_load) {
3963 *imbalance = 0;
3964 return fix_small_imbalance(sds, this_cpu, imbalance);
3965 }
3966
3967 /* Don't want to pull so many tasks that a group would go idle */
3968 max_pull = min(sds->max_load - sds->avg_load,
3969 sds->max_load - sds->busiest_load_per_task);
3970
3971 /* How much load to actually move to equalise the imbalance */
3972 *imbalance = min(max_pull * sds->busiest->cpu_power,
3973 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3974 / SCHED_LOAD_SCALE;
3975
3976 /*
3977 * if *imbalance is less than the average load per runnable task
3978 * there is no guarantee that any tasks will be moved so we'll have
3979 * a think about bumping its value to force at least one task to be
3980 * moved
3981 */
3982 if (*imbalance < sds->busiest_load_per_task)
3983 return fix_small_imbalance(sds, this_cpu, imbalance);
3984
3985}
3986/******* find_busiest_group() helpers end here *********************/
3987
3988/**
3989 * find_busiest_group - Returns the busiest group within the sched_domain
3990 * if there is an imbalance. If there isn't an imbalance, and
3991 * the user has opted for power-savings, it returns a group whose
3992 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3993 * such a group exists.
3994 *
3995 * Also calculates the amount of weighted load which should be moved
3996 * to restore balance.
3997 *
3998 * @sd: The sched_domain whose busiest group is to be returned.
3999 * @this_cpu: The cpu for which load balancing is currently being performed.
4000 * @imbalance: Variable which stores amount of weighted load which should
4001 * be moved to restore balance/put a group to idle.
4002 * @idle: The idle status of this_cpu.
4003 * @sd_idle: The idleness of sd
4004 * @cpus: The set of CPUs under consideration for load-balancing.
4005 * @balance: Pointer to a variable indicating if this_cpu
4006 * is the appropriate cpu to perform load balancing at this_level.
4007 *
4008 * Returns: - the busiest group if imbalance exists.
4009 * - If no imbalance and user has opted for power-savings balance,
4010 * return the least loaded group whose CPUs can be
4011 * put to idle by rebalancing its tasks onto our group.
4012 */
4013static struct sched_group *
4014find_busiest_group(struct sched_domain *sd, int this_cpu,
4015 unsigned long *imbalance, enum cpu_idle_type idle,
4016 int *sd_idle, const struct cpumask *cpus, int *balance)
4017{
4018 struct sd_lb_stats sds;
4019
4020 memset(&sds, 0, sizeof(sds));
4021
4022 /*
4023 * Compute the various statistics relevant for load balancing at
4024 * this level.
4025 */
4026 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4027 balance, &sds);
4028
4029 /* Cases where imbalance does not exist from POV of this_cpu */
4030 /* 1) this_cpu is not the appropriate cpu to perform load balancing
4031 * at this level.
4032 * 2) There is no busy sibling group to pull from.
4033 * 3) This group is the busiest group.
4034 * 4) This group is busier than the average busyness at this
4035 * sched_domain.
4036 * 5) The imbalance is within the specified limit.
4037 * 6) Any rebalance would lead to ping-pong
4038 */
4039 if (balance && !(*balance))
4040 goto ret;
4041
4042 if (!sds.busiest || sds.busiest_nr_running == 0)
4043 goto out_balanced;
4044
4045 if (sds.this_load >= sds.max_load)
4046 goto out_balanced;
4047
4048 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4049
4050 if (sds.this_load >= sds.avg_load)
4051 goto out_balanced;
4052
4053 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4054 goto out_balanced;
4055
4056 sds.busiest_load_per_task /= sds.busiest_nr_running;
4057 if (sds.group_imb)
4058 sds.busiest_load_per_task =
4059 min(sds.busiest_load_per_task, sds.avg_load);
4060
4061 /*
4062 * We're trying to get all the cpus to the average_load, so we don't
4063 * want to push ourselves above the average load, nor do we wish to
4064 * reduce the max loaded cpu below the average load, as either of these
4065 * actions would just result in more rebalancing later, and ping-pong
4066 * tasks around. Thus we look for the minimum possible imbalance.
4067 * Negative imbalances (*we* are more loaded than anyone else) will
4068 * be counted as no imbalance for these purposes -- we can't fix that
4069 * by pulling tasks to us. Be careful of negative numbers as they'll
4070 * appear as very large values with unsigned longs.
4071 */
4072 if (sds.max_load <= sds.busiest_load_per_task)
4073 goto out_balanced;
4074
4075 /* Looks like there is an imbalance. Compute it */
4076 calculate_imbalance(&sds, this_cpu, imbalance);
4077 return sds.busiest;
4078
4079out_balanced:
4080 /*
4081 * There is no obvious imbalance. But check if we can do some balancing
4082 * to save power.
4083 */
4084 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4085 return sds.busiest;
4086ret:
4087 *imbalance = 0;
4088 return NULL;
4089}
4090
4091/*
4092 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4093 */
4094static struct rq *
4095find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4096 unsigned long imbalance, const struct cpumask *cpus)
4097{
4098 struct rq *busiest = NULL, *rq;
4099 unsigned long max_load = 0;
4100 int i;
4101
4102 for_each_cpu(i, sched_group_cpus(group)) {
4103 unsigned long power = power_of(i);
4104 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4105 unsigned long wl;
4106
4107 if (!cpumask_test_cpu(i, cpus))
4108 continue;
4109
4110 rq = cpu_rq(i);
4111 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4112 wl /= power;
4113
4114 if (capacity && rq->nr_running == 1 && wl > imbalance)
4115 continue;
4116
4117 if (wl > max_load) {
4118 max_load = wl;
4119 busiest = rq;
4120 }
4121 }
4122
4123 return busiest;
4124}
4125
4126/*
4127 * Max backoff if we encounter pinned tasks. Pretty arbitrary value; anything
4128 * works so long as it is large enough.
4129 */
4130#define MAX_PINNED_INTERVAL 512
4131
4132/* Working cpumask for load_balance and load_balance_newidle. */
4133static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4134
4135/*
4136 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4137 * tasks if there is an imbalance.
4138 */
4139static int load_balance(int this_cpu, struct rq *this_rq,
4140 struct sched_domain *sd, enum cpu_idle_type idle,
4141 int *balance)
4142{
4143 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4144 struct sched_group *group;
4145 unsigned long imbalance;
4146 struct rq *busiest;
4147 unsigned long flags;
4148 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4149
4150 cpumask_copy(cpus, cpu_active_mask);
4151
4152 /*
4153 * When power savings policy is enabled for the parent domain, idle
4154 * sibling can pick up load irrespective of busy siblings. In this case,
4155 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4156 * portraying it as CPU_NOT_IDLE.
4157 */
4158 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4159 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4160 sd_idle = 1;
4161
4162 schedstat_inc(sd, lb_count[idle]);
4163
4164redo:
4165 update_shares(sd);
4166 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4167 cpus, balance);
4168
4169 if (*balance == 0)
4170 goto out_balanced;
4171
4172 if (!group) {
4173 schedstat_inc(sd, lb_nobusyg[idle]);
4174 goto out_balanced;
4175 }
4176
4177 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4178 if (!busiest) {
4179 schedstat_inc(sd, lb_nobusyq[idle]);
4180 goto out_balanced;
4181 }
4182
4183 BUG_ON(busiest == this_rq);
4184
4185 schedstat_add(sd, lb_imbalance[idle], imbalance);
4186
4187 ld_moved = 0;
4188 if (busiest->nr_running > 1) {
4189 /*
4190 * Attempt to move tasks. If find_busiest_group has found
4191 * an imbalance but busiest->nr_running <= 1, the group is
4192 * still unbalanced. ld_moved simply stays zero, so it is
4193 * correctly treated as an imbalance.
4194 */
4195 local_irq_save(flags);
4196 double_rq_lock(this_rq, busiest);
4197 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4198 imbalance, sd, idle, &all_pinned);
4199 double_rq_unlock(this_rq, busiest);
4200 local_irq_restore(flags);
4201
4202 /*
4203 * some other cpu did the load balance for us.
4204 */
4205 if (ld_moved && this_cpu != smp_processor_id())
4206 resched_cpu(this_cpu);
4207
4208 /* All tasks on this runqueue were pinned by CPU affinity */
4209 if (unlikely(all_pinned)) {
4210 cpumask_clear_cpu(cpu_of(busiest), cpus);
4211 if (!cpumask_empty(cpus))
4212 goto redo;
4213 goto out_balanced;
4214 }
4215 }
4216
4217 if (!ld_moved) {
4218 schedstat_inc(sd, lb_failed[idle]);
4219 sd->nr_balance_failed++;
4220
4221 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4222
4223 raw_spin_lock_irqsave(&busiest->lock, flags);
4224
4225 /* don't kick the migration_thread, if the curr
4226 * task on busiest cpu can't be moved to this_cpu
4227 */
4228 if (!cpumask_test_cpu(this_cpu,
4229 &busiest->curr->cpus_allowed)) {
4230 raw_spin_unlock_irqrestore(&busiest->lock,
4231 flags);
4232 all_pinned = 1;
4233 goto out_one_pinned;
4234 }
4235
4236 if (!busiest->active_balance) {
4237 busiest->active_balance = 1;
4238 busiest->push_cpu = this_cpu;
4239 active_balance = 1;
4240 }
4241 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4242 if (active_balance)
4243 wake_up_process(busiest->migration_thread);
4244
4245 /*
4246 * We've kicked active balancing, reset the failure
4247 * counter.
4248 */
4249 sd->nr_balance_failed = sd->cache_nice_tries+1;
4250 }
4251 } else
4252 sd->nr_balance_failed = 0;
4253
4254 if (likely(!active_balance)) {
4255 /* We were unbalanced, so reset the balancing interval */
4256 sd->balance_interval = sd->min_interval;
4257 } else {
4258 /*
4259 * If we've begun active balancing, start to back off. This
4260 * case may not be covered by the all_pinned logic if there
4261 * is only 1 task on the busy runqueue (because we don't call
4262 * move_tasks).
4263 */
4264 if (sd->balance_interval < sd->max_interval)
4265 sd->balance_interval *= 2;
4266 }
4267
4268 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4269 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4270 ld_moved = -1;
4271
4272 goto out;
4273
4274out_balanced:
4275 schedstat_inc(sd, lb_balanced[idle]);
4276
4277 sd->nr_balance_failed = 0;
4278
4279out_one_pinned:
4280 /* tune up the balancing interval */
4281 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4282 (sd->balance_interval < sd->max_interval))
4283 sd->balance_interval *= 2;
4284
4285 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4286 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4287 ld_moved = -1;
4288 else
4289 ld_moved = 0;
4290out:
4291 if (ld_moved)
4292 update_shares(sd);
4293 return ld_moved;
4294}
4295
4296/*
4297 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4298 * tasks if there is an imbalance.
4299 *
4300 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4301 * this_rq is locked.
4302 */
4303static int
4304load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4305{
4306 struct sched_group *group;
4307 struct rq *busiest = NULL;
4308 unsigned long imbalance;
4309 int ld_moved = 0;
4310 int sd_idle = 0;
4311 int all_pinned = 0;
4312 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4313
4314 cpumask_copy(cpus, cpu_active_mask);
4315
4316 /*
4317 * When power savings policy is enabled for the parent domain, idle
4318 * sibling can pick up load irrespective of busy siblings. In this case,
4319 * let the state of idle sibling percolate up as IDLE, instead of
4320 * portraying it as CPU_NOT_IDLE.
4321 */
4322 if (sd->flags & SD_SHARE_CPUPOWER &&
4323 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4324 sd_idle = 1;
4325
4326 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4327redo:
4328 update_shares_locked(this_rq, sd);
4329 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4330 &sd_idle, cpus, NULL);
4331 if (!group) {
4332 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4333 goto out_balanced;
4334 }
4335
4336 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4337 if (!busiest) {
4338 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4339 goto out_balanced;
4340 }
4341
4342 BUG_ON(busiest == this_rq);
4343
4344 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4345
4346 ld_moved = 0;
4347 if (busiest->nr_running > 1) {
4348 /* Attempt to move tasks */
4349 double_lock_balance(this_rq, busiest);
4350 /* this_rq->clock is already updated */
4351 update_rq_clock(busiest);
4352 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4353 imbalance, sd, CPU_NEWLY_IDLE,
4354 &all_pinned);
4355 double_unlock_balance(this_rq, busiest);
4356
4357 if (unlikely(all_pinned)) {
4358 cpumask_clear_cpu(cpu_of(busiest), cpus);
4359 if (!cpumask_empty(cpus))
4360 goto redo;
4361 }
4362 }
4363
4364 if (!ld_moved) {
4365 int active_balance = 0;
4366
4367 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4368 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4369 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4370 return -1;
4371
4372 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4373 return -1;
4374
4375 if (sd->nr_balance_failed++ < 2)
4376 return -1;
4377
4378 /*
4379 * The only task running in a non-idle cpu can be moved to this
4380 * cpu in an attempt to completely free up the other CPU
4381 * package. The same method used to move a task in load_balance()
4382 * has been extended for load_balance_newidle() to speed up
4383 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4384 *
4385 * The package power saving logic comes from
4386 * find_busiest_group(). If there is no imbalance, then
4387 * f_b_g() will return NULL. However when sched_mc={1,2} then
4388 * f_b_g() will select a group from which a running task may be
4389 * pulled to this cpu in order to make the other package idle.
4390 * If there is no opportunity to make a package idle and if
4391 * there is no imbalance, then f_b_g() will return NULL and no
4392 * action will be taken in load_balance_newidle().
4393 *
4394 * Under normal task pull operation due to imbalance, there
4395 * will be more than one task in the source run queue and
4396 * move_tasks() will succeed. ld_moved will be true and this
4397 * active balance code will not be triggered.
4398 */
4399
4400 /* Lock busiest in correct order while this_rq is held */
4401 double_lock_balance(this_rq, busiest);
4402
4403 /*
4404 * don't kick the migration_thread, if the curr
4405 * task on busiest cpu can't be moved to this_cpu
4406 */
4407 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4408 double_unlock_balance(this_rq, busiest);
4409 all_pinned = 1;
4410 return ld_moved;
4411 }
4412
4413 if (!busiest->active_balance) {
4414 busiest->active_balance = 1;
4415 busiest->push_cpu = this_cpu;
4416 active_balance = 1;
4417 }
4418
4419 double_unlock_balance(this_rq, busiest);
4420 /*
4421 * Should not call ttwu while holding a rq->lock
4422 */
4423 raw_spin_unlock(&this_rq->lock);
4424 if (active_balance)
4425 wake_up_process(busiest->migration_thread);
4426 raw_spin_lock(&this_rq->lock);
4427
4428 } else
4429 sd->nr_balance_failed = 0;
4430
4431 update_shares_locked(this_rq, sd);
4432 return ld_moved;
4433
4434out_balanced:
4435 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4436 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4437 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4438 return -1;
4439 sd->nr_balance_failed = 0;
4440
4441 return 0;
4442}
4443
4444/*
4445 * idle_balance is called by schedule() if this_cpu is about to become
4446 * idle. Attempts to pull tasks from other CPUs.
4447 */
4448static void idle_balance(int this_cpu, struct rq *this_rq)
4449{
4450 struct sched_domain *sd;
4451 int pulled_task = 0;
4452 unsigned long next_balance = jiffies + HZ;
4453
4454 this_rq->idle_stamp = this_rq->clock;
4455
4456 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4457 return;
4458
4459 for_each_domain(this_cpu, sd) {
4460 unsigned long interval;
4461
4462 if (!(sd->flags & SD_LOAD_BALANCE))
4463 continue;
4464
4465 if (sd->flags & SD_BALANCE_NEWIDLE)
4466 /* If we've pulled tasks over stop searching: */
4467 pulled_task = load_balance_newidle(this_cpu, this_rq,
4468 sd);
4469
4470 interval = msecs_to_jiffies(sd->balance_interval);
4471 if (time_after(next_balance, sd->last_balance + interval))
4472 next_balance = sd->last_balance + interval;
4473 if (pulled_task) {
4474 this_rq->idle_stamp = 0;
4475 break;
4476 }
4477 }
4478 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4479 /*
4480 * We are going idle. next_balance may be set based on
4481 * a busy processor. So reset next_balance.
4482 */
4483 this_rq->next_balance = next_balance;
4484 }
4485}
4486
4487/*
4488 * active_load_balance is run by migration threads. It pushes running tasks
4489 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4490 * running on each physical CPU where possible, and avoids physical /
4491 * logical imbalances.
4492 *
4493 * Called with busiest_rq locked.
4494 */
4495static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4496{
4497 int target_cpu = busiest_rq->push_cpu;
4498 struct sched_domain *sd;
4499 struct rq *target_rq;
4500
4501 /* Is there any task to move? */
4502 if (busiest_rq->nr_running <= 1)
4503 return;
4504
4505 target_rq = cpu_rq(target_cpu);
4506
4507 /*
4508 * This condition is "impossible", if it occurs
4509 * we need to fix it. Originally reported by
4510 * Bjorn Helgaas on a 128-cpu setup.
4511 */
4512 BUG_ON(busiest_rq == target_rq);
4513
4514 /* move a task from busiest_rq to target_rq */
4515 double_lock_balance(busiest_rq, target_rq);
4516 update_rq_clock(busiest_rq);
4517 update_rq_clock(target_rq);
4518
4519 /* Search for an sd spanning us and the target CPU. */
4520 for_each_domain(target_cpu, sd) {
4521 if ((sd->flags & SD_LOAD_BALANCE) &&
4522 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4523 break;
4524 }
4525
4526 if (likely(sd)) {
4527 schedstat_inc(sd, alb_count);
4528
4529 if (move_one_task(target_rq, target_cpu, busiest_rq,
4530 sd, CPU_IDLE))
4531 schedstat_inc(sd, alb_pushed);
4532 else
4533 schedstat_inc(sd, alb_failed);
4534 }
4535 double_unlock_balance(busiest_rq, target_rq);
4536}
4537
4538#ifdef CONFIG_NO_HZ
4539static struct {
4540 atomic_t load_balancer;
4541 cpumask_var_t cpu_mask;
4542 cpumask_var_t ilb_grp_nohz_mask;
4543} nohz ____cacheline_aligned = {
4544 .load_balancer = ATOMIC_INIT(-1),
4545};
4546
4547int get_nohz_load_balancer(void)
4548{
4549 return atomic_read(&nohz.load_balancer);
4550}
4551
4552#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4553/**
4554 * lowest_flag_domain - Return lowest sched_domain containing flag.
4555 * @cpu: The cpu whose lowest level of sched domain is to
4556 * be returned.
4557 * @flag: The flag to check for the lowest sched_domain
4558 * for the given cpu.
4559 *
4560 * Returns the lowest sched_domain of a cpu which contains the given flag.
4561 */
4562static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4563{
4564 struct sched_domain *sd;
4565
4566 for_each_domain(cpu, sd)
4567 if (sd && (sd->flags & flag))
4568 break;
4569
4570 return sd;
4571}
4572
4573/**
4574 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4575 * @cpu: The cpu whose domains we're iterating over.
4576 * @sd: variable holding the value of the power_savings_sd
4577 * for cpu.
4578 * @flag: The flag to filter the sched_domains to be iterated.
4579 *
4580 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4581 * set, starting from the lowest sched_domain to the highest.
4582 */
4583#define for_each_flag_domain(cpu, sd, flag) \
4584 for (sd = lowest_flag_domain(cpu, flag); \
4585 (sd && (sd->flags & flag)); sd = sd->parent)
4586
4587/**
4588 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4589 * @ilb_group: group to be checked for semi-idleness
4590 *
4591 * Returns: 1 if the group is semi-idle. 0 otherwise.
4592 *
4593 * We define a sched_group to be semi-idle if it has at least one idle CPU
4594 * and at least one non-idle CPU. This helper function checks if the given
4595 * sched_group is semi-idle or not.
4596 */
4597static inline int is_semi_idle_group(struct sched_group *ilb_group)
4598{
4599 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4600 sched_group_cpus(ilb_group));
4601
4602 /*
4603 * A sched_group is semi-idle when it has at least one busy cpu
4604 * and at least one idle cpu.
4605 */
4606 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4607 return 0;
4608
4609 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4610 return 0;
4611
4612 return 1;
4613}
4614/**
4615 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4616 * @cpu: The cpu which is nominating a new idle_load_balancer.
4617 *
4618 * Returns: The id of the idle load balancer if it exists;
4619 * else returns >= nr_cpu_ids.
4620 *
4621 * This algorithm picks the idle load balancer such that it belongs to a
4622 * semi-idle powersavings sched_domain. The idea is to try and avoid
4623 * completely idle packages/cores just for the purpose of idle load balancing
4624 * when there are other idle cpu's which are better suited for that job.
4625 */
4626static int find_new_ilb(int cpu)
4627{
4628 struct sched_domain *sd;
4629 struct sched_group *ilb_group;
4630
4631 /*
4632 * Have idle load balancer selection from semi-idle packages only
4633 * when power-aware load balancing is enabled
4634 */
4635 if (!(sched_smt_power_savings || sched_mc_power_savings))
4636 goto out_done;
4637
4638 /*
4639 * Optimize for the case when we have no idle CPUs or only one
4640 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4641 */
4642 if (cpumask_weight(nohz.cpu_mask) < 2)
4643 goto out_done;
4644
4645 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4646 ilb_group = sd->groups;
4647
4648 do {
4649 if (is_semi_idle_group(ilb_group))
4650 return cpumask_first(nohz.ilb_grp_nohz_mask);
4651
4652 ilb_group = ilb_group->next;
4653
4654 } while (ilb_group != sd->groups);
4655 }
4656
4657out_done:
4658 return cpumask_first(nohz.cpu_mask);
4659}
4660#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4661static inline int find_new_ilb(int call_cpu)
4662{
4663 return cpumask_first(nohz.cpu_mask);
4664}
4665#endif
4666
4667/*
4668 * This routine will try to nominate the ilb (idle load balancing)
4669 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4670 * load balancing on behalf of all those cpus. If all the cpus in the system
4671 * go into this tickless mode, then there will be no ilb owner (as there is
4672 * no need for one) and all the cpus will sleep till the next wakeup event
4673 * arrives...
4674 *
4675 * For the ilb owner, tick is not stopped. And this tick will be used
4676 * for idle load balancing. ilb owner will still be part of
4677 * nohz.cpu_mask..
4678 *
4679 * While stopping the tick, this cpu will become the ilb owner if there
4680 * is no other owner. It will remain the owner till that cpu becomes busy
4681 * or all cpus in the system stop their ticks, at which point
4682 * there is no need for an ilb owner.
4683 *
4684 * When the ilb owner becomes busy, it nominates another owner, during the
4685 * next busy scheduler_tick()
4686 */
4687int select_nohz_load_balancer(int stop_tick)
4688{
4689 int cpu = smp_processor_id();
4690
4691 if (stop_tick) {
4692 cpu_rq(cpu)->in_nohz_recently = 1;
4693
4694 if (!cpu_active(cpu)) {
4695 if (atomic_read(&nohz.load_balancer) != cpu)
4696 return 0;
4697
4698 /*
4699 * If we are going offline and still the leader,
4700 * give up!
4701 */
4702 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4703 BUG();
4704
4705 return 0;
4706 }
4707
4708 cpumask_set_cpu(cpu, nohz.cpu_mask);
4709
4710 /* time for ilb owner also to sleep */
4711 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4712 if (atomic_read(&nohz.load_balancer) == cpu)
4713 atomic_set(&nohz.load_balancer, -1);
4714 return 0;
4715 }
4716
4717 if (atomic_read(&nohz.load_balancer) == -1) {
4718 /* make me the ilb owner */
4719 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4720 return 1;
4721 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4722 int new_ilb;
4723
4724 if (!(sched_smt_power_savings ||
4725 sched_mc_power_savings))
4726 return 1;
4727 /*
4728 * Check to see if there is a more power-efficient
4729 * ilb.
4730 */
4731 new_ilb = find_new_ilb(cpu);
4732 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4733 atomic_set(&nohz.load_balancer, -1);
4734 resched_cpu(new_ilb);
4735 return 0;
4736 }
4737 return 1;
4738 }
4739 } else {
4740 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4741 return 0;
4742
4743 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4744
4745 if (atomic_read(&nohz.load_balancer) == cpu)
4746 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4747 BUG();
4748 }
4749 return 0;
4750}
4751#endif
4752
4753static DEFINE_SPINLOCK(balancing);
4754
4755/*
4756 * It checks each scheduling domain to see if it is due to be balanced,
4757 * and initiates a balancing operation if so.
4758 *
4759 * Balancing parameters are set up in arch_init_sched_domains.
4760 */
4761static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4762{
4763 int balance = 1;
4764 struct rq *rq = cpu_rq(cpu);
4765 unsigned long interval;
4766 struct sched_domain *sd;
4767 /* Earliest time when we have to do rebalance again */
4768 unsigned long next_balance = jiffies + 60*HZ;
4769 int update_next_balance = 0;
4770 int need_serialize;
4771
4772 for_each_domain(cpu, sd) {
4773 if (!(sd->flags & SD_LOAD_BALANCE))
4774 continue;
4775
4776 interval = sd->balance_interval;
4777 if (idle != CPU_IDLE)
4778 interval *= sd->busy_factor;
4779
4780 /* scale ms to jiffies */
4781 interval = msecs_to_jiffies(interval);
4782 if (unlikely(!interval))
4783 interval = 1;
4784 if (interval > HZ*NR_CPUS/10)
4785 interval = HZ*NR_CPUS/10;
4786
4787 need_serialize = sd->flags & SD_SERIALIZE;
4788
4789 if (need_serialize) {
4790 if (!spin_trylock(&balancing))
4791 goto out;
4792 }
4793
4794 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4795 if (load_balance(cpu, rq, sd, idle, &balance)) {
4796 /*
4797 * We've pulled tasks over so either we're no
4798 * longer idle, or one of our SMT siblings is
4799 * not idle.
4800 */
4801 idle = CPU_NOT_IDLE;
4802 }
4803 sd->last_balance = jiffies;
4804 }
4805 if (need_serialize)
4806 spin_unlock(&balancing);
4807out:
4808 if (time_after(next_balance, sd->last_balance + interval)) {
4809 next_balance = sd->last_balance + interval;
4810 update_next_balance = 1;
4811 }
4812
4813 /*
4814 * Stop the load balance at this level. There is another
4815 * CPU in our sched group which is doing load balancing more
4816 * actively.
4817 */
4818 if (!balance)
4819 break;
4820 }
4821
4822 /*
4823 * next_balance will be updated only when there is a need.
4824 * When the cpu is attached to null domain for ex, it will not be
4825 * updated.
4826 */
4827 if (likely(update_next_balance))
4828 rq->next_balance = next_balance;
4829}
4830
4831/*
4832 * run_rebalance_domains is triggered when needed from the scheduler tick.
4833 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4834 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4835 */
4836static void run_rebalance_domains(struct softirq_action *h)
4837{
4838 int this_cpu = smp_processor_id();
4839 struct rq *this_rq = cpu_rq(this_cpu);
4840 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4841 CPU_IDLE : CPU_NOT_IDLE;
4842
4843 rebalance_domains(this_cpu, idle);
4844
4845#ifdef CONFIG_NO_HZ
4846 /*
4847 * If this cpu is the owner for idle load balancing, then do the
4848 * balancing on behalf of the other idle cpus whose ticks are
4849 * stopped.
4850 */
4851 if (this_rq->idle_at_tick &&
4852 atomic_read(&nohz.load_balancer) == this_cpu) {
4853 struct rq *rq;
4854 int balance_cpu;
4855
4856 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4857 if (balance_cpu == this_cpu)
4858 continue;
4859
4860 /*
4861 * If this cpu gets work to do, stop the load balancing
4862 * work being done for other cpus. Next load
4863 * balancing owner will pick it up.
4864 */
4865 if (need_resched())
4866 break;
4867
4868 rebalance_domains(balance_cpu, CPU_IDLE);
4869
4870 rq = cpu_rq(balance_cpu);
4871 if (time_after(this_rq->next_balance, rq->next_balance))
4872 this_rq->next_balance = rq->next_balance;
4873 }
4874 }
4875#endif
4876}
4877
4878static inline int on_null_domain(int cpu)
4879{
4880 return !rcu_dereference(cpu_rq(cpu)->sd);
4881}
4882
4883/*
4884 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4885 *
4886 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4887 * idle load balancing owner or decide to stop the periodic load balancing,
4888 * if the whole system is idle.
4889 */
4890static inline void trigger_load_balance(struct rq *rq, int cpu)
4891{
4892#ifdef CONFIG_NO_HZ
4893 /*
4894 * If we were in the nohz mode recently and busy at the current
4895 * scheduler tick, then check if we need to nominate new idle
4896 * load balancer.
4897 */
4898 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4899 rq->in_nohz_recently = 0;
4900
4901 if (atomic_read(&nohz.load_balancer) == cpu) {
4902 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4903 atomic_set(&nohz.load_balancer, -1);
4904 }
4905
4906 if (atomic_read(&nohz.load_balancer) == -1) {
4907 int ilb = find_new_ilb(cpu);
4908
4909 if (ilb < nr_cpu_ids)
4910 resched_cpu(ilb);
4911 }
4912 }
4913
4914 /*
4915 * If this cpu is idle and doing idle load balancing for all the
4916 * cpus with ticks stopped, is it time for that to stop?
4917 */
4918 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4919 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4920 resched_cpu(cpu);
4921 return;
4922 }
4923
4924 /*
4925 * If this cpu is idle and the idle load balancing is done by
4926 * someone else, then there is no need to raise the SCHED_SOFTIRQ
4927 */
4928 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4929 cpumask_test_cpu(cpu, nohz.cpu_mask))
4930 return;
4931#endif
4932 /* Don't need to rebalance while attached to NULL domain */
4933 if (time_after_eq(jiffies, rq->next_balance) &&
4934 likely(!on_null_domain(cpu)))
4935 raise_softirq(SCHED_SOFTIRQ);
4936}
4937
4938#else /* CONFIG_SMP */
4939
4940/*
4941 * on UP we do not need to balance between CPUs:
4942 */
4943static inline void idle_balance(int cpu, struct rq *rq)
4944{
4945}
4946
 #endif
 
 DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -5298,7 +3430,7 @@ void scheduler_tick(void)
5298 curr->sched_class->task_tick(rq, curr, 0); 3430 curr->sched_class->task_tick(rq, curr, 0);
5299 raw_spin_unlock(&rq->lock); 3431 raw_spin_unlock(&rq->lock);
5300 3432
5301 perf_event_task_tick(curr, cpu); 3433 perf_event_task_tick(curr);
5302 3434
5303#ifdef CONFIG_SMP 3435#ifdef CONFIG_SMP
5304 rq->idle_at_tick = idle_cpu(cpu); 3436 rq->idle_at_tick = idle_cpu(cpu);
@@ -5412,23 +3544,9 @@ static inline void schedule_debug(struct task_struct *prev)
5412 3544
5413static void put_prev_task(struct rq *rq, struct task_struct *prev) 3545static void put_prev_task(struct rq *rq, struct task_struct *prev)
5414{ 3546{
5415 if (prev->state == TASK_RUNNING) { 3547 if (prev->se.on_rq)
5416 u64 runtime = prev->se.sum_exec_runtime; 3548 update_rq_clock(rq);
5417 3549 rq->skip_clock_update = 0;
5418 runtime -= prev->se.prev_sum_exec_runtime;
5419 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5420
5421 /*
5422 * In order to avoid avg_overlap growing stale when we are
5423 * indeed overlapping and hence not getting put to sleep, grow
5424 * the avg_overlap on preemption.
5425 *
5426 * We use the average preemption runtime because that
5427 * correlates to the amount of cache footprint a task can
5428 * build up.
5429 */
5430 update_avg(&prev->se.avg_overlap, runtime);
5431 }
5432 prev->sched_class->put_prev_task(rq, prev); 3550 prev->sched_class->put_prev_task(rq, prev);
5433} 3551}
5434 3552
@@ -5478,7 +3596,7 @@ need_resched:
5478 preempt_disable(); 3596 preempt_disable();
5479 cpu = smp_processor_id(); 3597 cpu = smp_processor_id();
5480 rq = cpu_rq(cpu); 3598 rq = cpu_rq(cpu);
5481 rcu_sched_qs(cpu); 3599 rcu_note_context_switch(cpu);
5482 prev = rq->curr; 3600 prev = rq->curr;
5483 switch_count = &prev->nivcsw; 3601 switch_count = &prev->nivcsw;
5484 3602
@@ -5491,14 +3609,13 @@ need_resched_nonpreemptible:
5491 hrtick_clear(rq); 3609 hrtick_clear(rq);
5492 3610
5493 raw_spin_lock_irq(&rq->lock); 3611 raw_spin_lock_irq(&rq->lock);
5494 update_rq_clock(rq);
5495 clear_tsk_need_resched(prev); 3612 clear_tsk_need_resched(prev);
5496 3613
5497 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3614 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
5498 if (unlikely(signal_pending_state(prev->state, prev))) 3615 if (unlikely(signal_pending_state(prev->state, prev)))
5499 prev->state = TASK_RUNNING; 3616 prev->state = TASK_RUNNING;
5500 else 3617 else
5501 deactivate_task(rq, prev, 1); 3618 deactivate_task(rq, prev, DEQUEUE_SLEEP);
5502 switch_count = &prev->nvcsw; 3619 switch_count = &prev->nvcsw;
5503 } 3620 }
5504 3621
@@ -5512,7 +3629,7 @@ need_resched_nonpreemptible:
5512 3629
5513 if (likely(prev != next)) { 3630 if (likely(prev != next)) {
5514 sched_info_switch(prev, next); 3631 sched_info_switch(prev, next);
5515 perf_event_task_sched_out(prev, next, cpu); 3632 perf_event_task_sched_out(prev, next);
5516 3633
5517 rq->nr_switches++; 3634 rq->nr_switches++;
5518 rq->curr = next; 3635 rq->curr = next;
@@ -5562,7 +3679,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5562 * the mutex owner just released it and exited. 3679 * the mutex owner just released it and exited.
5563 */ 3680 */
5564 if (probe_kernel_address(&owner->cpu, cpu)) 3681 if (probe_kernel_address(&owner->cpu, cpu))
5565 goto out; 3682 return 0;
5566#else 3683#else
5567 cpu = owner->cpu; 3684 cpu = owner->cpu;
5568#endif 3685#endif
@@ -5572,14 +3689,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5572 * the cpu field may no longer be valid. 3689 * the cpu field may no longer be valid.
5573 */ 3690 */
5574 if (cpu >= nr_cpumask_bits) 3691 if (cpu >= nr_cpumask_bits)
5575 goto out; 3692 return 0;
5576 3693
5577 /* 3694 /*
5578 * We need to validate that we can do a 3695 * We need to validate that we can do a
5579 * get_cpu() and that we have the percpu area. 3696 * get_cpu() and that we have the percpu area.
5580 */ 3697 */
5581 if (!cpu_online(cpu)) 3698 if (!cpu_online(cpu))
5582 goto out; 3699 return 0;
5583 3700
5584 rq = cpu_rq(cpu); 3701 rq = cpu_rq(cpu);
5585 3702
@@ -5598,7 +3715,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5598 3715
5599 cpu_relax(); 3716 cpu_relax();
5600 } 3717 }
5601out: 3718
5602 return 1; 3719 return 1;
5603} 3720}
5604#endif 3721#endif
@@ -5722,6 +3839,7 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
5722{ 3839{
5723 __wake_up_common(q, mode, 1, 0, NULL); 3840 __wake_up_common(q, mode, 1, 0, NULL);
5724} 3841}
3842EXPORT_SYMBOL_GPL(__wake_up_locked);
5725 3843
5726void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 3844void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5727{ 3845{
@@ -5821,8 +3939,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
5821 if (!x->done) { 3939 if (!x->done) {
5822 DECLARE_WAITQUEUE(wait, current); 3940 DECLARE_WAITQUEUE(wait, current);
5823 3941
5824 wait.flags |= WQ_FLAG_EXCLUSIVE; 3942 __add_wait_queue_tail_exclusive(&x->wait, &wait);
5825 __add_wait_queue_tail(&x->wait, &wait);
5826 do { 3943 do {
5827 if (signal_pending_state(state, current)) { 3944 if (signal_pending_state(state, current)) {
5828 timeout = -ERESTARTSYS; 3945 timeout = -ERESTARTSYS;
@@ -5933,6 +4050,23 @@ int __sched wait_for_completion_killable(struct completion *x)
5933EXPORT_SYMBOL(wait_for_completion_killable); 4050EXPORT_SYMBOL(wait_for_completion_killable);
5934 4051
5935/** 4052/**
4053 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
4054 * @x: holds the state of this particular completion
4055 * @timeout: timeout value in jiffies
4056 *
4057 * This waits for either a completion of a specific task to be
4058 * signaled or for a specified timeout to expire. It can be
4059 * interrupted by a kill signal. The timeout is in jiffies.
4060 */
4061unsigned long __sched
4062wait_for_completion_killable_timeout(struct completion *x,
4063 unsigned long timeout)
4064{
4065 return wait_for_common(x, timeout, TASK_KILLABLE);
4066}
4067EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4068
4069/**
5936 * try_wait_for_completion - try to decrement a completion without blocking 4070 * try_wait_for_completion - try to decrement a completion without blocking
5937 * @x: completion structure 4071 * @x: completion structure
5938 * 4072 *
@@ -6043,14 +4177,14 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6043 unsigned long flags; 4177 unsigned long flags;
6044 int oldprio, on_rq, running; 4178 int oldprio, on_rq, running;
6045 struct rq *rq; 4179 struct rq *rq;
6046 const struct sched_class *prev_class = p->sched_class; 4180 const struct sched_class *prev_class;
6047 4181
6048 BUG_ON(prio < 0 || prio > MAX_PRIO); 4182 BUG_ON(prio < 0 || prio > MAX_PRIO);
6049 4183
6050 rq = task_rq_lock(p, &flags); 4184 rq = task_rq_lock(p, &flags);
6051 update_rq_clock(rq);
6052 4185
6053 oldprio = p->prio; 4186 oldprio = p->prio;
4187 prev_class = p->sched_class;
6054 on_rq = p->se.on_rq; 4188 on_rq = p->se.on_rq;
6055 running = task_current(rq, p); 4189 running = task_current(rq, p);
6056 if (on_rq) 4190 if (on_rq)
@@ -6068,7 +4202,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6068 if (running) 4202 if (running)
6069 p->sched_class->set_curr_task(rq); 4203 p->sched_class->set_curr_task(rq);
6070 if (on_rq) { 4204 if (on_rq) {
6071 enqueue_task(rq, p, 0); 4205 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
6072 4206
6073 check_class_changed(rq, p, prev_class, oldprio, running); 4207 check_class_changed(rq, p, prev_class, oldprio, running);
6074 } 4208 }
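For reference, rt_mutex_setprio() is what ultimately runs when a task is boosted or deboosted through a priority-inheritance futex; a minimal userspace sketch that creates such a mutex is shown below (build with -lpthread). Boosting only actually happens when a higher-priority task blocks on the lock; this sketch merely sets up the PI path.

    #include <pthread.h>
    #include <stdio.h>

    int main(void)
    {
            pthread_mutexattr_t attr;
            pthread_mutex_t m;

            pthread_mutexattr_init(&attr);
            /* PI futexes are the userspace path into rt_mutex_setprio() */
            pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
            pthread_mutex_init(&m, &attr);

            pthread_mutex_lock(&m);
            puts("holding a priority-inheritance mutex");
            pthread_mutex_unlock(&m);

            pthread_mutex_destroy(&m);
            pthread_mutexattr_destroy(&attr);
            return 0;
    }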
@@ -6090,7 +4224,6 @@ void set_user_nice(struct task_struct *p, long nice)
6090 * the task might be in the middle of scheduling on another CPU. 4224 * the task might be in the middle of scheduling on another CPU.
6091 */ 4225 */
6092 rq = task_rq_lock(p, &flags); 4226 rq = task_rq_lock(p, &flags);
6093 update_rq_clock(rq);
6094 /* 4227 /*
6095 * The RT priorities are set via sched_setscheduler(), but we still 4228 * The RT priorities are set via sched_setscheduler(), but we still
6096 * allow the 'normal' nice value to be set - but as expected 4229 * allow the 'normal' nice value to be set - but as expected
@@ -6135,7 +4268,7 @@ int can_nice(const struct task_struct *p, const int nice)
6135 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4268 /* convert nice value [19,-20] to rlimit style value [1,40] */
6136 int nice_rlim = 20 - nice; 4269 int nice_rlim = 20 - nice;
6137 4270
6138 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4271 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
6139 capable(CAP_SYS_NICE)); 4272 capable(CAP_SYS_NICE));
6140} 4273}
6141 4274
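As a rough userspace illustration of the same check (ignoring CAP_SYS_NICE, which the kernel also accepts): nice values 19..-20 map onto RLIMIT_NICE values 1..40, and the request is allowed if it does not exceed the soft limit. The sample nice value of -5 is arbitrary.

    #define _GNU_SOURCE             /* RLIMIT_NICE on some older libcs */
    #include <stdio.h>
    #include <sys/resource.h>

    static int can_nice_demo(int nice)
    {
            struct rlimit rl;
            int nice_rlim = 20 - nice;      /* nice 19..-20 -> rlimit 1..40 */

            if (getrlimit(RLIMIT_NICE, &rl))
                    return 0;
            return (rlim_t)nice_rlim <= rl.rlim_cur;
    }

    int main(void)
    {
            printf("may request nice -5: %s\n", can_nice_demo(-5) ? "yes" : "no");
            return 0;
    }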
@@ -6270,7 +4403,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6270{ 4403{
6271 int retval, oldprio, oldpolicy = -1, on_rq, running; 4404 int retval, oldprio, oldpolicy = -1, on_rq, running;
6272 unsigned long flags; 4405 unsigned long flags;
6273 const struct sched_class *prev_class = p->sched_class; 4406 const struct sched_class *prev_class;
6274 struct rq *rq; 4407 struct rq *rq;
6275 int reset_on_fork; 4408 int reset_on_fork;
6276 4409
@@ -6312,7 +4445,7 @@ recheck:
6312 4445
6313 if (!lock_task_sighand(p, &flags)) 4446 if (!lock_task_sighand(p, &flags))
6314 return -ESRCH; 4447 return -ESRCH;
6315 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4448 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
6316 unlock_task_sighand(p, &flags); 4449 unlock_task_sighand(p, &flags);
6317 4450
6318 /* can't set/change the rt policy */ 4451 /* can't set/change the rt policy */
@@ -6341,16 +4474,6 @@ recheck:
6341 } 4474 }
6342 4475
6343 if (user) { 4476 if (user) {
6344#ifdef CONFIG_RT_GROUP_SCHED
6345 /*
6346 * Do not allow realtime tasks into groups that have no runtime
6347 * assigned.
6348 */
6349 if (rt_bandwidth_enabled() && rt_policy(policy) &&
6350 task_group(p)->rt_bandwidth.rt_runtime == 0)
6351 return -EPERM;
6352#endif
6353
6354 retval = security_task_setscheduler(p, policy, param); 4477 retval = security_task_setscheduler(p, policy, param);
6355 if (retval) 4478 if (retval)
6356 return retval; 4479 return retval;
@@ -6366,6 +4489,22 @@ recheck:
6366 * runqueue lock must be held. 4489 * runqueue lock must be held.
6367 */ 4490 */
6368 rq = __task_rq_lock(p); 4491 rq = __task_rq_lock(p);
4492
4493#ifdef CONFIG_RT_GROUP_SCHED
4494 if (user) {
4495 /*
4496 * Do not allow realtime tasks into groups that have no runtime
4497 * assigned.
4498 */
4499 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4500 task_group(p)->rt_bandwidth.rt_runtime == 0) {
4501 __task_rq_unlock(rq);
4502 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4503 return -EPERM;
4504 }
4505 }
4506#endif
4507
6369 /* recheck policy now with rq lock held */ 4508 /* recheck policy now with rq lock held */
6370 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4509 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6371 policy = oldpolicy = -1; 4510 policy = oldpolicy = -1;
@@ -6373,7 +4512,6 @@ recheck:
6373 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4512 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6374 goto recheck; 4513 goto recheck;
6375 } 4514 }
6376 update_rq_clock(rq);
6377 on_rq = p->se.on_rq; 4515 on_rq = p->se.on_rq;
6378 running = task_current(rq, p); 4516 running = task_current(rq, p);
6379 if (on_rq) 4517 if (on_rq)
@@ -6384,6 +4522,7 @@ recheck:
6384 p->sched_reset_on_fork = reset_on_fork; 4522 p->sched_reset_on_fork = reset_on_fork;
6385 4523
6386 oldprio = p->prio; 4524 oldprio = p->prio;
4525 prev_class = p->sched_class;
6387 __setscheduler(rq, p, policy, param->sched_priority); 4526 __setscheduler(rq, p, policy, param->sched_priority);
6388 4527
6389 if (running) 4528 if (running)
@@ -6683,7 +4822,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6683 int ret; 4822 int ret;
6684 cpumask_var_t mask; 4823 cpumask_var_t mask;
6685 4824
6686 if (len < cpumask_size()) 4825 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4826 return -EINVAL;
4827 if (len & (sizeof(unsigned long)-1))
6687 return -EINVAL; 4828 return -EINVAL;
6688 4829
6689 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4830 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
@@ -6691,10 +4832,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6691 4832
6692 ret = sched_getaffinity(pid, mask); 4833 ret = sched_getaffinity(pid, mask);
6693 if (ret == 0) { 4834 if (ret == 0) {
6694 if (copy_to_user(user_mask_ptr, mask, cpumask_size())) 4835 size_t retlen = min_t(size_t, len, cpumask_size());
4836
4837 if (copy_to_user(user_mask_ptr, mask, retlen))
6695 ret = -EFAULT; 4838 ret = -EFAULT;
6696 else 4839 else
6697 ret = cpumask_size(); 4840 ret = retlen;
6698 } 4841 }
6699 free_cpumask_var(mask); 4842 free_cpumask_var(mask);
6700 4843
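The user-visible effect of the stricter length checks can be seen by calling the raw syscall directly; the glibc sched_getaffinity() wrapper returns 0 on success and hides the byte count, so the sketch below uses syscall(). The 128-byte buffer is an arbitrary size that satisfies both new constraints on typical systems.

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            /* Must be a multiple of sizeof(unsigned long) and large enough to
             * hold nr_cpu_ids bits, or the kernel now returns -EINVAL. */
            unsigned long mask[16];         /* 128 bytes on 64-bit */
            long ret = syscall(SYS_sched_getaffinity, 0, sizeof(mask), mask);

            if (ret < 0) {
                    perror("sched_getaffinity");
                    return 1;
            }
            /* The return value is now min(len, cpumask_size()): the number of
             * bytes of affinity mask actually copied back. */
            printf("kernel copied %ld bytes of affinity mask\n", ret);
            return 0;
    }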
@@ -7105,17 +5248,15 @@ static inline void sched_init_granularity(void)
7105/* 5248/*
7106 * This is how migration works: 5249 * This is how migration works:
7107 * 5250 *
7108 * 1) we queue a struct migration_req structure in the source CPU's 5251 * 1) we invoke migration_cpu_stop() on the target CPU using
7109 * runqueue and wake up that CPU's migration thread. 5252 * stop_one_cpu().
7110 * 2) we down() the locked semaphore => thread blocks. 5253 * 2) stopper starts to run (implicitly forcing the migrated thread
7111 * 3) migration thread wakes up (implicitly it forces the migrated 5254 * off the CPU)
7112 * thread off the CPU) 5255 * 3) it checks whether the migrated task is still in the wrong runqueue.
7113 * 4) it gets the migration request and checks whether the migrated 5256 * 4) if it's in the wrong runqueue then the migration thread removes
7114 * task is still in the wrong runqueue.
7115 * 5) if it's in the wrong runqueue then the migration thread removes
7116 * it and puts it into the right queue. 5257 * it and puts it into the right queue.
7117 * 6) migration thread up()s the semaphore. 5258 * 5) stopper completes and stop_one_cpu() returns and the migration
7118 * 7) we wake up and the migration is done. 5259 * is done.
7119 */ 5260 */
7120 5261
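From userspace, the protocol described in the comment above is what runs behind an affinity change; a small sketch follows (pinning to CPU 0 is an arbitrary choice). If the caller is currently on another CPU, the stopper thread there runs migration_cpu_stop() via stop_one_cpu() and pushes it over before sched_setaffinity() returns.

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            cpu_set_t set;

            CPU_ZERO(&set);
            CPU_SET(0, &set);               /* ask to run only on CPU 0 */

            if (sched_setaffinity(0, sizeof(set), &set)) {
                    perror("sched_setaffinity");
                    return 1;
            }
            printf("now running on CPU %d\n", sched_getcpu());
            return 0;
    }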
7121/* 5262/*
@@ -7129,24 +5270,20 @@ static inline void sched_init_granularity(void)
7129 */ 5270 */
7130int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5271int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7131{ 5272{
7132 struct migration_req req;
7133 unsigned long flags; 5273 unsigned long flags;
7134 struct rq *rq; 5274 struct rq *rq;
5275 unsigned int dest_cpu;
7135 int ret = 0; 5276 int ret = 0;
7136 5277
7137 /* 5278 /*
7138 * Since we rely on wake-ups to migrate sleeping tasks, don't change 5279 * Serialize against TASK_WAKING so that ttwu() and wunt() can
7139 * the ->cpus_allowed mask from under waking tasks, which would be 5280 * drop the rq->lock and still rely on ->cpus_allowed.
7140 * possible when we change rq->lock in ttwu(), so synchronize against
7141 * TASK_WAKING to avoid that.
7142 */ 5281 */
7143again: 5282again:
7144 while (p->state == TASK_WAKING) 5283 while (task_is_waking(p))
7145 cpu_relax(); 5284 cpu_relax();
7146
7147 rq = task_rq_lock(p, &flags); 5285 rq = task_rq_lock(p, &flags);
7148 5286 if (task_is_waking(p)) {
7149 if (p->state == TASK_WAKING) {
7150 task_rq_unlock(rq, &flags); 5287 task_rq_unlock(rq, &flags);
7151 goto again; 5288 goto again;
7152 } 5289 }
@@ -7173,15 +5310,12 @@ again:
7173 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5310 if (cpumask_test_cpu(task_cpu(p), new_mask))
7174 goto out; 5311 goto out;
7175 5312
7176 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 5313 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5314 if (migrate_task(p, dest_cpu)) {
5315 struct migration_arg arg = { p, dest_cpu };
7177 /* Need help from migration thread: drop lock and wait. */ 5316 /* Need help from migration thread: drop lock and wait. */
7178 struct task_struct *mt = rq->migration_thread;
7179
7180 get_task_struct(mt);
7181 task_rq_unlock(rq, &flags); 5317 task_rq_unlock(rq, &flags);
7182 wake_up_process(rq->migration_thread); 5318 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
7183 put_task_struct(mt);
7184 wait_for_completion(&req.done);
7185 tlb_migrate_finish(p->mm); 5319 tlb_migrate_finish(p->mm);
7186 return 0; 5320 return 0;
7187 } 5321 }
@@ -7239,98 +5373,49 @@ fail:
7239 return ret; 5373 return ret;
7240} 5374}
7241 5375
7242#define RCU_MIGRATION_IDLE 0
7243#define RCU_MIGRATION_NEED_QS 1
7244#define RCU_MIGRATION_GOT_QS 2
7245#define RCU_MIGRATION_MUST_SYNC 3
7246
7247/* 5376/*
7248 * migration_thread - this is a highprio system thread that performs 5377 * migration_cpu_stop - this will be executed by a highprio stopper thread
7249 * thread migration by bumping thread off CPU then 'pushing' onto 5378 * and performs thread migration by bumping thread off CPU then
7250 * another runqueue. 5379 * 'pushing' onto another runqueue.
7251 */ 5380 */
7252static int migration_thread(void *data) 5381static int migration_cpu_stop(void *data)
7253{ 5382{
7254 int badcpu; 5383 struct migration_arg *arg = data;
7255 int cpu = (long)data;
7256 struct rq *rq;
7257
7258 rq = cpu_rq(cpu);
7259 BUG_ON(rq->migration_thread != current);
7260
7261 set_current_state(TASK_INTERRUPTIBLE);
7262 while (!kthread_should_stop()) {
7263 struct migration_req *req;
7264 struct list_head *head;
7265
7266 raw_spin_lock_irq(&rq->lock);
7267
7268 if (cpu_is_offline(cpu)) {
7269 raw_spin_unlock_irq(&rq->lock);
7270 break;
7271 }
7272
7273 if (rq->active_balance) {
7274 active_load_balance(rq, cpu);
7275 rq->active_balance = 0;
7276 }
7277
7278 head = &rq->migration_queue;
7279
7280 if (list_empty(head)) {
7281 raw_spin_unlock_irq(&rq->lock);
7282 schedule();
7283 set_current_state(TASK_INTERRUPTIBLE);
7284 continue;
7285 }
7286 req = list_entry(head->next, struct migration_req, list);
7287 list_del_init(head->next);
7288
7289 if (req->task != NULL) {
7290 raw_spin_unlock(&rq->lock);
7291 __migrate_task(req->task, cpu, req->dest_cpu);
7292 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7293 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7294 raw_spin_unlock(&rq->lock);
7295 } else {
7296 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7297 raw_spin_unlock(&rq->lock);
7298 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7299 }
7300 local_irq_enable();
7301
7302 complete(&req->done);
7303 }
7304 __set_current_state(TASK_RUNNING);
7305
7306 return 0;
7307}
7308
7309#ifdef CONFIG_HOTPLUG_CPU
7310
7311static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
7312{
7313 int ret;
7314 5384
5385 /*
5386 * The original target cpu might have gone down and we might
5387 * be on another cpu but it doesn't matter.
5388 */
7315 local_irq_disable(); 5389 local_irq_disable();
7316 ret = __migrate_task(p, src_cpu, dest_cpu); 5390 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
7317 local_irq_enable(); 5391 local_irq_enable();
7318 return ret; 5392 return 0;
7319} 5393}
7320 5394
5395#ifdef CONFIG_HOTPLUG_CPU
7321/* 5396/*
7322 * Figure out where task on dead CPU should go, use force if necessary. 5397 * Figure out where task on dead CPU should go, use force if necessary.
7323 */ 5398 */
7324static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5399void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7325{ 5400{
7326 int dest_cpu; 5401 struct rq *rq = cpu_rq(dead_cpu);
5402 int needs_cpu, uninitialized_var(dest_cpu);
5403 unsigned long flags;
7327 5404
7328again: 5405 local_irq_save(flags);
7329 dest_cpu = select_fallback_rq(dead_cpu, p);
7330 5406
7331 /* It can have affinity changed while we were choosing. */ 5407 raw_spin_lock(&rq->lock);
7332 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5408 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
7333 goto again; 5409 if (needs_cpu)
5410 dest_cpu = select_fallback_rq(dead_cpu, p);
5411 raw_spin_unlock(&rq->lock);
5412 /*
5413 * It can only fail if we race with set_cpus_allowed(),
 5414	 * in which case the racer should migrate the task anyway.
5415 */
5416 if (needs_cpu)
5417 __migrate_task(p, dead_cpu, dest_cpu);
5418 local_irq_restore(flags);
7334} 5419}
7335 5420
7336/* 5421/*
@@ -7394,7 +5479,6 @@ void sched_idle_next(void)
7394 5479
7395 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5480 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7396 5481
7397 update_rq_clock(rq);
7398 activate_task(rq, p, 0); 5482 activate_task(rq, p, 0);
7399 5483
7400 raw_spin_unlock_irqrestore(&rq->lock, flags); 5484 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -7449,7 +5533,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
7449 for ( ; ; ) { 5533 for ( ; ; ) {
7450 if (!rq->nr_running) 5534 if (!rq->nr_running)
7451 break; 5535 break;
7452 update_rq_clock(rq);
7453 next = pick_next_task(rq); 5536 next = pick_next_task(rq);
7454 if (!next) 5537 if (!next)
7455 break; 5538 break;
@@ -7672,35 +5755,20 @@ static void set_rq_offline(struct rq *rq)
7672static int __cpuinit 5755static int __cpuinit
7673migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5756migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7674{ 5757{
7675 struct task_struct *p;
7676 int cpu = (long)hcpu; 5758 int cpu = (long)hcpu;
7677 unsigned long flags; 5759 unsigned long flags;
7678 struct rq *rq; 5760 struct rq *rq = cpu_rq(cpu);
7679 5761
7680 switch (action) { 5762 switch (action) {
7681 5763
7682 case CPU_UP_PREPARE: 5764 case CPU_UP_PREPARE:
7683 case CPU_UP_PREPARE_FROZEN: 5765 case CPU_UP_PREPARE_FROZEN:
7684 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
7685 if (IS_ERR(p))
7686 return NOTIFY_BAD;
7687 kthread_bind(p, cpu);
7688 /* Must be high prio: stop_machine expects to yield to it. */
7689 rq = task_rq_lock(p, &flags);
7690 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7691 task_rq_unlock(rq, &flags);
7692 get_task_struct(p);
7693 cpu_rq(cpu)->migration_thread = p;
7694 rq->calc_load_update = calc_load_update; 5766 rq->calc_load_update = calc_load_update;
7695 break; 5767 break;
7696 5768
7697 case CPU_ONLINE: 5769 case CPU_ONLINE:
7698 case CPU_ONLINE_FROZEN: 5770 case CPU_ONLINE_FROZEN:
7699 /* Strictly unnecessary, as first user will wake it. */
7700 wake_up_process(cpu_rq(cpu)->migration_thread);
7701
7702 /* Update our root-domain */ 5771 /* Update our root-domain */
7703 rq = cpu_rq(cpu);
7704 raw_spin_lock_irqsave(&rq->lock, flags); 5772 raw_spin_lock_irqsave(&rq->lock, flags);
7705 if (rq->rd) { 5773 if (rq->rd) {
7706 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5774 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -7711,61 +5779,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7711 break; 5779 break;
7712 5780
7713#ifdef CONFIG_HOTPLUG_CPU 5781#ifdef CONFIG_HOTPLUG_CPU
7714 case CPU_UP_CANCELED:
7715 case CPU_UP_CANCELED_FROZEN:
7716 if (!cpu_rq(cpu)->migration_thread)
7717 break;
7718 /* Unbind it from offline cpu so it can run. Fall thru. */
7719 kthread_bind(cpu_rq(cpu)->migration_thread,
7720 cpumask_any(cpu_online_mask));
7721 kthread_stop(cpu_rq(cpu)->migration_thread);
7722 put_task_struct(cpu_rq(cpu)->migration_thread);
7723 cpu_rq(cpu)->migration_thread = NULL;
7724 break;
7725
7726 case CPU_DEAD: 5782 case CPU_DEAD:
7727 case CPU_DEAD_FROZEN: 5783 case CPU_DEAD_FROZEN:
7728 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
7729 migrate_live_tasks(cpu); 5784 migrate_live_tasks(cpu);
7730 rq = cpu_rq(cpu);
7731 kthread_stop(rq->migration_thread);
7732 put_task_struct(rq->migration_thread);
7733 rq->migration_thread = NULL;
7734 /* Idle task back to normal (off runqueue, low prio) */ 5785 /* Idle task back to normal (off runqueue, low prio) */
7735 raw_spin_lock_irq(&rq->lock); 5786 raw_spin_lock_irq(&rq->lock);
7736 update_rq_clock(rq);
7737 deactivate_task(rq, rq->idle, 0); 5787 deactivate_task(rq, rq->idle, 0);
7738 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5788 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7739 rq->idle->sched_class = &idle_sched_class; 5789 rq->idle->sched_class = &idle_sched_class;
7740 migrate_dead_tasks(cpu); 5790 migrate_dead_tasks(cpu);
7741 raw_spin_unlock_irq(&rq->lock); 5791 raw_spin_unlock_irq(&rq->lock);
7742 cpuset_unlock();
7743 migrate_nr_uninterruptible(rq); 5792 migrate_nr_uninterruptible(rq);
7744 BUG_ON(rq->nr_running != 0); 5793 BUG_ON(rq->nr_running != 0);
7745 calc_global_load_remove(rq); 5794 calc_global_load_remove(rq);
7746 /*
7747 * No need to migrate the tasks: it was best-effort if
7748 * they didn't take sched_hotcpu_mutex. Just wake up
7749 * the requestors.
7750 */
7751 raw_spin_lock_irq(&rq->lock);
7752 while (!list_empty(&rq->migration_queue)) {
7753 struct migration_req *req;
7754
7755 req = list_entry(rq->migration_queue.next,
7756 struct migration_req, list);
7757 list_del_init(&req->list);
7758 raw_spin_unlock_irq(&rq->lock);
7759 complete(&req->done);
7760 raw_spin_lock_irq(&rq->lock);
7761 }
7762 raw_spin_unlock_irq(&rq->lock);
7763 break; 5795 break;
7764 5796
7765 case CPU_DYING: 5797 case CPU_DYING:
7766 case CPU_DYING_FROZEN: 5798 case CPU_DYING_FROZEN:
7767 /* Update our root-domain */ 5799 /* Update our root-domain */
7768 rq = cpu_rq(cpu);
7769 raw_spin_lock_irqsave(&rq->lock, flags); 5800 raw_spin_lock_irqsave(&rq->lock, flags);
7770 if (rq->rd) { 5801 if (rq->rd) {
7771 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5802 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -8096,6 +6127,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
8096 struct rq *rq = cpu_rq(cpu); 6127 struct rq *rq = cpu_rq(cpu);
8097 struct sched_domain *tmp; 6128 struct sched_domain *tmp;
8098 6129
6130 for (tmp = sd; tmp; tmp = tmp->parent)
6131 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6132
8099 /* Remove the sched domains which do not contribute to scheduling. */ 6133 /* Remove the sched domains which do not contribute to scheduling. */
8100 for (tmp = sd; tmp; ) { 6134 for (tmp = sd; tmp; ) {
8101 struct sched_domain *parent = tmp->parent; 6135 struct sched_domain *parent = tmp->parent;
@@ -9202,11 +7236,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
9202 7236
9203#ifdef CONFIG_SCHED_MC 7237#ifdef CONFIG_SCHED_MC
9204static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7238static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7239 struct sysdev_class_attribute *attr,
9205 char *page) 7240 char *page)
9206{ 7241{
9207 return sprintf(page, "%u\n", sched_mc_power_savings); 7242 return sprintf(page, "%u\n", sched_mc_power_savings);
9208} 7243}
9209static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7244static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7245 struct sysdev_class_attribute *attr,
9210 const char *buf, size_t count) 7246 const char *buf, size_t count)
9211{ 7247{
9212 return sched_power_savings_store(buf, count, 0); 7248 return sched_power_savings_store(buf, count, 0);
@@ -9218,11 +7254,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
9218 7254
9219#ifdef CONFIG_SCHED_SMT 7255#ifdef CONFIG_SCHED_SMT
9220static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7256static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7257 struct sysdev_class_attribute *attr,
9221 char *page) 7258 char *page)
9222{ 7259{
9223 return sprintf(page, "%u\n", sched_smt_power_savings); 7260 return sprintf(page, "%u\n", sched_smt_power_savings);
9224} 7261}
9225static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7262static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7263 struct sysdev_class_attribute *attr,
9226 const char *buf, size_t count) 7264 const char *buf, size_t count)
9227{ 7265{
9228 return sched_power_savings_store(buf, count, 1); 7266 return sched_power_savings_store(buf, count, 1);
@@ -9437,7 +7475,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9437 tg->rt_rq[cpu] = rt_rq; 7475 tg->rt_rq[cpu] = rt_rq;
9438 init_rt_rq(rt_rq, rq); 7476 init_rt_rq(rt_rq, rq);
9439 rt_rq->tg = tg; 7477 rt_rq->tg = tg;
9440 rt_rq->rt_se = rt_se;
9441 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7478 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9442 if (add) 7479 if (add)
9443 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7480 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9468,9 +7505,6 @@ void __init sched_init(void)
9468#ifdef CONFIG_RT_GROUP_SCHED 7505#ifdef CONFIG_RT_GROUP_SCHED
9469 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7506 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9470#endif 7507#endif
9471#ifdef CONFIG_USER_SCHED
9472 alloc_size *= 2;
9473#endif
9474#ifdef CONFIG_CPUMASK_OFFSTACK 7508#ifdef CONFIG_CPUMASK_OFFSTACK
9475 alloc_size += num_possible_cpus() * cpumask_size(); 7509 alloc_size += num_possible_cpus() * cpumask_size();
9476#endif 7510#endif
@@ -9484,13 +7518,6 @@ void __init sched_init(void)
9484 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7518 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9485 ptr += nr_cpu_ids * sizeof(void **); 7519 ptr += nr_cpu_ids * sizeof(void **);
9486 7520
9487#ifdef CONFIG_USER_SCHED
9488 root_task_group.se = (struct sched_entity **)ptr;
9489 ptr += nr_cpu_ids * sizeof(void **);
9490
9491 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9492 ptr += nr_cpu_ids * sizeof(void **);
9493#endif /* CONFIG_USER_SCHED */
9494#endif /* CONFIG_FAIR_GROUP_SCHED */ 7521#endif /* CONFIG_FAIR_GROUP_SCHED */
9495#ifdef CONFIG_RT_GROUP_SCHED 7522#ifdef CONFIG_RT_GROUP_SCHED
9496 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7523 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9499,13 +7526,6 @@ void __init sched_init(void)
9499 init_task_group.rt_rq = (struct rt_rq **)ptr; 7526 init_task_group.rt_rq = (struct rt_rq **)ptr;
9500 ptr += nr_cpu_ids * sizeof(void **); 7527 ptr += nr_cpu_ids * sizeof(void **);
9501 7528
9502#ifdef CONFIG_USER_SCHED
9503 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9504 ptr += nr_cpu_ids * sizeof(void **);
9505
9506 root_task_group.rt_rq = (struct rt_rq **)ptr;
9507 ptr += nr_cpu_ids * sizeof(void **);
9508#endif /* CONFIG_USER_SCHED */
9509#endif /* CONFIG_RT_GROUP_SCHED */ 7529#endif /* CONFIG_RT_GROUP_SCHED */
9510#ifdef CONFIG_CPUMASK_OFFSTACK 7530#ifdef CONFIG_CPUMASK_OFFSTACK
9511 for_each_possible_cpu(i) { 7531 for_each_possible_cpu(i) {
@@ -9525,22 +7545,13 @@ void __init sched_init(void)
9525#ifdef CONFIG_RT_GROUP_SCHED 7545#ifdef CONFIG_RT_GROUP_SCHED
9526 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7546 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9527 global_rt_period(), global_rt_runtime()); 7547 global_rt_period(), global_rt_runtime());
9528#ifdef CONFIG_USER_SCHED
9529 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9530 global_rt_period(), RUNTIME_INF);
9531#endif /* CONFIG_USER_SCHED */
9532#endif /* CONFIG_RT_GROUP_SCHED */ 7548#endif /* CONFIG_RT_GROUP_SCHED */
9533 7549
9534#ifdef CONFIG_GROUP_SCHED 7550#ifdef CONFIG_CGROUP_SCHED
9535 list_add(&init_task_group.list, &task_groups); 7551 list_add(&init_task_group.list, &task_groups);
9536 INIT_LIST_HEAD(&init_task_group.children); 7552 INIT_LIST_HEAD(&init_task_group.children);
9537 7553
9538#ifdef CONFIG_USER_SCHED 7554#endif /* CONFIG_CGROUP_SCHED */
9539 INIT_LIST_HEAD(&root_task_group.children);
9540 init_task_group.parent = &root_task_group;
9541 list_add(&init_task_group.siblings, &root_task_group.children);
9542#endif /* CONFIG_USER_SCHED */
9543#endif /* CONFIG_GROUP_SCHED */
9544 7555
9545#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7556#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9546 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7557 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9580,25 +7591,6 @@ void __init sched_init(void)
9580 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7591 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9581 */ 7592 */
9582 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7593 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9583#elif defined CONFIG_USER_SCHED
9584 root_task_group.shares = NICE_0_LOAD;
9585 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9586 /*
9587 * In case of task-groups formed thr' the user id of tasks,
9588 * init_task_group represents tasks belonging to root user.
9589 * Hence it forms a sibling of all subsequent groups formed.
9590 * In this case, init_task_group gets only a fraction of overall
9591 * system cpu resource, based on the weight assigned to root
9592 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9593 * by letting tasks of init_task_group sit in a separate cfs_rq
9594 * (init_tg_cfs_rq) and having one entity represent this group of
9595 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9596 */
9597 init_tg_cfs_entry(&init_task_group,
9598 &per_cpu(init_tg_cfs_rq, i),
9599 &per_cpu(init_sched_entity, i), i, 1,
9600 root_task_group.se[i]);
9601
9602#endif 7594#endif
9603#endif /* CONFIG_FAIR_GROUP_SCHED */ 7595#endif /* CONFIG_FAIR_GROUP_SCHED */
9604 7596
@@ -9607,12 +7599,6 @@ void __init sched_init(void)
9607 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7599 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9608#ifdef CONFIG_CGROUP_SCHED 7600#ifdef CONFIG_CGROUP_SCHED
9609 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7601 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9610#elif defined CONFIG_USER_SCHED
9611 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9612 init_tg_rt_entry(&init_task_group,
9613 &per_cpu(init_rt_rq_var, i),
9614 &per_cpu(init_sched_rt_entity, i), i, 1,
9615 root_task_group.rt_se[i]);
9616#endif 7602#endif
9617#endif 7603#endif
9618 7604
@@ -9621,16 +7607,15 @@ void __init sched_init(void)
9621#ifdef CONFIG_SMP 7607#ifdef CONFIG_SMP
9622 rq->sd = NULL; 7608 rq->sd = NULL;
9623 rq->rd = NULL; 7609 rq->rd = NULL;
7610 rq->cpu_power = SCHED_LOAD_SCALE;
9624 rq->post_schedule = 0; 7611 rq->post_schedule = 0;
9625 rq->active_balance = 0; 7612 rq->active_balance = 0;
9626 rq->next_balance = jiffies; 7613 rq->next_balance = jiffies;
9627 rq->push_cpu = 0; 7614 rq->push_cpu = 0;
9628 rq->cpu = i; 7615 rq->cpu = i;
9629 rq->online = 0; 7616 rq->online = 0;
9630 rq->migration_thread = NULL;
9631 rq->idle_stamp = 0; 7617 rq->idle_stamp = 0;
9632 rq->avg_idle = 2*sysctl_sched_migration_cost; 7618 rq->avg_idle = 2*sysctl_sched_migration_cost;
9633 INIT_LIST_HEAD(&rq->migration_queue);
9634 rq_attach_root(rq, &def_root_domain); 7619 rq_attach_root(rq, &def_root_domain);
9635#endif 7620#endif
9636 init_rq_hrtick(rq); 7621 init_rq_hrtick(rq);
@@ -9697,7 +7682,7 @@ static inline int preempt_count_equals(int preempt_offset)
9697 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7682 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9698} 7683}
9699 7684
9700void __might_sleep(char *file, int line, int preempt_offset) 7685void __might_sleep(const char *file, int line, int preempt_offset)
9701{ 7686{
9702#ifdef in_atomic 7687#ifdef in_atomic
9703 static unsigned long prev_jiffy; /* ratelimiting */ 7688 static unsigned long prev_jiffy; /* ratelimiting */
@@ -9731,7 +7716,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
9731{ 7716{
9732 int on_rq; 7717 int on_rq;
9733 7718
9734 update_rq_clock(rq);
9735 on_rq = p->se.on_rq; 7719 on_rq = p->se.on_rq;
9736 if (on_rq) 7720 if (on_rq)
9737 deactivate_task(rq, p, 0); 7721 deactivate_task(rq, p, 0);
@@ -9758,9 +7742,9 @@ void normalize_rt_tasks(void)
9758 7742
9759 p->se.exec_start = 0; 7743 p->se.exec_start = 0;
9760#ifdef CONFIG_SCHEDSTATS 7744#ifdef CONFIG_SCHEDSTATS
9761 p->se.wait_start = 0; 7745 p->se.statistics.wait_start = 0;
9762 p->se.sleep_start = 0; 7746 p->se.statistics.sleep_start = 0;
9763 p->se.block_start = 0; 7747 p->se.statistics.block_start = 0;
9764#endif 7748#endif
9765 7749
9766 if (!rt_task(p)) { 7750 if (!rt_task(p)) {
@@ -9787,9 +7771,9 @@ void normalize_rt_tasks(void)
9787 7771
9788#endif /* CONFIG_MAGIC_SYSRQ */ 7772#endif /* CONFIG_MAGIC_SYSRQ */
9789 7773
9790#ifdef CONFIG_IA64 7774#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
9791/* 7775/*
9792 * These functions are only useful for the IA64 MCA handling. 7776 * These functions are only useful for the IA64 MCA handling, or kdb.
9793 * 7777 *
9794 * They can only be called when the whole system has been 7778 * They can only be called when the whole system has been
9795 * stopped - every CPU needs to be quiescent, and no scheduling 7779 * stopped - every CPU needs to be quiescent, and no scheduling
@@ -9809,6 +7793,9 @@ struct task_struct *curr_task(int cpu)
9809 return cpu_curr(cpu); 7793 return cpu_curr(cpu);
9810} 7794}
9811 7795
7796#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7797
7798#ifdef CONFIG_IA64
9812/** 7799/**
9813 * set_curr_task - set the current task for a given cpu. 7800 * set_curr_task - set the current task for a given cpu.
9814 * @cpu: the processor in question. 7801 * @cpu: the processor in question.
@@ -10008,7 +7995,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
10008} 7995}
10009#endif /* CONFIG_RT_GROUP_SCHED */ 7996#endif /* CONFIG_RT_GROUP_SCHED */
10010 7997
10011#ifdef CONFIG_GROUP_SCHED 7998#ifdef CONFIG_CGROUP_SCHED
10012static void free_sched_group(struct task_group *tg) 7999static void free_sched_group(struct task_group *tg)
10013{ 8000{
10014 free_fair_sched_group(tg); 8001 free_fair_sched_group(tg);
@@ -10093,8 +8080,6 @@ void sched_move_task(struct task_struct *tsk)
10093 8080
10094 rq = task_rq_lock(tsk, &flags); 8081 rq = task_rq_lock(tsk, &flags);
10095 8082
10096 update_rq_clock(rq);
10097
10098 running = task_current(rq, tsk); 8083 running = task_current(rq, tsk);
10099 on_rq = tsk->se.on_rq; 8084 on_rq = tsk->se.on_rq;
10100 8085
@@ -10117,7 +8102,7 @@ void sched_move_task(struct task_struct *tsk)
10117 8102
10118 task_rq_unlock(rq, &flags); 8103 task_rq_unlock(rq, &flags);
10119} 8104}
10120#endif /* CONFIG_GROUP_SCHED */ 8105#endif /* CONFIG_CGROUP_SCHED */
10121 8106
10122#ifdef CONFIG_FAIR_GROUP_SCHED 8107#ifdef CONFIG_FAIR_GROUP_SCHED
10123static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8108static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10259,13 +8244,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10259 runtime = d->rt_runtime; 8244 runtime = d->rt_runtime;
10260 } 8245 }
10261 8246
10262#ifdef CONFIG_USER_SCHED
10263 if (tg == &root_task_group) {
10264 period = global_rt_period();
10265 runtime = global_rt_runtime();
10266 }
10267#endif
10268
10269 /* 8247 /*
10270 * Cannot have more runtime than the period. 8248 * Cannot have more runtime than the period.
10271 */ 8249 */
@@ -10668,7 +8646,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
10668struct cpuacct { 8646struct cpuacct {
10669 struct cgroup_subsys_state css; 8647 struct cgroup_subsys_state css;
10670 /* cpuusage holds pointer to a u64-type object on every cpu */ 8648 /* cpuusage holds pointer to a u64-type object on every cpu */
10671 u64 *cpuusage; 8649 u64 __percpu *cpuusage;
10672 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8650 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
10673 struct cpuacct *parent; 8651 struct cpuacct *parent;
10674}; 8652};
@@ -10885,12 +8863,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10885} 8863}
10886 8864
10887/* 8865/*
8866 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
8867 * in cputime_t units. As a result, cpuacct_update_stats calls
8868 * percpu_counter_add with values large enough to always overflow the
8869 * per cpu batch limit causing bad SMP scalability.
8870 *
8871 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
8872 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
8873 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
8874 */
8875#ifdef CONFIG_SMP
8876#define CPUACCT_BATCH \
8877 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
8878#else
8879#define CPUACCT_BATCH 0
8880#endif
8881
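The capping arithmetic itself is a simple min(); a userspace sketch with made-up values for percpu_counter_batch and cputime_one_jiffy:

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical values: cputime_one_jiffy is very large when
             * CONFIG_VIRT_CPU_ACCOUNTING uses fine-grained cputime units. */
            long percpu_counter_batch = 32;
            long cputime_one_jiffy = 10000000;
            long scaled = percpu_counter_batch * cputime_one_jiffy;
            int batch = scaled < INT_MAX ? (int)scaled : INT_MAX;

            printf("effective per-cpu batch: %d\n", batch);
            return 0;
    }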
8882/*
10888 * Charge the system/user time to the task's accounting group. 8883 * Charge the system/user time to the task's accounting group.
10889 */ 8884 */
10890static void cpuacct_update_stats(struct task_struct *tsk, 8885static void cpuacct_update_stats(struct task_struct *tsk,
10891 enum cpuacct_stat_index idx, cputime_t val) 8886 enum cpuacct_stat_index idx, cputime_t val)
10892{ 8887{
10893 struct cpuacct *ca; 8888 struct cpuacct *ca;
8889 int batch = CPUACCT_BATCH;
10894 8890
10895 if (unlikely(!cpuacct_subsys.active)) 8891 if (unlikely(!cpuacct_subsys.active))
10896 return; 8892 return;
@@ -10899,7 +8895,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10899 ca = task_ca(tsk); 8895 ca = task_ca(tsk);
10900 8896
10901 do { 8897 do {
10902 percpu_counter_add(&ca->cpustat[idx], val); 8898 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10903 ca = ca->parent; 8899 ca = ca->parent;
10904 } while (ca); 8900 } while (ca);
10905 rcu_read_unlock(); 8901 rcu_read_unlock();
@@ -10916,43 +8912,32 @@ struct cgroup_subsys cpuacct_subsys = {
10916 8912
10917#ifndef CONFIG_SMP 8913#ifndef CONFIG_SMP
10918 8914
10919int rcu_expedited_torture_stats(char *page)
10920{
10921 return 0;
10922}
10923EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10924
10925void synchronize_sched_expedited(void) 8915void synchronize_sched_expedited(void)
10926{ 8916{
8917 barrier();
10927} 8918}
10928EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8919EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10929 8920
10930#else /* #ifndef CONFIG_SMP */ 8921#else /* #ifndef CONFIG_SMP */
10931 8922
10932static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 8923static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
10933static DEFINE_MUTEX(rcu_sched_expedited_mutex);
10934
10935#define RCU_EXPEDITED_STATE_POST -2
10936#define RCU_EXPEDITED_STATE_IDLE -1
10937
10938static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10939 8924
10940int rcu_expedited_torture_stats(char *page) 8925static int synchronize_sched_expedited_cpu_stop(void *data)
10941{ 8926{
10942 int cnt = 0; 8927 /*
10943 int cpu; 8928 * There must be a full memory barrier on each affected CPU
10944 8929 * between the time that try_stop_cpus() is called and the
10945 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 8930 * time that it returns.
10946 for_each_online_cpu(cpu) { 8931 *
10947 cnt += sprintf(&page[cnt], " %d:%d", 8932 * In the current initial implementation of cpu_stop, the
10948 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 8933 * above condition is already met when the control reaches
10949 } 8934 * this point and the following smp_mb() is not strictly
10950 cnt += sprintf(&page[cnt], "\n"); 8935 * necessary. Do smp_mb() anyway for documentation and
10951 return cnt; 8936 * robustness against future implementation changes.
8937 */
8938 smp_mb(); /* See above comment block. */
8939 return 0;
10952} 8940}
10953EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10954
10955static long synchronize_sched_expedited_count;
10956 8941
10957/* 8942/*
10958 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 8943 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -10966,18 +8951,14 @@ static long synchronize_sched_expedited_count;
10966 */ 8951 */
10967void synchronize_sched_expedited(void) 8952void synchronize_sched_expedited(void)
10968{ 8953{
10969 int cpu; 8954 int snap, trycount = 0;
10970 unsigned long flags;
10971 bool need_full_sync = 0;
10972 struct rq *rq;
10973 struct migration_req *req;
10974 long snap;
10975 int trycount = 0;
10976 8955
10977 smp_mb(); /* ensure prior mod happens before capturing snap. */ 8956 smp_mb(); /* ensure prior mod happens before capturing snap. */
10978 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 8957 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
10979 get_online_cpus(); 8958 get_online_cpus();
10980 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 8959 while (try_stop_cpus(cpu_online_mask,
8960 synchronize_sched_expedited_cpu_stop,
8961 NULL) == -EAGAIN) {
10981 put_online_cpus(); 8962 put_online_cpus();
10982 if (trycount++ < 10) 8963 if (trycount++ < 10)
10983 udelay(trycount * num_online_cpus()); 8964 udelay(trycount * num_online_cpus());
@@ -10985,41 +8966,15 @@ void synchronize_sched_expedited(void)
10985 synchronize_sched(); 8966 synchronize_sched();
10986 return; 8967 return;
10987 } 8968 }
10988 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 8969 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
10989 smp_mb(); /* ensure test happens before caller kfree */ 8970 smp_mb(); /* ensure test happens before caller kfree */
10990 return; 8971 return;
10991 } 8972 }
10992 get_online_cpus(); 8973 get_online_cpus();
10993 } 8974 }
10994 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 8975 atomic_inc(&synchronize_sched_expedited_count);
10995 for_each_online_cpu(cpu) { 8976 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
10996 rq = cpu_rq(cpu);
10997 req = &per_cpu(rcu_migration_req, cpu);
10998 init_completion(&req->done);
10999 req->task = NULL;
11000 req->dest_cpu = RCU_MIGRATION_NEED_QS;
11001 raw_spin_lock_irqsave(&rq->lock, flags);
11002 list_add(&req->list, &rq->migration_queue);
11003 raw_spin_unlock_irqrestore(&rq->lock, flags);
11004 wake_up_process(rq->migration_thread);
11005 }
11006 for_each_online_cpu(cpu) {
11007 rcu_expedited_state = cpu;
11008 req = &per_cpu(rcu_migration_req, cpu);
11009 rq = cpu_rq(cpu);
11010 wait_for_completion(&req->done);
11011 raw_spin_lock_irqsave(&rq->lock, flags);
11012 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
11013 need_full_sync = 1;
11014 req->dest_cpu = RCU_MIGRATION_IDLE;
11015 raw_spin_unlock_irqrestore(&rq->lock, flags);
11016 }
11017 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
11018 synchronize_sched_expedited_count++;
11019 mutex_unlock(&rcu_sched_expedited_mutex);
11020 put_online_cpus(); 8977 put_online_cpus();
11021 if (need_full_sync)
11022 synchronize_sched();
11023} 8978}
11024EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8979EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
11025 8980
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 5b496132c28a..906a0f718cb3 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -41,6 +41,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
41 return (unsigned long long)(jiffies - INITIAL_JIFFIES) 41 return (unsigned long long)(jiffies - INITIAL_JIFFIES)
42 * (NSEC_PER_SEC / HZ); 42 * (NSEC_PER_SEC / HZ);
43} 43}
44EXPORT_SYMBOL_GPL(sched_clock);
44 45
45static __read_mostly int sched_clock_running; 46static __read_mostly int sched_clock_running;
46 47
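The new export matters for GPL modules, which can now call sched_clock() directly; a minimal module-style sketch (the "sc_demo" names are invented):

    #include <linux/module.h>
    #include <linux/sched.h>

    static int __init sc_demo_init(void)
    {
            unsigned long long now = sched_clock();

            pr_info("sched_clock() = %llu ns\n", now);
            return 0;
    }

    static void __exit sc_demo_exit(void)
    {
    }

    module_init(sc_demo_init);
    module_exit(sc_demo_exit);
    MODULE_LICENSE("GPL");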
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 597b33099dfa..e6871cb3fc83 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -27,6 +27,7 @@
27 * of the License. 27 * of the License.
28 */ 28 */
29 29
30#include <linux/gfp.h>
30#include "sched_cpupri.h" 31#include "sched_cpupri.h"
31 32
32/* Convert between a 140 based task->prio, and our 102 based cpupri */ 33/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -47,9 +48,7 @@ static int convert_prio(int prio)
47} 48}
48 49
49#define for_each_cpupri_active(array, idx) \ 50#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ 51 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53 52
54/** 53/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system 54 * cpupri_find - find the best (lowest-pri) CPU in the system
@@ -58,7 +57,7 @@ static int convert_prio(int prio)
58 * @lowest_mask: A mask to fill in with selected CPUs (or NULL) 57 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
59 * 58 *
60 * Note: This function returns the recommended CPUs as calculated during the 59 * Note: This function returns the recommended CPUs as calculated during the
61 * current invokation. By the time the call returns, the CPUs may have in 60 * current invocation. By the time the call returns, the CPUs may have in
62 * fact changed priorities any number of times. While not ideal, it is not 61 * fact changed priorities any number of times. While not ideal, it is not
63 * an issue of correctness since the normal rebalancer logic will correct 62 * an issue of correctness since the normal rebalancer logic will correct
64 * any discrepancies created by racing against the uncertainty of the current 63 * any discrepancies created by racing against the uncertainty of the current
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 67f95aada4b9..35565395d00d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
70 PN(se->vruntime); 70 PN(se->vruntime);
71 PN(se->sum_exec_runtime); 71 PN(se->sum_exec_runtime);
72#ifdef CONFIG_SCHEDSTATS 72#ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start); 73 PN(se->statistics.wait_start);
74 PN(se->sleep_start); 74 PN(se->statistics.sleep_start);
75 PN(se->block_start); 75 PN(se->statistics.block_start);
76 PN(se->sleep_max); 76 PN(se->statistics.sleep_max);
77 PN(se->block_max); 77 PN(se->statistics.block_max);
78 PN(se->exec_max); 78 PN(se->statistics.exec_max);
79 PN(se->slice_max); 79 PN(se->statistics.slice_max);
80 PN(se->wait_max); 80 PN(se->statistics.wait_max);
81 PN(se->wait_sum); 81 PN(se->statistics.wait_sum);
82 P(se->wait_count); 82 P(se->statistics.wait_count);
83#endif 83#endif
84 P(se->load.weight); 84 P(se->load.weight);
85#undef PN 85#undef PN
@@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
105 SPLIT_NS(p->se.vruntime), 105 SPLIT_NS(p->se.vruntime),
106 SPLIT_NS(p->se.sum_exec_runtime), 106 SPLIT_NS(p->se.sum_exec_runtime),
107 SPLIT_NS(p->se.sum_sleep_runtime)); 107 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
108#else 108#else
109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
@@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
114 { 114 {
115 char path[64]; 115 char path[64];
116 116
117 rcu_read_lock();
117 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); 118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
118 SEQ_printf(m, " %s", path); 120 SEQ_printf(m, " %s", path);
119 } 121 }
120#endif 122#endif
@@ -173,11 +175,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
173 task_group_path(tg, path, sizeof(path)); 175 task_group_path(tg, path, sizeof(path));
174 176
175 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
176#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
177 {
178 uid_t uid = cfs_rq->tg->uid;
179 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
180 }
181#else 178#else
182 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
183#endif 180#endif
@@ -384,15 +381,9 @@ __initcall(init_sched_debug_procfs);
384void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 381void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
385{ 382{
386 unsigned long nr_switches; 383 unsigned long nr_switches;
387 unsigned long flags;
388 int num_threads = 1;
389
390 if (lock_task_sighand(p, &flags)) {
391 num_threads = atomic_read(&p->signal->count);
392 unlock_task_sighand(p, &flags);
393 }
394 384
395 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); 385 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
386 get_nr_threads(p));
396 SEQ_printf(m, 387 SEQ_printf(m,
397 "---------------------------------------------------------\n"); 388 "---------------------------------------------------------\n");
398#define __P(F) \ 389#define __P(F) \
@@ -407,40 +398,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
407 PN(se.exec_start); 398 PN(se.exec_start);
408 PN(se.vruntime); 399 PN(se.vruntime);
409 PN(se.sum_exec_runtime); 400 PN(se.sum_exec_runtime);
410 PN(se.avg_overlap);
411 PN(se.avg_wakeup);
412 401
413 nr_switches = p->nvcsw + p->nivcsw; 402 nr_switches = p->nvcsw + p->nivcsw;
414 403
415#ifdef CONFIG_SCHEDSTATS 404#ifdef CONFIG_SCHEDSTATS
416 PN(se.wait_start); 405 PN(se.statistics.wait_start);
417 PN(se.sleep_start); 406 PN(se.statistics.sleep_start);
418 PN(se.block_start); 407 PN(se.statistics.block_start);
419 PN(se.sleep_max); 408 PN(se.statistics.sleep_max);
420 PN(se.block_max); 409 PN(se.statistics.block_max);
421 PN(se.exec_max); 410 PN(se.statistics.exec_max);
422 PN(se.slice_max); 411 PN(se.statistics.slice_max);
423 PN(se.wait_max); 412 PN(se.statistics.wait_max);
424 PN(se.wait_sum); 413 PN(se.statistics.wait_sum);
425 P(se.wait_count); 414 P(se.statistics.wait_count);
426 PN(se.iowait_sum); 415 PN(se.statistics.iowait_sum);
427 P(se.iowait_count); 416 P(se.statistics.iowait_count);
428 P(sched_info.bkl_count); 417 P(sched_info.bkl_count);
429 P(se.nr_migrations); 418 P(se.nr_migrations);
430 P(se.nr_migrations_cold); 419 P(se.statistics.nr_migrations_cold);
431 P(se.nr_failed_migrations_affine); 420 P(se.statistics.nr_failed_migrations_affine);
432 P(se.nr_failed_migrations_running); 421 P(se.statistics.nr_failed_migrations_running);
433 P(se.nr_failed_migrations_hot); 422 P(se.statistics.nr_failed_migrations_hot);
434 P(se.nr_forced_migrations); 423 P(se.statistics.nr_forced_migrations);
435 P(se.nr_wakeups); 424 P(se.statistics.nr_wakeups);
436 P(se.nr_wakeups_sync); 425 P(se.statistics.nr_wakeups_sync);
437 P(se.nr_wakeups_migrate); 426 P(se.statistics.nr_wakeups_migrate);
438 P(se.nr_wakeups_local); 427 P(se.statistics.nr_wakeups_local);
439 P(se.nr_wakeups_remote); 428 P(se.statistics.nr_wakeups_remote);
440 P(se.nr_wakeups_affine); 429 P(se.statistics.nr_wakeups_affine);
441 P(se.nr_wakeups_affine_attempts); 430 P(se.statistics.nr_wakeups_affine_attempts);
442 P(se.nr_wakeups_passive); 431 P(se.statistics.nr_wakeups_passive);
443 P(se.nr_wakeups_idle); 432 P(se.statistics.nr_wakeups_idle);
444 433
445 { 434 {
446 u64 avg_atom, avg_per_cpu; 435 u64 avg_atom, avg_per_cpu;
@@ -491,35 +480,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
491void proc_sched_set_task(struct task_struct *p) 480void proc_sched_set_task(struct task_struct *p)
492{ 481{
493#ifdef CONFIG_SCHEDSTATS 482#ifdef CONFIG_SCHEDSTATS
494 p->se.wait_max = 0; 483 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
495 p->se.wait_sum = 0;
496 p->se.wait_count = 0;
497 p->se.iowait_sum = 0;
498 p->se.iowait_count = 0;
499 p->se.sleep_max = 0;
500 p->se.sum_sleep_runtime = 0;
501 p->se.block_max = 0;
502 p->se.exec_max = 0;
503 p->se.slice_max = 0;
504 p->se.nr_migrations = 0;
505 p->se.nr_migrations_cold = 0;
506 p->se.nr_failed_migrations_affine = 0;
507 p->se.nr_failed_migrations_running = 0;
508 p->se.nr_failed_migrations_hot = 0;
509 p->se.nr_forced_migrations = 0;
510 p->se.nr_wakeups = 0;
511 p->se.nr_wakeups_sync = 0;
512 p->se.nr_wakeups_migrate = 0;
513 p->se.nr_wakeups_local = 0;
514 p->se.nr_wakeups_remote = 0;
515 p->se.nr_wakeups_affine = 0;
516 p->se.nr_wakeups_affine_attempts = 0;
517 p->se.nr_wakeups_passive = 0;
518 p->se.nr_wakeups_idle = 0;
519 p->sched_info.bkl_count = 0;
520#endif 484#endif
521 p->se.sum_exec_runtime = 0;
522 p->se.prev_sum_exec_runtime = 0;
523 p->nvcsw = 0;
524 p->nivcsw = 0;
525} 485}
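The memset path above is reachable from userspace: with CONFIG_SCHED_DEBUG enabled, writing anything to /proc/<pid>/sched clears that task's schedstats (the written string is irrelevant). A small sketch that resets and then dumps the caller's own statistics:

    #include <stdio.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/proc/self/sched", "w");

            if (f) {
                    fputs("0\n", f);        /* any write resets the statistics */
                    fclose(f);
            }

            f = fopen("/proc/self/sched", "r");
            if (!f) {
                    perror("/proc/self/sched");
                    return 1;
            }
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);
            fclose(f);
            return 0;
    }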
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8fe7ee81c552..a878b5332daa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -35,8 +35,8 @@
35 * (to see the precise effective timeslice length of your workload, 35 * (to see the precise effective timeslice length of your workload,
36 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
37 */ 37 */
38unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 6000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL; 39unsigned int normalized_sysctl_sched_latency = 6000000ULL;
40 40
41/* 41/*
42 * The initial- and re-scaling of tunables is configurable 42 * The initial- and re-scaling of tunables is configurable
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 2000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
59 59
60/* 60/*
61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
62 */ 62 */
63static unsigned int sched_nr_latency = 5; 63static unsigned int sched_nr_latency = 3;
64 64
65/* 65/*
66 * After fork, child runs first. If set to 0 (default) then 66 * After fork, child runs first. If set to 0 (default) then
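These defaults are visible (and tunable) through sysctls when CONFIG_SCHED_DEBUG is enabled; a small userspace sketch that reads them back and prints the latency/granularity ratio that sched_nr_latency is kept equal to:

    #include <stdio.h>

    static long read_long(const char *path)
    {
            long val = -1;
            FILE *f = fopen(path, "r");

            if (f) {
                    if (fscanf(f, "%ld", &val) != 1)
                            val = -1;
                    fclose(f);
            }
            return val;
    }

    int main(void)
    {
            long latency = read_long("/proc/sys/kernel/sched_latency_ns");
            long gran = read_long("/proc/sys/kernel/sched_min_granularity_ns");

            if (latency <= 0 || gran <= 0) {
                    fprintf(stderr, "scheduler debug sysctls not available\n");
                    return 1;
            }
            printf("latency=%ldns granularity=%ldns nr_latency=%ld\n",
                   latency, gran, latency / gran);
            return 0;
    }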
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
505{ 505{
506 unsigned long delta_exec_weighted; 506 unsigned long delta_exec_weighted;
507 507
508 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); 508 schedstat_set(curr->statistics.exec_max,
509 max((u64)delta_exec, curr->statistics.exec_max));
509 510
510 curr->sum_exec_runtime += delta_exec; 511 curr->sum_exec_runtime += delta_exec;
511 schedstat_add(cfs_rq, exec_clock, delta_exec); 512 schedstat_add(cfs_rq, exec_clock, delta_exec);
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
548static inline void 549static inline void
549update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 550update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
550{ 551{
551 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); 552 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
552} 553}
553 554
554/* 555/*
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
567static void 568static void
568update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 569update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
569{ 570{
570 schedstat_set(se->wait_max, max(se->wait_max, 571 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
571 rq_of(cfs_rq)->clock - se->wait_start)); 572 rq_of(cfs_rq)->clock - se->statistics.wait_start));
572 schedstat_set(se->wait_count, se->wait_count + 1); 573 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
573 schedstat_set(se->wait_sum, se->wait_sum + 574 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
574 rq_of(cfs_rq)->clock - se->wait_start); 575 rq_of(cfs_rq)->clock - se->statistics.wait_start);
575#ifdef CONFIG_SCHEDSTATS 576#ifdef CONFIG_SCHEDSTATS
576 if (entity_is_task(se)) { 577 if (entity_is_task(se)) {
577 trace_sched_stat_wait(task_of(se), 578 trace_sched_stat_wait(task_of(se),
578 rq_of(cfs_rq)->clock - se->wait_start); 579 rq_of(cfs_rq)->clock - se->statistics.wait_start);
579 } 580 }
580#endif 581#endif
581 schedstat_set(se->wait_start, 0); 582 schedstat_set(se->statistics.wait_start, 0);
582} 583}
583 584
584static inline void 585static inline void
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
657 if (entity_is_task(se)) 658 if (entity_is_task(se))
658 tsk = task_of(se); 659 tsk = task_of(se);
659 660
660 if (se->sleep_start) { 661 if (se->statistics.sleep_start) {
661 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 662 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
662 663
663 if ((s64)delta < 0) 664 if ((s64)delta < 0)
664 delta = 0; 665 delta = 0;
665 666
666 if (unlikely(delta > se->sleep_max)) 667 if (unlikely(delta > se->statistics.sleep_max))
667 se->sleep_max = delta; 668 se->statistics.sleep_max = delta;
668 669
669 se->sleep_start = 0; 670 se->statistics.sleep_start = 0;
670 se->sum_sleep_runtime += delta; 671 se->statistics.sum_sleep_runtime += delta;
671 672
672 if (tsk) { 673 if (tsk) {
673 account_scheduler_latency(tsk, delta >> 10, 1); 674 account_scheduler_latency(tsk, delta >> 10, 1);
674 trace_sched_stat_sleep(tsk, delta); 675 trace_sched_stat_sleep(tsk, delta);
675 } 676 }
676 } 677 }
677 if (se->block_start) { 678 if (se->statistics.block_start) {
678 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 679 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
679 680
680 if ((s64)delta < 0) 681 if ((s64)delta < 0)
681 delta = 0; 682 delta = 0;
682 683
683 if (unlikely(delta > se->block_max)) 684 if (unlikely(delta > se->statistics.block_max))
684 se->block_max = delta; 685 se->statistics.block_max = delta;
685 686
686 se->block_start = 0; 687 se->statistics.block_start = 0;
687 se->sum_sleep_runtime += delta; 688 se->statistics.sum_sleep_runtime += delta;
688 689
689 if (tsk) { 690 if (tsk) {
690 if (tsk->in_iowait) { 691 if (tsk->in_iowait) {
691 se->iowait_sum += delta; 692 se->statistics.iowait_sum += delta;
692 se->iowait_count++; 693 se->statistics.iowait_count++;
693 trace_sched_stat_iowait(tsk, delta); 694 trace_sched_stat_iowait(tsk, delta);
694 } 695 }
695 696
@@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
737 vruntime += sched_vslice(cfs_rq, se); 738 vruntime += sched_vslice(cfs_rq, se);
738 739
739 /* sleeps up to a single latency don't count. */ 740 /* sleeps up to a single latency don't count. */
740 if (!initial && sched_feat(FAIR_SLEEPERS)) { 741 if (!initial) {
741 unsigned long thresh = sysctl_sched_latency; 742 unsigned long thresh = sysctl_sched_latency;
742 743
743 /* 744 /*
744 * Convert the sleeper threshold into virtual time.
745 * SCHED_IDLE is a special sub-class. We care about
746 * fairness only relative to other SCHED_IDLE tasks,
747 * all of which have the same weight.
748 */
749 if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
750 task_of(se)->policy != SCHED_IDLE))
751 thresh = calc_delta_fair(thresh, se);
752
753 /*
754 * Halve their sleep time's effect, to allow 745 * Halve their sleep time's effect, to allow
755 * for a gentler effect of sleepers: 746 * for a gentler effect of sleepers:
756 */ 747 */
@@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
766 se->vruntime = vruntime; 757 se->vruntime = vruntime;
767} 758}
768 759
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
772static void 760static void
773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 761enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
774{ 762{
@@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
776 * Update the normalized vruntime before updating min_vruntime 764 * Update the normalized vruntime before updating min_vruntime
 777 * through calling update_curr(). 765 * through calling update_curr().
778 */ 766 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) 767 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
780 se->vruntime += cfs_rq->min_vruntime; 768 se->vruntime += cfs_rq->min_vruntime;
781 769
782 /* 770 /*
@@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
812} 800}
813 801
814static void 802static void
815dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 803dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
816{ 804{
817 /* 805 /*
818 * Update run-time statistics of the 'current'. 806 * Update run-time statistics of the 'current'.
@@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
820 update_curr(cfs_rq); 808 update_curr(cfs_rq);
821 809
822 update_stats_dequeue(cfs_rq, se); 810 update_stats_dequeue(cfs_rq, se);
823 if (sleep) { 811 if (flags & DEQUEUE_SLEEP) {
824#ifdef CONFIG_SCHEDSTATS 812#ifdef CONFIG_SCHEDSTATS
825 if (entity_is_task(se)) { 813 if (entity_is_task(se)) {
826 struct task_struct *tsk = task_of(se); 814 struct task_struct *tsk = task_of(se);
827 815
828 if (tsk->state & TASK_INTERRUPTIBLE) 816 if (tsk->state & TASK_INTERRUPTIBLE)
829 se->sleep_start = rq_of(cfs_rq)->clock; 817 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
830 if (tsk->state & TASK_UNINTERRUPTIBLE) 818 if (tsk->state & TASK_UNINTERRUPTIBLE)
831 se->block_start = rq_of(cfs_rq)->clock; 819 se->statistics.block_start = rq_of(cfs_rq)->clock;
832 } 820 }
833#endif 821#endif
834 } 822 }
@@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
845 * update can refer to the ->curr item and we need to reflect this 833 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position. 834 * movement in our normalized position.
847 */ 835 */
848 if (!sleep) 836 if (!(flags & DEQUEUE_SLEEP))
849 se->vruntime -= cfs_rq->min_vruntime; 837 se->vruntime -= cfs_rq->min_vruntime;
850} 838}
851 839
@@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
912 * when there are only lesser-weight tasks around): 900 * when there are only lesser-weight tasks around):
913 */ 901 */
914 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 902 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
915 se->slice_max = max(se->slice_max, 903 se->statistics.slice_max = max(se->statistics.slice_max,
916 se->sum_exec_runtime - se->prev_sum_exec_runtime); 904 se->sum_exec_runtime - se->prev_sum_exec_runtime);
917 } 905 }
918#endif 906#endif
@@ -1053,16 +1041,11 @@ static inline void hrtick_update(struct rq *rq)
1053 * increased. Here we update the fair scheduling stats and 1041 * increased. Here we update the fair scheduling stats and
1054 * then put the task into the rbtree: 1042 * then put the task into the rbtree:
1055 */ 1043 */
1056static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 1044static void
1045enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1057{ 1046{
1058 struct cfs_rq *cfs_rq; 1047 struct cfs_rq *cfs_rq;
1059 struct sched_entity *se = &p->se; 1048 struct sched_entity *se = &p->se;
1060 int flags = 0;
1061
1062 if (wakeup)
1063 flags |= ENQUEUE_WAKEUP;
1064 if (p->state == TASK_WAKING)
1065 flags |= ENQUEUE_MIGRATE;
1066 1049
1067 for_each_sched_entity(se) { 1050 for_each_sched_entity(se) {
1068 if (se->on_rq) 1051 if (se->on_rq)
@@ -1080,18 +1063,18 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
1080 * decreased. We remove the task from the rbtree and 1063 * decreased. We remove the task from the rbtree and
1081 * update the fair scheduling stats: 1064 * update the fair scheduling stats:
1082 */ 1065 */
1083static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) 1066static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1084{ 1067{
1085 struct cfs_rq *cfs_rq; 1068 struct cfs_rq *cfs_rq;
1086 struct sched_entity *se = &p->se; 1069 struct sched_entity *se = &p->se;
1087 1070
1088 for_each_sched_entity(se) { 1071 for_each_sched_entity(se) {
1089 cfs_rq = cfs_rq_of(se); 1072 cfs_rq = cfs_rq_of(se);
1090 dequeue_entity(cfs_rq, se, sleep); 1073 dequeue_entity(cfs_rq, se, flags);
1091 /* Don't dequeue parent if it has other entities besides us */ 1074 /* Don't dequeue parent if it has other entities besides us */
1092 if (cfs_rq->load.weight) 1075 if (cfs_rq->load.weight)
1093 break; 1076 break;
1094 sleep = 1; 1077 flags |= DEQUEUE_SLEEP;
1095 } 1078 }
1096 1079
1097 hrtick_update(rq); 1080 hrtick_update(rq);
@@ -1239,11 +1222,9 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1239 1222
1240static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1223static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1241{ 1224{
1242 struct task_struct *curr = current;
1243 unsigned long this_load, load; 1225 unsigned long this_load, load;
1244 int idx, this_cpu, prev_cpu; 1226 int idx, this_cpu, prev_cpu;
1245 unsigned long tl_per_task; 1227 unsigned long tl_per_task;
1246 unsigned int imbalance;
1247 struct task_group *tg; 1228 struct task_group *tg;
1248 unsigned long weight; 1229 unsigned long weight;
1249 int balanced; 1230 int balanced;
@@ -1254,23 +1235,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1254 load = source_load(prev_cpu, idx); 1235 load = source_load(prev_cpu, idx);
1255 this_load = target_load(this_cpu, idx); 1236 this_load = target_load(this_cpu, idx);
1256 1237
1257 if (sync) {
1258 if (sched_feat(SYNC_LESS) &&
1259 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1260 p->se.avg_overlap > sysctl_sched_migration_cost))
1261 sync = 0;
1262 } else {
1263 if (sched_feat(SYNC_MORE) &&
1264 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1265 p->se.avg_overlap < sysctl_sched_migration_cost))
1266 sync = 1;
1267 }
1268
1269 /* 1238 /*
1270 * If sync wakeup then subtract the (maximum possible) 1239 * If sync wakeup then subtract the (maximum possible)
1271 * effect of the currently running task from the load 1240 * effect of the currently running task from the load
1272 * of the current CPU: 1241 * of the current CPU:
1273 */ 1242 */
1243 rcu_read_lock();
1274 if (sync) { 1244 if (sync) {
1275 tg = task_group(current); 1245 tg = task_group(current);
1276 weight = current->se.load.weight; 1246 weight = current->se.load.weight;
@@ -1282,8 +1252,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1282 tg = task_group(p); 1252 tg = task_group(p);
1283 weight = p->se.load.weight; 1253 weight = p->se.load.weight;
1284 1254
1285 imbalance = 100 + (sd->imbalance_pct - 100) / 2;
1286
1287 /* 1255 /*
1288 * In low-load situations, where prev_cpu is idle and this_cpu is idle 1256 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1289 * due to the sync cause above having dropped this_load to 0, we'll 1257 * due to the sync cause above having dropped this_load to 0, we'll
@@ -1293,9 +1261,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1293 * Otherwise check if either cpus are near enough in load to allow this 1261 * Otherwise check if either cpus are near enough in load to allow this
1294 * task to be woken on this_cpu. 1262 * task to be woken on this_cpu.
1295 */ 1263 */
1296 balanced = !this_load || 1264 if (this_load) {
1297 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= 1265 unsigned long this_eff_load, prev_eff_load;
1298 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1266
1267 this_eff_load = 100;
1268 this_eff_load *= power_of(prev_cpu);
1269 this_eff_load *= this_load +
1270 effective_load(tg, this_cpu, weight, weight);
1271
1272 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
1273 prev_eff_load *= power_of(this_cpu);
1274 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
1275
1276 balanced = this_eff_load <= prev_eff_load;
1277 } else
1278 balanced = true;
1279 rcu_read_unlock();
1299 1280
1300 /* 1281 /*
1301 * If the currently running task will sleep within 1282 * If the currently running task will sleep within
@@ -1305,7 +1286,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1305 if (sync && balanced) 1286 if (sync && balanced)
1306 return 1; 1287 return 1;
1307 1288
1308 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1289 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
1309 tl_per_task = cpu_avg_load_per_task(this_cpu); 1290 tl_per_task = cpu_avg_load_per_task(this_cpu);
1310 1291
1311 if (balanced || 1292 if (balanced ||
@@ -1317,7 +1298,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1317 * there is no bad imbalance. 1298 * there is no bad imbalance.
1318 */ 1299 */
1319 schedstat_inc(sd, ttwu_move_affine); 1300 schedstat_inc(sd, ttwu_move_affine);
1320 schedstat_inc(p, se.nr_wakeups_affine); 1301 schedstat_inc(p, se.statistics.nr_wakeups_affine);
1321 1302
1322 return 1; 1303 return 1;
1323 } 1304 }
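The rewritten balance test above replaces the old single imbalance_pct comparison with two power-weighted products, cross-multiplying each side by the other CPU's power instead of dividing by its own. A standalone sketch of that arithmetic (plain C; the effective_load() group-scheduling correction is reduced to simply adding the task's weight on the pulling side, and the figures in main() are invented):

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative sketch, not kernel code: the wake_affine() balance test.
 * Each side is scaled by the *other* CPU's power (cross-multiplication
 * avoids a division), and the previous CPU keeps the usual imbalance_pct
 * headroom.
 */
static bool wake_affine_balanced(unsigned long this_load,
                                 unsigned long prev_load,
                                 unsigned long this_power,
                                 unsigned long prev_power,
                                 unsigned int imbalance_pct,
                                 unsigned long task_weight)
{
        unsigned long this_eff_load, prev_eff_load;

        if (!this_load)                 /* this CPU carries nothing: balanced */
                return true;

        /* pulling the task adds its weight to this CPU's load */
        this_eff_load  = 100;
        this_eff_load *= prev_power;
        this_eff_load *= this_load + task_weight;

        /* the previous CPU keeps its load plus the imbalance_pct margin */
        prev_eff_load  = 100 + (imbalance_pct - 100) / 2;
        prev_eff_load *= this_power;
        prev_eff_load *= prev_load;

        return this_eff_load <= prev_eff_load;
}

int main(void)
{
        /* both CPUs at full power (1024), imbalance_pct 125, nice-0 task */
        printf("%d\n", wake_affine_balanced(512, 2048, 1024, 1024, 125, 1024));
        return 0;
}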
@@ -1405,29 +1386,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1405/* 1386/*
1406 * Try and locate an idle CPU in the sched_domain. 1387 * Try and locate an idle CPU in the sched_domain.
1407 */ 1388 */
1408static int 1389static int select_idle_sibling(struct task_struct *p, int target)
1409select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1410{ 1390{
1411 int cpu = smp_processor_id(); 1391 int cpu = smp_processor_id();
1412 int prev_cpu = task_cpu(p); 1392 int prev_cpu = task_cpu(p);
1393 struct sched_domain *sd;
1413 int i; 1394 int i;
1414 1395
1415 /* 1396 /*
1416 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE 1397 * If the task is going to be woken-up on this cpu and if it is
1417 * test in select_task_rq_fair) and the prev_cpu is idle then that's 1398 * already idle, then it is the right target.
1418 * always a better target than the current cpu.
1419 */ 1399 */
1420 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) 1400 if (target == cpu && idle_cpu(cpu))
1401 return cpu;
1402
1403 /*
1404 * If the task is going to be woken-up on the cpu where it previously
 1405 * ran and if it is currently idle, then it is the right target.
1406 */
1407 if (target == prev_cpu && idle_cpu(prev_cpu))
1421 return prev_cpu; 1408 return prev_cpu;
1422 1409
1423 /* 1410 /*
 1424 * Otherwise, iterate the domain and find an eligible idle cpu. 1411 * Otherwise, iterate the domains and find an eligible idle cpu.
1425 */ 1412 */
1426 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 1413 for_each_domain(target, sd) {
1427 if (!cpu_rq(i)->cfs.nr_running) { 1414 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1428 target = i;
1429 break; 1415 break;
1416
1417 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1418 if (idle_cpu(i)) {
1419 target = i;
1420 break;
1421 }
1430 } 1422 }
1423
1424 /*
 1425 * Let's stop looking for an idle sibling once we reach
1426 * the domain that spans the current cpu and prev_cpu.
1427 */
1428 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
1429 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1430 break;
1431 } 1431 }
1432 1432
1433 return target; 1433 return target;
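select_idle_sibling() now walks the domains around the target itself rather than taking a single domain argument, and only keeps looking while the domain shares package resources. A simplified model of that walk (plain C; domains are reduced to cpu bitmasks plus a share-package-resources flag, idle state is a lookup table, and cpus_allowed is ignored):

#include <stdbool.h>
#include <stdio.h>

struct domain {
        unsigned long span;             /* bitmask of cpus in the domain */
        bool share_pkg_resources;       /* stand-in for SD_SHARE_PKG_RESOURCES */
};

/* Illustrative sketch, not kernel code: domains ordered smallest first. */
static int select_idle_sibling(const bool *cpu_idle, const struct domain *doms,
                               int nr_doms, int cpu, int prev_cpu, int target)
{
        /* woken on an already idle cpu: it is the right target */
        if (target == cpu && cpu_idle[cpu])
                return cpu;
        if (target == prev_cpu && cpu_idle[prev_cpu])
                return prev_cpu;

        for (int d = 0; d < nr_doms; d++) {
                if (!doms[d].share_pkg_resources)
                        break;          /* crossing a cache boundary: give up */

                for (int i = 0; i < (int)(8 * sizeof(unsigned long)); i++) {
                        if ((doms[d].span & (1UL << i)) && cpu_idle[i]) {
                                target = i;
                                break;
                        }
                }

                /* stop at the domain that spans both cpu and prev_cpu */
                if ((doms[d].span & (1UL << cpu)) &&
                    (doms[d].span & (1UL << prev_cpu)))
                        break;
        }

        return target;
}

int main(void)
{
        bool idle[4] = { false, true, false, false };   /* only cpu1 is idle */
        struct domain doms[] = {
                { 0x3, true  },         /* {cpu0, cpu1} share a cache */
                { 0xf, false },         /* {cpu0..cpu3} do not        */
        };
        printf("%d\n", select_idle_sibling(idle, doms, 2, 0, 2, 0));
        return 0;
}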
@@ -1444,7 +1444,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1444 * 1444 *
1445 * preempt must be disabled. 1445 * preempt must be disabled.
1446 */ 1446 */
1447static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 1447static int
1448select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
1448{ 1449{
1449 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1450 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1450 int cpu = smp_processor_id(); 1451 int cpu = smp_processor_id();
@@ -1455,8 +1456,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1455 int sync = wake_flags & WF_SYNC; 1456 int sync = wake_flags & WF_SYNC;
1456 1457
1457 if (sd_flag & SD_BALANCE_WAKE) { 1458 if (sd_flag & SD_BALANCE_WAKE) {
1458 if (sched_feat(AFFINE_WAKEUPS) && 1459 if (cpumask_test_cpu(cpu, &p->cpus_allowed))
1459 cpumask_test_cpu(cpu, &p->cpus_allowed))
1460 want_affine = 1; 1460 want_affine = 1;
1461 new_cpu = prev_cpu; 1461 new_cpu = prev_cpu;
1462 } 1462 }
@@ -1490,34 +1490,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1490 } 1490 }
1491 1491
1492 /* 1492 /*
1493 * While iterating the domains looking for a spanning 1493 * If both cpu and prev_cpu are part of this domain,
1494 * WAKE_AFFINE domain, adjust the affine target to any idle cpu 1494 * cpu is a valid SD_WAKE_AFFINE target.
1495 * in cache sharing domains along the way.
1496 */ 1495 */
1497 if (want_affine) { 1496 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1498 int target = -1; 1497 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1499 1498 affine_sd = tmp;
1500 /* 1499 want_affine = 0;
1501 * If both cpu and prev_cpu are part of this domain,
1502 * cpu is a valid SD_WAKE_AFFINE target.
1503 */
1504 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1505 target = cpu;
1506
1507 /*
1508 * If there's an idle sibling in this domain, make that
1509 * the wake_affine target instead of the current cpu.
1510 */
1511 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1512 target = select_idle_sibling(p, tmp, target);
1513
1514 if (target >= 0) {
1515 if (tmp->flags & SD_WAKE_AFFINE) {
1516 affine_sd = tmp;
1517 want_affine = 0;
1518 }
1519 cpu = target;
1520 }
1521 } 1500 }
1522 1501
1523 if (!want_sd && !want_affine) 1502 if (!want_sd && !want_affine)
@@ -1530,22 +1509,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1530 sd = tmp; 1509 sd = tmp;
1531 } 1510 }
1532 1511
1512#ifdef CONFIG_FAIR_GROUP_SCHED
1533 if (sched_feat(LB_SHARES_UPDATE)) { 1513 if (sched_feat(LB_SHARES_UPDATE)) {
1534 /* 1514 /*
1535 * Pick the largest domain to update shares over 1515 * Pick the largest domain to update shares over
1536 */ 1516 */
1537 tmp = sd; 1517 tmp = sd;
1538 if (affine_sd && (!tmp || 1518 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1539 cpumask_weight(sched_domain_span(affine_sd)) >
1540 cpumask_weight(sched_domain_span(sd))))
1541 tmp = affine_sd; 1519 tmp = affine_sd;
1542 1520
1543 if (tmp) 1521 if (tmp) {
1522 raw_spin_unlock(&rq->lock);
1544 update_shares(tmp); 1523 update_shares(tmp);
1524 raw_spin_lock(&rq->lock);
1525 }
1545 } 1526 }
1527#endif
1546 1528
1547 if (affine_sd && wake_affine(affine_sd, p, sync)) 1529 if (affine_sd) {
1548 return cpu; 1530 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1531 return select_idle_sibling(p, cpu);
1532 else
1533 return select_idle_sibling(p, prev_cpu);
1534 }
1549 1535
1550 while (sd) { 1536 while (sd) {
1551 int load_idx = sd->forkexec_idx; 1537 int load_idx = sd->forkexec_idx;
@@ -1575,10 +1561,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1575 1561
1576 /* Now try balancing at a lower domain level of new_cpu */ 1562 /* Now try balancing at a lower domain level of new_cpu */
1577 cpu = new_cpu; 1563 cpu = new_cpu;
1578 weight = cpumask_weight(sched_domain_span(sd)); 1564 weight = sd->span_weight;
1579 sd = NULL; 1565 sd = NULL;
1580 for_each_domain(cpu, tmp) { 1566 for_each_domain(cpu, tmp) {
1581 if (weight <= cpumask_weight(sched_domain_span(tmp))) 1567 if (weight <= tmp->span_weight)
1582 break; 1568 break;
1583 if (tmp->flags & sd_flag) 1569 if (tmp->flags & sd_flag)
1584 sd = tmp; 1570 sd = tmp;
@@ -1590,63 +1576,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1590} 1576}
1591#endif /* CONFIG_SMP */ 1577#endif /* CONFIG_SMP */
1592 1578
1593/*
1594 * Adaptive granularity
1595 *
1596 * se->avg_wakeup gives the average time a task runs until it does a wakeup,
1597 * with the limit of wakeup_gran -- when it never does a wakeup.
1598 *
1599 * So the smaller avg_wakeup is the faster we want this task to preempt,
1600 * but we don't want to treat the preemptee unfairly and therefore allow it
1601 * to run for at least the amount of time we'd like to run.
1602 *
1603 * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
1604 *
1605 * NOTE: we use *nr_running to scale with load, this nicely matches the
1606 * degrading latency on load.
1607 */
1608static unsigned long
1609adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
1610{
1611 u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1612 u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
1613 u64 gran = 0;
1614
1615 if (this_run < expected_wakeup)
1616 gran = expected_wakeup - this_run;
1617
1618 return min_t(s64, gran, sysctl_sched_wakeup_granularity);
1619}
1620
1621static unsigned long 1579static unsigned long
1622wakeup_gran(struct sched_entity *curr, struct sched_entity *se) 1580wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1623{ 1581{
1624 unsigned long gran = sysctl_sched_wakeup_granularity; 1582 unsigned long gran = sysctl_sched_wakeup_granularity;
1625 1583
1626 if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
1627 gran = adaptive_gran(curr, se);
1628
1629 /* 1584 /*
 1630 * Since it's curr running now, convert the gran from real-time 1585 * Since it's curr running now, convert the gran from real-time
 1631 * to virtual-time in its units. 1586 * to virtual-time in its units.
1587 *
1588 * By using 'se' instead of 'curr' we penalize light tasks, so
1589 * they get preempted easier. That is, if 'se' < 'curr' then
1590 * the resulting gran will be larger, therefore penalizing the
1591 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1592 * be smaller, again penalizing the lighter task.
1593 *
1594 * This is especially important for buddies when the leftmost
1595 * task is higher priority than the buddy.
1632 */ 1596 */
1633 if (sched_feat(ASYM_GRAN)) { 1597 if (unlikely(se->load.weight != NICE_0_LOAD))
1634 /* 1598 gran = calc_delta_fair(gran, se);
1635 * By using 'se' instead of 'curr' we penalize light tasks, so
1636 * they get preempted easier. That is, if 'se' < 'curr' then
1637 * the resulting gran will be larger, therefore penalizing the
1638 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1639 * be smaller, again penalizing the lighter task.
1640 *
1641 * This is especially important for buddies when the leftmost
1642 * task is higher priority than the buddy.
1643 */
1644 if (unlikely(se->load.weight != NICE_0_LOAD))
1645 gran = calc_delta_fair(gran, se);
1646 } else {
1647 if (unlikely(curr->load.weight != NICE_0_LOAD))
1648 gran = calc_delta_fair(gran, curr);
1649 }
1650 1599
1651 return gran; 1600 return gran;
1652} 1601}
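With adaptive_gran() and the ASYM_GRAN branch removed, wakeup_gran() above is reduced to a single scaling step: converting the granularity into the wakee's virtual time. A sketch of that conversion (plain C; calc_delta_fair() is approximated by a plain NICE_0_LOAD/weight ratio, ignoring its fixed-point inverse-weight rounding, and the weights in main() are nice 0 and roughly nice +5):

#include <stdio.h>

#define NICE_0_LOAD 1024UL

/*
 * Illustrative sketch, not kernel code: a weight below NICE_0_LOAD
 * inflates the granularity and a weight above it shrinks it, which is
 * the asymmetry the comment in the diff describes.
 */
static unsigned long wakeup_gran(unsigned long gran_ns, unsigned long se_weight)
{
        if (se_weight != NICE_0_LOAD)
                gran_ns = gran_ns * NICE_0_LOAD / se_weight;  /* ~calc_delta_fair() */
        return gran_ns;
}

int main(void)
{
        /* 1ms of granularity as seen by a nice-0 (1024) and a lighter (335) task */
        printf("%lu %lu\n", wakeup_gran(1000000, 1024), wakeup_gran(1000000, 335));
        return 0;
}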
@@ -1704,7 +1653,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1704 struct task_struct *curr = rq->curr; 1653 struct task_struct *curr = rq->curr;
1705 struct sched_entity *se = &curr->se, *pse = &p->se; 1654 struct sched_entity *se = &curr->se, *pse = &p->se;
1706 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1655 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1707 int sync = wake_flags & WF_SYNC;
1708 int scale = cfs_rq->nr_running >= sched_nr_latency; 1656 int scale = cfs_rq->nr_running >= sched_nr_latency;
1709 1657
1710 if (unlikely(rt_prio(p->prio))) 1658 if (unlikely(rt_prio(p->prio)))
@@ -1737,14 +1685,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1737 if (unlikely(curr->policy == SCHED_IDLE)) 1685 if (unlikely(curr->policy == SCHED_IDLE))
1738 goto preempt; 1686 goto preempt;
1739 1687
1740 if (sched_feat(WAKEUP_SYNC) && sync)
1741 goto preempt;
1742
1743 if (sched_feat(WAKEUP_OVERLAP) &&
1744 se->avg_overlap < sysctl_sched_migration_cost &&
1745 pse->avg_overlap < sysctl_sched_migration_cost)
1746 goto preempt;
1747
1748 if (!sched_feat(WAKEUP_PREEMPT)) 1688 if (!sched_feat(WAKEUP_PREEMPT))
1749 return; 1689 return;
1750 1690
@@ -1815,57 +1755,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1815 */ 1755 */
1816 1756
1817/* 1757/*
1818 * Load-balancing iterator. Note: while the runqueue stays locked 1758 * pull_task - move a task from a remote runqueue to the local runqueue.
1819 * during the whole iteration, the current task might be 1759 * Both runqueues must be locked.
1820 * dequeued so the iterator has to be dequeue-safe. Here we
1821 * achieve that by always pre-iterating before returning
1822 * the current task:
1823 */ 1760 */
1824static struct task_struct * 1761static void pull_task(struct rq *src_rq, struct task_struct *p,
1825__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) 1762 struct rq *this_rq, int this_cpu)
1826{ 1763{
1827 struct task_struct *p = NULL; 1764 deactivate_task(src_rq, p, 0);
1828 struct sched_entity *se; 1765 set_task_cpu(p, this_cpu);
1766 activate_task(this_rq, p, 0);
1767 check_preempt_curr(this_rq, p, 0);
1768}
1829 1769
1830 if (next == &cfs_rq->tasks) 1770/*
1831 return NULL; 1771 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1772 */
1773static
1774int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1775 struct sched_domain *sd, enum cpu_idle_type idle,
1776 int *all_pinned)
1777{
1778 int tsk_cache_hot = 0;
1779 /*
1780 * We do not migrate tasks that are:
1781 * 1) running (obviously), or
1782 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1783 * 3) are cache-hot on their current CPU.
1784 */
1785 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1786 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
1787 return 0;
1788 }
1789 *all_pinned = 0;
1832 1790
1833 se = list_entry(next, struct sched_entity, group_node); 1791 if (task_running(rq, p)) {
1834 p = task_of(se); 1792 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
1835 cfs_rq->balance_iterator = next->next; 1793 return 0;
1794 }
1836 1795
1837 return p; 1796 /*
1838} 1797 * Aggressive migration if:
1798 * 1) task is cache cold, or
1799 * 2) too many balance attempts have failed.
1800 */
1839 1801
1840static struct task_struct *load_balance_start_fair(void *arg) 1802 tsk_cache_hot = task_hot(p, rq->clock, sd);
1841{ 1803 if (!tsk_cache_hot ||
1842 struct cfs_rq *cfs_rq = arg; 1804 sd->nr_balance_failed > sd->cache_nice_tries) {
1805#ifdef CONFIG_SCHEDSTATS
1806 if (tsk_cache_hot) {
1807 schedstat_inc(sd, lb_hot_gained[idle]);
1808 schedstat_inc(p, se.statistics.nr_forced_migrations);
1809 }
1810#endif
1811 return 1;
1812 }
1843 1813
1844 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); 1814 if (tsk_cache_hot) {
1815 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
1816 return 0;
1817 }
1818 return 1;
1845} 1819}
1846 1820
1847static struct task_struct *load_balance_next_fair(void *arg) 1821/*
1822 * move_one_task tries to move exactly one task from busiest to this_rq, as
1823 * part of active balancing operations within "domain".
1824 * Returns 1 if successful and 0 otherwise.
1825 *
1826 * Called with both runqueues locked.
1827 */
1828static int
1829move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1830 struct sched_domain *sd, enum cpu_idle_type idle)
1848{ 1831{
1849 struct cfs_rq *cfs_rq = arg; 1832 struct task_struct *p, *n;
1833 struct cfs_rq *cfs_rq;
1834 int pinned = 0;
1850 1835
1851 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1836 for_each_leaf_cfs_rq(busiest, cfs_rq) {
1837 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
1838
1839 if (!can_migrate_task(p, busiest, this_cpu,
1840 sd, idle, &pinned))
1841 continue;
1842
1843 pull_task(busiest, p, this_rq, this_cpu);
1844 /*
1845 * Right now, this is only the second place pull_task()
1846 * is called, so we can safely collect pull_task()
1847 * stats here rather than inside pull_task().
1848 */
1849 schedstat_inc(sd, lb_gained[idle]);
1850 return 1;
1851 }
1852 }
1853
1854 return 0;
1852} 1855}
1853 1856
1854static unsigned long 1857static unsigned long
1855__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1858balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1856 unsigned long max_load_move, struct sched_domain *sd, 1859 unsigned long max_load_move, struct sched_domain *sd,
1857 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, 1860 enum cpu_idle_type idle, int *all_pinned,
1858 struct cfs_rq *cfs_rq) 1861 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
1859{ 1862{
1860 struct rq_iterator cfs_rq_iterator; 1863 int loops = 0, pulled = 0, pinned = 0;
1864 long rem_load_move = max_load_move;
1865 struct task_struct *p, *n;
1861 1866
1862 cfs_rq_iterator.start = load_balance_start_fair; 1867 if (max_load_move == 0)
1863 cfs_rq_iterator.next = load_balance_next_fair; 1868 goto out;
1864 cfs_rq_iterator.arg = cfs_rq;
1865 1869
1866 return balance_tasks(this_rq, this_cpu, busiest, 1870 pinned = 1;
1867 max_load_move, sd, idle, all_pinned, 1871
1868 this_best_prio, &cfs_rq_iterator); 1872 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1873 if (loops++ > sysctl_sched_nr_migrate)
1874 break;
1875
1876 if ((p->se.load.weight >> 1) > rem_load_move ||
1877 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
1878 continue;
1879
1880 pull_task(busiest, p, this_rq, this_cpu);
1881 pulled++;
1882 rem_load_move -= p->se.load.weight;
1883
1884#ifdef CONFIG_PREEMPT
1885 /*
1886 * NEWIDLE balancing is a source of latency, so preemptible
1887 * kernels will stop after the first task is pulled to minimize
1888 * the critical section.
1889 */
1890 if (idle == CPU_NEWLY_IDLE)
1891 break;
1892#endif
1893
1894 /*
1895 * We only want to steal up to the prescribed amount of
1896 * weighted load.
1897 */
1898 if (rem_load_move <= 0)
1899 break;
1900
1901 if (p->prio < *this_best_prio)
1902 *this_best_prio = p->prio;
1903 }
1904out:
1905 /*
1906 * Right now, this is one of only two places pull_task() is called,
1907 * so we can safely collect pull_task() stats here rather than
1908 * inside pull_task().
1909 */
1910 schedstat_add(sd, lb_gained[idle], pulled);
1911
1912 if (all_pinned)
1913 *all_pinned = pinned;
1914
1915 return max_load_move - rem_load_move;
1869} 1916}
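The old iterator-based __load_balance_fair() is replaced above by balance_tasks() walking the cfs_rq task list directly under a weighted-load budget. A standalone sketch of that budget loop (plain C; tasks are reduced to a weight plus a hypothetical "migratable" flag standing in for can_migrate_task(), and the scan limit is passed as a parameter instead of sysctl_sched_nr_migrate):

#include <stdbool.h>
#include <stdio.h>

struct cand { unsigned long weight; bool migratable; };

/* Illustrative sketch, not kernel code: returns the weighted load "pulled". */
static unsigned long balance_tasks(const struct cand *tasks, int nr,
                                   unsigned long max_load_move,
                                   int nr_migrate_limit)
{
        long rem = (long)max_load_move;
        int loops = 0;

        if (!max_load_move)
                return 0;

        for (int i = 0; i < nr && loops++ <= nr_migrate_limit; i++) {
                /* too heavy for what is left, or pinned: skip it */
                if ((long)(tasks[i].weight >> 1) > rem || !tasks[i].migratable)
                        continue;

                rem -= (long)tasks[i].weight;   /* "pull" the task */
                if (rem <= 0)
                        break;
        }

        return (unsigned long)((long)max_load_move - rem);
}

int main(void)
{
        struct cand tasks[] = {
                { 3072, true  },        /* too heavy: 3072/2 > 1024 budget */
                { 1024, false },        /* pinned                          */
                { 1024, true  },        /* pulled, budget exhausted        */
        };
        printf("%lu\n", balance_tasks(tasks, 3, 1024, 32));
        return 0;
}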
1870 1917
1871#ifdef CONFIG_FAIR_GROUP_SCHED 1918#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1897,9 +1944,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1897 rem_load = (u64)rem_load_move * busiest_weight; 1944 rem_load = (u64)rem_load_move * busiest_weight;
1898 rem_load = div_u64(rem_load, busiest_h_load + 1); 1945 rem_load = div_u64(rem_load, busiest_h_load + 1);
1899 1946
1900 moved_load = __load_balance_fair(this_rq, this_cpu, busiest, 1947 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1901 rem_load, sd, idle, all_pinned, this_best_prio, 1948 rem_load, sd, idle, all_pinned, this_best_prio,
1902 tg->cfs_rq[busiest_cpu]); 1949 busiest_cfs_rq);
1903 1950
1904 if (!moved_load) 1951 if (!moved_load)
1905 continue; 1952 continue;
@@ -1922,35 +1969,1528 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1922 struct sched_domain *sd, enum cpu_idle_type idle, 1969 struct sched_domain *sd, enum cpu_idle_type idle,
1923 int *all_pinned, int *this_best_prio) 1970 int *all_pinned, int *this_best_prio)
1924{ 1971{
1925 return __load_balance_fair(this_rq, this_cpu, busiest, 1972 return balance_tasks(this_rq, this_cpu, busiest,
1926 max_load_move, sd, idle, all_pinned, 1973 max_load_move, sd, idle, all_pinned,
1927 this_best_prio, &busiest->cfs); 1974 this_best_prio, &busiest->cfs);
1928} 1975}
1929#endif 1976#endif
1930 1977
1931static int 1978/*
1932move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1979 * move_tasks tries to move up to max_load_move weighted load from busiest to
1933 struct sched_domain *sd, enum cpu_idle_type idle) 1980 * this_rq, as part of a balancing operation within domain "sd".
1981 * Returns 1 if successful and 0 otherwise.
1982 *
1983 * Called with both runqueues locked.
1984 */
1985static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1986 unsigned long max_load_move,
1987 struct sched_domain *sd, enum cpu_idle_type idle,
1988 int *all_pinned)
1989{
1990 unsigned long total_load_moved = 0, load_moved;
1991 int this_best_prio = this_rq->curr->prio;
1992
1993 do {
1994 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
1995 max_load_move - total_load_moved,
1996 sd, idle, all_pinned, &this_best_prio);
1997
1998 total_load_moved += load_moved;
1999
2000#ifdef CONFIG_PREEMPT
2001 /*
2002 * NEWIDLE balancing is a source of latency, so preemptible
2003 * kernels will stop after the first task is pulled to minimize
2004 * the critical section.
2005 */
2006 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
2007 break;
2008
2009 if (raw_spin_is_contended(&this_rq->lock) ||
2010 raw_spin_is_contended(&busiest->lock))
2011 break;
2012#endif
2013 } while (load_moved && max_load_move > total_load_moved);
2014
2015 return total_load_moved > 0;
2016}
2017
2018/********** Helpers for find_busiest_group ************************/
2019/*
2020 * sd_lb_stats - Structure to store the statistics of a sched_domain
2021 * during load balancing.
2022 */
2023struct sd_lb_stats {
2024 struct sched_group *busiest; /* Busiest group in this sd */
2025 struct sched_group *this; /* Local group in this sd */
2026 unsigned long total_load; /* Total load of all groups in sd */
2027 unsigned long total_pwr; /* Total power of all groups in sd */
2028 unsigned long avg_load; /* Average load across all groups in sd */
2029
2030 /** Statistics of this group */
2031 unsigned long this_load;
2032 unsigned long this_load_per_task;
2033 unsigned long this_nr_running;
2034
2035 /* Statistics of the busiest group */
2036 unsigned long max_load;
2037 unsigned long busiest_load_per_task;
2038 unsigned long busiest_nr_running;
2039 unsigned long busiest_group_capacity;
2040
2041 int group_imb; /* Is there imbalance in this sd */
2042#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2043 int power_savings_balance; /* Is powersave balance needed for this sd */
2044 struct sched_group *group_min; /* Least loaded group in sd */
2045 struct sched_group *group_leader; /* Group which relieves group_min */
2046 unsigned long min_load_per_task; /* load_per_task in group_min */
2047 unsigned long leader_nr_running; /* Nr running of group_leader */
2048 unsigned long min_nr_running; /* Nr running of group_min */
2049#endif
2050};
2051
2052/*
2053 * sg_lb_stats - stats of a sched_group required for load_balancing
2054 */
2055struct sg_lb_stats {
2056 unsigned long avg_load; /*Avg load across the CPUs of the group */
2057 unsigned long group_load; /* Total load over the CPUs of the group */
2058 unsigned long sum_nr_running; /* Nr tasks running in the group */
2059 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2060 unsigned long group_capacity;
2061 int group_imb; /* Is there an imbalance in the group ? */
2062};
2063
2064/**
2065 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
2066 * @group: The group whose first cpu is to be returned.
2067 */
2068static inline unsigned int group_first_cpu(struct sched_group *group)
2069{
2070 return cpumask_first(sched_group_cpus(group));
2071}
2072
2073/**
2074 * get_sd_load_idx - Obtain the load index for a given sched domain.
2075 * @sd: The sched_domain whose load_idx is to be obtained.
 2076 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
2077 */
2078static inline int get_sd_load_idx(struct sched_domain *sd,
2079 enum cpu_idle_type idle)
2080{
2081 int load_idx;
2082
2083 switch (idle) {
2084 case CPU_NOT_IDLE:
2085 load_idx = sd->busy_idx;
2086 break;
2087
2088 case CPU_NEWLY_IDLE:
2089 load_idx = sd->newidle_idx;
2090 break;
2091 default:
2092 load_idx = sd->idle_idx;
2093 break;
2094 }
2095
2096 return load_idx;
2097}
2098
2099
2100#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2101/**
2102 * init_sd_power_savings_stats - Initialize power savings statistics for
2103 * the given sched_domain, during load balancing.
2104 *
2105 * @sd: Sched domain whose power-savings statistics are to be initialized.
2106 * @sds: Variable containing the statistics for sd.
2107 * @idle: Idle status of the CPU at which we're performing load-balancing.
2108 */
2109static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2110 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2111{
2112 /*
2113 * Busy processors will not participate in power savings
2114 * balance.
2115 */
2116 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2117 sds->power_savings_balance = 0;
2118 else {
2119 sds->power_savings_balance = 1;
2120 sds->min_nr_running = ULONG_MAX;
2121 sds->leader_nr_running = 0;
2122 }
2123}
2124
2125/**
2126 * update_sd_power_savings_stats - Update the power saving stats for a
2127 * sched_domain while performing load balancing.
2128 *
2129 * @group: sched_group belonging to the sched_domain under consideration.
2130 * @sds: Variable containing the statistics of the sched_domain
2131 * @local_group: Does group contain the CPU for which we're performing
2132 * load balancing ?
2133 * @sgs: Variable containing the statistics of the group.
2134 */
2135static inline void update_sd_power_savings_stats(struct sched_group *group,
2136 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2137{
2138
2139 if (!sds->power_savings_balance)
2140 return;
2141
2142 /*
2143 * If the local group is idle or completely loaded
2144 * no need to do power savings balance at this domain
2145 */
2146 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2147 !sds->this_nr_running))
2148 sds->power_savings_balance = 0;
2149
2150 /*
2151 * If a group is already running at full capacity or idle,
2152 * don't include that group in power savings calculations
2153 */
2154 if (!sds->power_savings_balance ||
2155 sgs->sum_nr_running >= sgs->group_capacity ||
2156 !sgs->sum_nr_running)
2157 return;
2158
2159 /*
2160 * Calculate the group which has the least non-idle load.
2161 * This is the group from where we need to pick up the load
2162 * for saving power
2163 */
2164 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2165 (sgs->sum_nr_running == sds->min_nr_running &&
2166 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2167 sds->group_min = group;
2168 sds->min_nr_running = sgs->sum_nr_running;
2169 sds->min_load_per_task = sgs->sum_weighted_load /
2170 sgs->sum_nr_running;
2171 }
2172
2173 /*
2174 * Calculate the group which is almost near its
2175 * capacity but still has some space to pick up some load
2176 * from other group and save more power
2177 */
2178 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2179 return;
2180
2181 if (sgs->sum_nr_running > sds->leader_nr_running ||
2182 (sgs->sum_nr_running == sds->leader_nr_running &&
2183 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2184 sds->group_leader = group;
2185 sds->leader_nr_running = sgs->sum_nr_running;
2186 }
2187}
2188
2189/**
2190 * check_power_save_busiest_group - see if there is potential for some power-savings balance
2191 * @sds: Variable containing the statistics of the sched_domain
2192 * under consideration.
2193 * @this_cpu: Cpu at which we're currently performing load-balancing.
2194 * @imbalance: Variable to store the imbalance.
2195 *
2196 * Description:
2197 * Check if we have potential to perform some power-savings balance.
2198 * If yes, set the busiest group to be the least loaded group in the
 2199 * sched_domain, so that its CPUs can be put to idle.
2200 *
2201 * Returns 1 if there is potential to perform power-savings balance.
2202 * Else returns 0.
2203 */
2204static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2205 int this_cpu, unsigned long *imbalance)
2206{
2207 if (!sds->power_savings_balance)
2208 return 0;
2209
2210 if (sds->this != sds->group_leader ||
2211 sds->group_leader == sds->group_min)
2212 return 0;
2213
2214 *imbalance = sds->min_load_per_task;
2215 sds->busiest = sds->group_min;
2216
2217 return 1;
2218
2219}
2220#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2221static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2222 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2223{
2224 return;
2225}
2226
2227static inline void update_sd_power_savings_stats(struct sched_group *group,
2228 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2229{
2230 return;
2231}
2232
2233static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2234 int this_cpu, unsigned long *imbalance)
2235{
2236 return 0;
2237}
2238#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2239
2240
2241unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2242{
2243 return SCHED_LOAD_SCALE;
2244}
2245
2246unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2247{
2248 return default_scale_freq_power(sd, cpu);
2249}
2250
2251unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2252{
2253 unsigned long weight = sd->span_weight;
2254 unsigned long smt_gain = sd->smt_gain;
2255
2256 smt_gain /= weight;
2257
2258 return smt_gain;
2259}
2260
2261unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2262{
2263 return default_scale_smt_power(sd, cpu);
2264}
2265
2266unsigned long scale_rt_power(int cpu)
2267{
2268 struct rq *rq = cpu_rq(cpu);
2269 u64 total, available;
2270
2271 sched_avg_update(rq);
2272
2273 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2274 available = total - rq->rt_avg;
2275
2276 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2277 total = SCHED_LOAD_SCALE;
2278
2279 total >>= SCHED_LOAD_SHIFT;
2280
2281 return div_u64(available, total);
2282}
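scale_rt_power() above reports, in SCHED_LOAD_SCALE units, how much of the averaging period is left over after real-time activity. The same arithmetic in isolation (plain C; the nanosecond figures in main() are invented):

#include <stdint.h>
#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

/*
 * Illustrative sketch, not kernel code: a CPU that spent a quarter of the
 * period on RT work reports roughly 768/1024 of its power for CFS.
 */
static uint64_t scale_rt_power(uint64_t period_ns, uint64_t rt_avg_ns)
{
        uint64_t total = period_ns;
        uint64_t available = total - rt_avg_ns;

        if (total < SCHED_LOAD_SCALE)   /* guard against a tiny total */
                total = SCHED_LOAD_SCALE;
        total >>= SCHED_LOAD_SHIFT;

        return available / total;       /* ~SCHED_LOAD_SCALE * available/total */
}

int main(void)
{
        /* a 1s period with 250ms of RT time */
        printf("%llu\n", (unsigned long long)scale_rt_power(1000000000ULL, 250000000ULL));
        return 0;
}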
2283
2284static void update_cpu_power(struct sched_domain *sd, int cpu)
2285{
2286 unsigned long weight = sd->span_weight;
2287 unsigned long power = SCHED_LOAD_SCALE;
2288 struct sched_group *sdg = sd->groups;
2289
2290 if (sched_feat(ARCH_POWER))
2291 power *= arch_scale_freq_power(sd, cpu);
2292 else
2293 power *= default_scale_freq_power(sd, cpu);
2294
2295 power >>= SCHED_LOAD_SHIFT;
2296
2297 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2298 if (sched_feat(ARCH_POWER))
2299 power *= arch_scale_smt_power(sd, cpu);
2300 else
2301 power *= default_scale_smt_power(sd, cpu);
2302
2303 power >>= SCHED_LOAD_SHIFT;
2304 }
2305
2306 power *= scale_rt_power(cpu);
2307 power >>= SCHED_LOAD_SHIFT;
2308
2309 if (!power)
2310 power = 1;
2311
2312 cpu_rq(cpu)->cpu_power = power;
2313 sdg->cpu_power = power;
2314}
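update_cpu_power() chains several fixed-point factors, each expressed against SCHED_LOAD_SCALE and shifted back down after every multiplication, so the result stays on the 1024 scale. A sketch of just that composition (plain C; the three factor values in main() are invented stand-ins for the frequency, SMT and RT scalings):

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

/* Illustrative sketch, not kernel code: compose fixed-point scale factors. */
static unsigned long compose_cpu_power(unsigned long freq_factor,
                                       unsigned long smt_factor,
                                       unsigned long rt_factor)
{
        unsigned long power = SCHED_LOAD_SCALE;

        power = (power * freq_factor) >> SCHED_LOAD_SHIFT;
        power = (power * smt_factor)  >> SCHED_LOAD_SHIFT;
        power = (power * rt_factor)   >> SCHED_LOAD_SHIFT;

        return power ? power : 1;       /* never report zero power */
}

int main(void)
{
        /* full frequency, an SMT discount, and ~10% of time lost to RT */
        printf("%lu\n", compose_cpu_power(1024, 589, 922));
        return 0;
}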
2315
2316static void update_group_power(struct sched_domain *sd, int cpu)
2317{
2318 struct sched_domain *child = sd->child;
2319 struct sched_group *group, *sdg = sd->groups;
2320 unsigned long power;
2321
2322 if (!child) {
2323 update_cpu_power(sd, cpu);
2324 return;
2325 }
2326
2327 power = 0;
2328
2329 group = child->groups;
2330 do {
2331 power += group->cpu_power;
2332 group = group->next;
2333 } while (group != child->groups);
2334
2335 sdg->cpu_power = power;
2336}
2337
2338/**
2339 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2340 * @sd: The sched_domain whose statistics are to be updated.
2341 * @group: sched_group whose statistics are to be updated.
2342 * @this_cpu: Cpu for which load balance is currently performed.
2343 * @idle: Idle status of this_cpu
2344 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2345 * @sd_idle: Idle status of the sched_domain containing group.
2346 * @local_group: Does group contain this_cpu.
2347 * @cpus: Set of cpus considered for load balancing.
2348 * @balance: Should we balance.
2349 * @sgs: variable to hold the statistics for this group.
2350 */
2351static inline void update_sg_lb_stats(struct sched_domain *sd,
2352 struct sched_group *group, int this_cpu,
2353 enum cpu_idle_type idle, int load_idx, int *sd_idle,
2354 int local_group, const struct cpumask *cpus,
2355 int *balance, struct sg_lb_stats *sgs)
2356{
2357 unsigned long load, max_cpu_load, min_cpu_load;
2358 int i;
2359 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2360 unsigned long avg_load_per_task = 0;
2361
2362 if (local_group)
2363 balance_cpu = group_first_cpu(group);
2364
2365 /* Tally up the load of all CPUs in the group */
2366 max_cpu_load = 0;
2367 min_cpu_load = ~0UL;
2368
2369 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2370 struct rq *rq = cpu_rq(i);
2371
2372 if (*sd_idle && rq->nr_running)
2373 *sd_idle = 0;
2374
2375 /* Bias balancing toward cpus of our domain */
2376 if (local_group) {
2377 if (idle_cpu(i) && !first_idle_cpu) {
2378 first_idle_cpu = 1;
2379 balance_cpu = i;
2380 }
2381
2382 load = target_load(i, load_idx);
2383 } else {
2384 load = source_load(i, load_idx);
2385 if (load > max_cpu_load)
2386 max_cpu_load = load;
2387 if (min_cpu_load > load)
2388 min_cpu_load = load;
2389 }
2390
2391 sgs->group_load += load;
2392 sgs->sum_nr_running += rq->nr_running;
2393 sgs->sum_weighted_load += weighted_cpuload(i);
2394
2395 }
2396
2397 /*
2398 * First idle cpu or the first cpu(busiest) in this sched group
2399 * is eligible for doing load balancing at this and above
 2400 * domains. In the newly idle case, we will allow all the cpus
2401 * to do the newly idle load balance.
2402 */
2403 if (idle != CPU_NEWLY_IDLE && local_group &&
2404 balance_cpu != this_cpu) {
2405 *balance = 0;
2406 return;
2407 }
2408
2409 update_group_power(sd, this_cpu);
2410
2411 /* Adjust by relative CPU power of the group */
2412 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2413
2414 /*
2415 * Consider the group unbalanced when the imbalance is larger
2416 * than the average weight of two tasks.
2417 *
2418 * APZ: with cgroup the avg task weight can vary wildly and
2419 * might not be a suitable number - should we keep a
2420 * normalized nr_running number somewhere that negates
2421 * the hierarchy?
2422 */
2423 if (sgs->sum_nr_running)
2424 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2425
2426 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
2427 sgs->group_imb = 1;
2428
2429 sgs->group_capacity =
2430 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2431}
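The tail of update_sg_lb_stats() derives three values from the raw per-cpu sums: the power-adjusted average load, the group-imbalance heuristic (spread between the most and least loaded CPU larger than twice the average task weight), and the capacity in whole CPUs. A sketch of those formulas (plain C; the inputs in main() are invented):

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

struct sg_stats {
        unsigned long avg_load;
        unsigned long group_capacity;
        int group_imb;
};

/* Illustrative sketch, not kernel code. */
static struct sg_stats sg_stats(unsigned long group_load,
                                unsigned long group_power,
                                unsigned long max_cpu_load,
                                unsigned long min_cpu_load,
                                unsigned long sum_weighted_load,
                                unsigned long sum_nr_running)
{
        struct sg_stats s = { 0, 0, 0 };
        unsigned long avg_task = 0;

        /* adjust by the relative power of the group */
        s.avg_load = group_load * SCHED_LOAD_SCALE / group_power;

        if (sum_nr_running)
                avg_task = sum_weighted_load / sum_nr_running;
        if (max_cpu_load - min_cpu_load > 2 * avg_task)
                s.group_imb = 1;

        s.group_capacity = DIV_ROUND_CLOSEST(group_power, SCHED_LOAD_SCALE);
        return s;
}

int main(void)
{
        /* two full-power CPUs, three nice-0 tasks, one CPU holding all of them */
        struct sg_stats s = sg_stats(3072, 2048, 3072, 0, 3072, 3);
        printf("avg=%lu cap=%lu imb=%d\n", s.avg_load, s.group_capacity, s.group_imb);
        return 0;
}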
2432
2433/**
 2434 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
2435 * @sd: sched_domain whose statistics are to be updated.
2436 * @this_cpu: Cpu for which load balance is currently performed.
2437 * @idle: Idle status of this_cpu
2438 * @sd_idle: Idle status of the sched_domain containing group.
2439 * @cpus: Set of cpus considered for load balancing.
2440 * @balance: Should we balance.
2441 * @sds: variable to hold the statistics for this sched_domain.
2442 */
2443static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2444 enum cpu_idle_type idle, int *sd_idle,
2445 const struct cpumask *cpus, int *balance,
2446 struct sd_lb_stats *sds)
2447{
2448 struct sched_domain *child = sd->child;
2449 struct sched_group *group = sd->groups;
2450 struct sg_lb_stats sgs;
2451 int load_idx, prefer_sibling = 0;
2452
2453 if (child && child->flags & SD_PREFER_SIBLING)
2454 prefer_sibling = 1;
2455
2456 init_sd_power_savings_stats(sd, sds, idle);
2457 load_idx = get_sd_load_idx(sd, idle);
2458
2459 do {
2460 int local_group;
2461
2462 local_group = cpumask_test_cpu(this_cpu,
2463 sched_group_cpus(group));
2464 memset(&sgs, 0, sizeof(sgs));
2465 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
2466 local_group, cpus, balance, &sgs);
2467
2468 if (local_group && !(*balance))
2469 return;
2470
2471 sds->total_load += sgs.group_load;
2472 sds->total_pwr += group->cpu_power;
2473
2474 /*
2475 * In case the child domain prefers tasks go to siblings
2476 * first, lower the group capacity to one so that we'll try
2477 * and move all the excess tasks away.
2478 */
2479 if (prefer_sibling)
2480 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2481
2482 if (local_group) {
2483 sds->this_load = sgs.avg_load;
2484 sds->this = group;
2485 sds->this_nr_running = sgs.sum_nr_running;
2486 sds->this_load_per_task = sgs.sum_weighted_load;
2487 } else if (sgs.avg_load > sds->max_load &&
2488 (sgs.sum_nr_running > sgs.group_capacity ||
2489 sgs.group_imb)) {
2490 sds->max_load = sgs.avg_load;
2491 sds->busiest = group;
2492 sds->busiest_nr_running = sgs.sum_nr_running;
2493 sds->busiest_group_capacity = sgs.group_capacity;
2494 sds->busiest_load_per_task = sgs.sum_weighted_load;
2495 sds->group_imb = sgs.group_imb;
2496 }
2497
2498 update_sd_power_savings_stats(group, sds, local_group, &sgs);
2499 group = group->next;
2500 } while (group != sd->groups);
2501}
2502
2503/**
2504 * fix_small_imbalance - Calculate the minor imbalance that exists
2505 * amongst the groups of a sched_domain, during
2506 * load balancing.
2507 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
2508 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2509 * @imbalance: Variable to store the imbalance.
2510 */
2511static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2512 int this_cpu, unsigned long *imbalance)
2513{
2514 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2515 unsigned int imbn = 2;
2516 unsigned long scaled_busy_load_per_task;
2517
2518 if (sds->this_nr_running) {
2519 sds->this_load_per_task /= sds->this_nr_running;
2520 if (sds->busiest_load_per_task >
2521 sds->this_load_per_task)
2522 imbn = 1;
2523 } else
2524 sds->this_load_per_task =
2525 cpu_avg_load_per_task(this_cpu);
2526
2527 scaled_busy_load_per_task = sds->busiest_load_per_task
2528 * SCHED_LOAD_SCALE;
2529 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2530
2531 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2532 (scaled_busy_load_per_task * imbn)) {
2533 *imbalance = sds->busiest_load_per_task;
2534 return;
2535 }
2536
2537 /*
2538 * OK, we don't have enough imbalance to justify moving tasks,
2539 * however we may be able to increase total CPU power used by
2540 * moving them.
2541 */
2542
2543 pwr_now += sds->busiest->cpu_power *
2544 min(sds->busiest_load_per_task, sds->max_load);
2545 pwr_now += sds->this->cpu_power *
2546 min(sds->this_load_per_task, sds->this_load);
2547 pwr_now /= SCHED_LOAD_SCALE;
2548
2549 /* Amount of load we'd subtract */
2550 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2551 sds->busiest->cpu_power;
2552 if (sds->max_load > tmp)
2553 pwr_move += sds->busiest->cpu_power *
2554 min(sds->busiest_load_per_task, sds->max_load - tmp);
2555
2556 /* Amount of load we'd add */
2557 if (sds->max_load * sds->busiest->cpu_power <
2558 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2559 tmp = (sds->max_load * sds->busiest->cpu_power) /
2560 sds->this->cpu_power;
2561 else
2562 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2563 sds->this->cpu_power;
2564 pwr_move += sds->this->cpu_power *
2565 min(sds->this_load_per_task, sds->this_load + tmp);
2566 pwr_move /= SCHED_LOAD_SCALE;
2567
2568 /* Move if we gain throughput */
2569 if (pwr_move > pwr_now)
2570 *imbalance = sds->busiest_load_per_task;
2571}
2572
2573/**
2574 * calculate_imbalance - Calculate the amount of imbalance present within the
2575 * groups of a given sched_domain during load balance.
2576 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
2577 * @this_cpu: Cpu for which currently load balance is being performed.
2578 * @imbalance: The variable to store the imbalance.
2579 */
2580static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2581 unsigned long *imbalance)
2582{
2583 unsigned long max_pull, load_above_capacity = ~0UL;
2584
2585 sds->busiest_load_per_task /= sds->busiest_nr_running;
2586 if (sds->group_imb) {
2587 sds->busiest_load_per_task =
2588 min(sds->busiest_load_per_task, sds->avg_load);
2589 }
2590
2591 /*
2592 * In the presence of smp nice balancing, certain scenarios can have
2593 * max load less than avg load(as we skip the groups at or below
2594 * its cpu_power, while calculating max_load..)
2595 */
2596 if (sds->max_load < sds->avg_load) {
2597 *imbalance = 0;
2598 return fix_small_imbalance(sds, this_cpu, imbalance);
2599 }
2600
2601 if (!sds->group_imb) {
2602 /*
2603 * Don't want to pull so many tasks that a group would go idle.
2604 */
2605 load_above_capacity = (sds->busiest_nr_running -
2606 sds->busiest_group_capacity);
2607
2608 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
2609
2610 load_above_capacity /= sds->busiest->cpu_power;
2611 }
2612
2613 /*
2614 * We're trying to get all the cpus to the average_load, so we don't
2615 * want to push ourselves above the average load, nor do we wish to
2616 * reduce the max loaded cpu below the average load. At the same time,
2617 * we also don't want to reduce the group load below the group capacity
2618 * (so that we can implement power-savings policies etc). Thus we look
2619 * for the minimum possible imbalance.
2620 * Be careful of negative numbers as they'll appear as very large values
2621 * with unsigned longs.
2622 */
2623 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
2624
2625 /* How much load to actually move to equalise the imbalance */
2626 *imbalance = min(max_pull * sds->busiest->cpu_power,
2627 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
2628 / SCHED_LOAD_SCALE;
2629
2630 /*
2631 * if *imbalance is less than the average load per runnable task
 2632 * there is no guarantee that any tasks will be moved so we'll have
2633 * a think about bumping its value to force at least one task to be
2634 * moved
2635 */
2636 if (*imbalance < sds->busiest_load_per_task)
2637 return fix_small_imbalance(sds, this_cpu, imbalance);
2638
2639}
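Ignoring the group_imb and fix_small_imbalance() special cases, the core of calculate_imbalance() reduces to two clamps: pull no more than the busiest group's excess over the average, and no more than what it carries above its capacity, then convert back into weighted load via each group's cpu_power. A sketch (plain C; loads and the whole-CPU capacity in main() are made up):

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/* Illustrative sketch, not kernel code. */
static unsigned long calc_imbalance(unsigned long max_load, unsigned long avg_load,
                                    unsigned long this_load,
                                    unsigned long busiest_nr_running,
                                    unsigned long busiest_capacity,
                                    unsigned long busiest_power,
                                    unsigned long this_power)
{
        unsigned long load_above_capacity, max_pull;

        /* don't pull so many tasks that the busiest group would go idle */
        load_above_capacity  = busiest_nr_running - busiest_capacity;
        load_above_capacity *= SCHED_LOAD_SCALE * SCHED_LOAD_SCALE;
        load_above_capacity /= busiest_power;

        max_pull = min_ul(max_load - avg_load, load_above_capacity);

        /* ...and don't push this group above the average either */
        return min_ul(max_pull * busiest_power,
                      (avg_load - this_load) * this_power) / SCHED_LOAD_SCALE;
}

int main(void)
{
        /* busiest group at 1536, average 1024, local group at 512 */
        printf("%lu\n", calc_imbalance(1536, 1024, 512, 3, 1, 1024, 1024));
        return 0;
}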
2640/******* find_busiest_group() helpers end here *********************/
2641
2642/**
2643 * find_busiest_group - Returns the busiest group within the sched_domain
2644 * if there is an imbalance. If there isn't an imbalance, and
2645 * the user has opted for power-savings, it returns a group whose
2646 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
2647 * such a group exists.
2648 *
2649 * Also calculates the amount of weighted load which should be moved
2650 * to restore balance.
2651 *
2652 * @sd: The sched_domain whose busiest group is to be returned.
2653 * @this_cpu: The cpu for which load balancing is currently being performed.
2654 * @imbalance: Variable which stores amount of weighted load which should
2655 * be moved to restore balance/put a group to idle.
2656 * @idle: The idle status of this_cpu.
2657 * @sd_idle: The idleness of sd
2658 * @cpus: The set of CPUs under consideration for load-balancing.
2659 * @balance: Pointer to a variable indicating if this_cpu
2660 * is the appropriate cpu to perform load balancing at this_level.
2661 *
2662 * Returns: - the busiest group if imbalance exists.
2663 * - If no imbalance and user has opted for power-savings balance,
2664 * return the least loaded group whose CPUs can be
2665 * put to idle by rebalancing its tasks onto our group.
2666 */
2667static struct sched_group *
2668find_busiest_group(struct sched_domain *sd, int this_cpu,
2669 unsigned long *imbalance, enum cpu_idle_type idle,
2670 int *sd_idle, const struct cpumask *cpus, int *balance)
2671{
2672 struct sd_lb_stats sds;
2673
2674 memset(&sds, 0, sizeof(sds));
2675
2676 /*
 2677 * Compute the various statistics relevant for load balancing at
2678 * this level.
2679 */
2680 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
2681 balance, &sds);
2682
2683 /* Cases where imbalance does not exist from POV of this_cpu */
2684 /* 1) this_cpu is not the appropriate cpu to perform load balancing
2685 * at this level.
2686 * 2) There is no busy sibling group to pull from.
2687 * 3) This group is the busiest group.
 2688 * 4) This group is busier than the avg busyness at this
2689 * sched_domain.
2690 * 5) The imbalance is within the specified limit.
2691 */
2692 if (!(*balance))
2693 goto ret;
2694
2695 if (!sds.busiest || sds.busiest_nr_running == 0)
2696 goto out_balanced;
2697
2698 if (sds.this_load >= sds.max_load)
2699 goto out_balanced;
2700
2701 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
2702
2703 if (sds.this_load >= sds.avg_load)
2704 goto out_balanced;
2705
2706 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2707 goto out_balanced;
2708
2709 /* Looks like there is an imbalance. Compute it */
2710 calculate_imbalance(&sds, this_cpu, imbalance);
2711 return sds.busiest;
2712
2713out_balanced:
2714 /*
2715 * There is no obvious imbalance. But check if we can do some balancing
2716 * to save power.
2717 */
2718 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
2719 return sds.busiest;
2720ret:
2721 *imbalance = 0;
2722 return NULL;
2723}
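The early-exit checks in find_busiest_group() above can be read as one predicate over the domain-wide statistics. A sketch of it (plain C; the balance-cpu check and the power-savings fallback are left out, and all loads are the power-scaled averages produced by update_sd_lb_stats(), with invented numbers in main()):

#include <stdbool.h>
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

/* Illustrative sketch, not kernel code. */
static bool imbalance_exists(unsigned long this_load, unsigned long max_load,
                             unsigned long total_load, unsigned long total_pwr,
                             unsigned long busiest_nr_running,
                             unsigned int imbalance_pct)
{
        unsigned long avg_load;

        if (!busiest_nr_running)                /* no busy sibling group */
                return false;
        if (this_load >= max_load)              /* we are the busiest group */
                return false;

        avg_load = SCHED_LOAD_SCALE * total_load / total_pwr;
        if (this_load >= avg_load)              /* already above the average */
                return false;

        /* the busiest group must exceed us by more than imbalance_pct */
        return 100 * max_load > imbalance_pct * this_load;
}

int main(void)
{
        /* local group runs 512, the busiest group runs 1536 */
        printf("%d\n", imbalance_exists(512, 1536, 2048, 2048, 3, 125));
        return 0;
}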
2724
2725/*
2726 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2727 */
2728static struct rq *
2729find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2730 unsigned long imbalance, const struct cpumask *cpus)
2731{
2732 struct rq *busiest = NULL, *rq;
2733 unsigned long max_load = 0;
2734 int i;
2735
2736 for_each_cpu(i, sched_group_cpus(group)) {
2737 unsigned long power = power_of(i);
2738 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2739 unsigned long wl;
2740
2741 if (!cpumask_test_cpu(i, cpus))
2742 continue;
2743
2744 rq = cpu_rq(i);
2745 wl = weighted_cpuload(i);
2746
2747 /*
2748 * When comparing with imbalance, use weighted_cpuload()
2749 * which is not scaled with the cpu power.
2750 */
2751 if (capacity && rq->nr_running == 1 && wl > imbalance)
2752 continue;
2753
2754 /*
2755 * For the load comparisons with the other cpu's, consider
2756 * the weighted_cpuload() scaled with the cpu power, so that
2757 * the load can be moved away from the cpu that is potentially
2758 * running at a lower capacity.
2759 */
2760 wl = (wl * SCHED_LOAD_SCALE) / power;
2761
2762 if (wl > max_load) {
2763 max_load = wl;
2764 busiest = rq;
2765 }
2766 }
2767
2768 return busiest;
2769}
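find_busiest_queue() compares the raw weighted load against the imbalance (to skip single-task CPUs whose one task already exceeds it) but picks the busiest runqueue on power-scaled load, so load is preferentially pulled off weaker CPUs. A sketch (plain C; runqueues reduced to a load, a power and a task count, with invented numbers in main()):

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

struct rq_info {
        unsigned long weighted_load;    /* stand-in for weighted_cpuload(i) */
        unsigned long power;            /* stand-in for power_of(i)         */
        unsigned int  nr_running;
};

/* Illustrative sketch, not kernel code: returns the index of the busiest rq. */
static int find_busiest_queue(const struct rq_info *rq, int nr,
                              unsigned long imbalance)
{
        unsigned long max_load = 0;
        int busiest = -1;

        for (int i = 0; i < nr; i++) {
                unsigned long capacity = (rq[i].power + SCHED_LOAD_SCALE / 2) /
                                         SCHED_LOAD_SCALE;
                unsigned long wl = rq[i].weighted_load;

                /* a lone task bigger than the imbalance cannot help */
                if (capacity && rq[i].nr_running == 1 && wl > imbalance)
                        continue;

                wl = wl * SCHED_LOAD_SCALE / rq[i].power;       /* power-scale */
                if (wl > max_load) {
                        max_load = wl;
                        busiest = i;
                }
        }

        return busiest;
}

int main(void)
{
        /* cpu1 has less raw load but half the power, so it looks busier */
        struct rq_info rqs[] = {
                { 2048, 1024, 2 },
                { 1536,  512, 2 },
        };
        printf("%d\n", find_busiest_queue(rqs, 2, 512));
        return 0;
}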
2770
2771/*
2772 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2773 * so long as it is large enough.
2774 */
2775#define MAX_PINNED_INTERVAL 512
2776
2777/* Working cpumask for load_balance and load_balance_newidle. */
2778static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2779
2780static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2781{
2782 if (idle == CPU_NEWLY_IDLE) {
2783 /*
2784 * The only task running in a non-idle cpu can be moved to this
 2785 * cpu in an attempt to completely free up the other CPU
2786 * package.
2787 *
2788 * The package power saving logic comes from
 2789 * find_busiest_group(). If there is no imbalance, then
2790 * f_b_g() will return NULL. However when sched_mc={1,2} then
2791 * f_b_g() will select a group from which a running task may be
2792 * pulled to this cpu in order to make the other package idle.
2793 * If there is no opportunity to make a package idle and if
 2794 * there is no imbalance, then f_b_g() will return NULL and no
2795 * action will be taken in load_balance_newidle().
2796 *
2797 * Under normal task pull operation due to imbalance, there
2798 * will be more than one task in the source run queue and
2799 * move_tasks() will succeed. ld_moved will be true and this
2800 * active balance code will not be triggered.
2801 */
2802 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2803 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2804 return 0;
2805
2806 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2807 return 0;
2808 }
2809
2810 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2811}
2812
2813static int active_load_balance_cpu_stop(void *data);
2814
2815/*
2816 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2817 * tasks if there is an imbalance.
2818 */
2819static int load_balance(int this_cpu, struct rq *this_rq,
2820 struct sched_domain *sd, enum cpu_idle_type idle,
2821 int *balance)
2822{
2823 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2824 struct sched_group *group;
2825 unsigned long imbalance;
2826 struct rq *busiest;
2827 unsigned long flags;
2828 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
2829
2830 cpumask_copy(cpus, cpu_active_mask);
2831
2832 /*
2833	 * When the power savings policy is enabled for the parent domain, an idle
2834	 * sibling can pick up load irrespective of busy siblings. In this case,
2835	 * let the state of the idle sibling percolate up as CPU_IDLE, instead of
2836	 * portraying it as CPU_NOT_IDLE.
2837 */
2838 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2839 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2840 sd_idle = 1;
2841
2842 schedstat_inc(sd, lb_count[idle]);
2843
2844redo:
2845 update_shares(sd);
2846 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2847 cpus, balance);
2848
2849 if (*balance == 0)
2850 goto out_balanced;
2851
2852 if (!group) {
2853 schedstat_inc(sd, lb_nobusyg[idle]);
2854 goto out_balanced;
2855 }
2856
2857 busiest = find_busiest_queue(group, idle, imbalance, cpus);
2858 if (!busiest) {
2859 schedstat_inc(sd, lb_nobusyq[idle]);
2860 goto out_balanced;
2861 }
2862
2863 BUG_ON(busiest == this_rq);
2864
2865 schedstat_add(sd, lb_imbalance[idle], imbalance);
2866
2867 ld_moved = 0;
2868 if (busiest->nr_running > 1) {
2869 /*
2870 * Attempt to move tasks. If find_busiest_group has found
2871 * an imbalance but busiest->nr_running <= 1, the group is
2872 * still unbalanced. ld_moved simply stays zero, so it is
2873 * correctly treated as an imbalance.
2874 */
2875 local_irq_save(flags);
2876 double_rq_lock(this_rq, busiest);
2877 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2878 imbalance, sd, idle, &all_pinned);
2879 double_rq_unlock(this_rq, busiest);
2880 local_irq_restore(flags);
2881
2882 /*
2883 * some other cpu did the load balance for us.
2884 */
2885 if (ld_moved && this_cpu != smp_processor_id())
2886 resched_cpu(this_cpu);
2887
2888 /* All tasks on this runqueue were pinned by CPU affinity */
2889 if (unlikely(all_pinned)) {
2890 cpumask_clear_cpu(cpu_of(busiest), cpus);
2891 if (!cpumask_empty(cpus))
2892 goto redo;
2893 goto out_balanced;
2894 }
2895 }
2896
2897 if (!ld_moved) {
2898 schedstat_inc(sd, lb_failed[idle]);
2899 sd->nr_balance_failed++;
2900
2901 if (need_active_balance(sd, sd_idle, idle)) {
2902 raw_spin_lock_irqsave(&busiest->lock, flags);
2903
2904			/* don't kick active_load_balance_cpu_stop
2905			 * if the current task on the busiest cpu can't be
2906			 * moved to this_cpu
2907 */
2908 if (!cpumask_test_cpu(this_cpu,
2909 &busiest->curr->cpus_allowed)) {
2910 raw_spin_unlock_irqrestore(&busiest->lock,
2911 flags);
2912 all_pinned = 1;
2913 goto out_one_pinned;
2914 }
2915
2916 /*
2917 * ->active_balance synchronizes accesses to
2918 * ->active_balance_work. Once set, it's cleared
2919 * only after active load balance is finished.
2920 */
2921 if (!busiest->active_balance) {
2922 busiest->active_balance = 1;
2923 busiest->push_cpu = this_cpu;
2924 active_balance = 1;
2925 }
2926 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2927
2928 if (active_balance)
2929 stop_one_cpu_nowait(cpu_of(busiest),
2930 active_load_balance_cpu_stop, busiest,
2931 &busiest->active_balance_work);
2932
2933 /*
2934 * We've kicked active balancing, reset the failure
2935 * counter.
2936 */
2937 sd->nr_balance_failed = sd->cache_nice_tries+1;
2938 }
2939 } else
2940 sd->nr_balance_failed = 0;
2941
2942 if (likely(!active_balance)) {
2943 /* We were unbalanced, so reset the balancing interval */
2944 sd->balance_interval = sd->min_interval;
2945 } else {
2946 /*
2947 * If we've begun active balancing, start to back off. This
2948 * case may not be covered by the all_pinned logic if there
2949 * is only 1 task on the busy runqueue (because we don't call
2950 * move_tasks).
2951 */
2952 if (sd->balance_interval < sd->max_interval)
2953 sd->balance_interval *= 2;
2954 }
2955
2956 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2957 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2958 ld_moved = -1;
2959
2960 goto out;
2961
2962out_balanced:
2963 schedstat_inc(sd, lb_balanced[idle]);
2964
2965 sd->nr_balance_failed = 0;
2966
2967out_one_pinned:
2968 /* tune up the balancing interval */
2969 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2970 (sd->balance_interval < sd->max_interval))
2971 sd->balance_interval *= 2;
2972
2973 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2974 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2975 ld_moved = -1;
2976 else
2977 ld_moved = 0;
2978out:
2979 if (ld_moved)
2980 update_shares(sd);
2981 return ld_moved;
2982}
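A minimal sketch of the interval backoff applied above, that is, the doubling in the pinned/active-balance paths and the reset to min_interval once balancing no longer needs to back off; the tunable values below are invented for the example.

#include <stdio.h>

/* made-up sched_domain tunables, in jiffies */
struct dom {
	unsigned long balance_interval;
	unsigned long min_interval;
	unsigned long max_interval;
};

/* roughly what load_balance() does when it had to back off */
static void back_off(struct dom *d)
{
	if (d->balance_interval < d->max_interval)
		d->balance_interval *= 2;
}

/* roughly what load_balance() does when no active balancing was kicked */
static void reset_interval(struct dom *d)
{
	d->balance_interval = d->min_interval;
}

int main(void)
{
	struct dom d = { .balance_interval = 8, .min_interval = 8, .max_interval = 128 };

	for (int i = 1; i <= 6; i++) {
		back_off(&d);
		printf("after back-off %d: interval=%lu\n", i, d.balance_interval);
	}
	reset_interval(&d);
	printf("after reset: interval=%lu\n", d.balance_interval);
	return 0;
}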
2983
2984/*
2985 * idle_balance is called by schedule() if this_cpu is about to become
2986 * idle. Attempts to pull tasks from other CPUs.
2987 */
2988static void idle_balance(int this_cpu, struct rq *this_rq)
1934{ 2989{
1935 struct cfs_rq *busy_cfs_rq; 2990 struct sched_domain *sd;
1936 struct rq_iterator cfs_rq_iterator; 2991 int pulled_task = 0;
2992 unsigned long next_balance = jiffies + HZ;
1937 2993
1938 cfs_rq_iterator.start = load_balance_start_fair; 2994 this_rq->idle_stamp = this_rq->clock;
1939 cfs_rq_iterator.next = load_balance_next_fair; 2995
2996 if (this_rq->avg_idle < sysctl_sched_migration_cost)
2997 return;
2998
2999 /*
3000 * Drop the rq->lock, but keep IRQ/preempt disabled.
3001 */
3002 raw_spin_unlock(&this_rq->lock);
1940 3003
1941 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 3004 for_each_domain(this_cpu, sd) {
3005 unsigned long interval;
3006 int balance = 1;
3007
3008 if (!(sd->flags & SD_LOAD_BALANCE))
3009 continue;
3010
3011 if (sd->flags & SD_BALANCE_NEWIDLE) {
3012 /* If we've pulled tasks over stop searching: */
3013 pulled_task = load_balance(this_cpu, this_rq,
3014 sd, CPU_NEWLY_IDLE, &balance);
3015 }
3016
3017 interval = msecs_to_jiffies(sd->balance_interval);
3018 if (time_after(next_balance, sd->last_balance + interval))
3019 next_balance = sd->last_balance + interval;
3020 if (pulled_task) {
3021 this_rq->idle_stamp = 0;
3022 break;
3023 }
3024 }
3025
3026 raw_spin_lock(&this_rq->lock);
3027
3028 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
1942 /* 3029 /*
1943 * pass busy_cfs_rq argument into 3030 * We are going idle. next_balance may be set based on
1944 * load_balance_[start|next]_fair iterators 3031 * a busy processor. So reset next_balance.
1945 */ 3032 */
1946 cfs_rq_iterator.arg = busy_cfs_rq; 3033 this_rq->next_balance = next_balance;
1947 if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 3034 }
1948 &cfs_rq_iterator)) 3035}
1949 return 1; 3036
3037/*
3038 * active_load_balance_cpu_stop is run by the cpu stopper. It pushes
3039 * running tasks off the busiest CPU onto idle CPUs. It requires at
3040 * least 1 task to be running on each physical CPU where possible, and
3041 * avoids physical / logical imbalances.
3042 */
3043static int active_load_balance_cpu_stop(void *data)
3044{
3045 struct rq *busiest_rq = data;
3046 int busiest_cpu = cpu_of(busiest_rq);
3047 int target_cpu = busiest_rq->push_cpu;
3048 struct rq *target_rq = cpu_rq(target_cpu);
3049 struct sched_domain *sd;
3050
3051 raw_spin_lock_irq(&busiest_rq->lock);
3052
3053 /* make sure the requested cpu hasn't gone down in the meantime */
3054 if (unlikely(busiest_cpu != smp_processor_id() ||
3055 !busiest_rq->active_balance))
3056 goto out_unlock;
3057
3058 /* Is there any task to move? */
3059 if (busiest_rq->nr_running <= 1)
3060 goto out_unlock;
3061
3062 /*
3063	 * This condition is "impossible"; if it occurs,
3064	 * we need to fix it. Originally reported by
3065 * Bjorn Helgaas on a 128-cpu setup.
3066 */
3067 BUG_ON(busiest_rq == target_rq);
3068
3069 /* move a task from busiest_rq to target_rq */
3070 double_lock_balance(busiest_rq, target_rq);
3071
3072 /* Search for an sd spanning us and the target CPU. */
3073 for_each_domain(target_cpu, sd) {
3074 if ((sd->flags & SD_LOAD_BALANCE) &&
3075 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3076 break;
3077 }
3078
3079 if (likely(sd)) {
3080 schedstat_inc(sd, alb_count);
3081
3082 if (move_one_task(target_rq, target_cpu, busiest_rq,
3083 sd, CPU_IDLE))
3084 schedstat_inc(sd, alb_pushed);
3085 else
3086 schedstat_inc(sd, alb_failed);
3087 }
3088 double_unlock_balance(busiest_rq, target_rq);
3089out_unlock:
3090 busiest_rq->active_balance = 0;
3091 raw_spin_unlock_irq(&busiest_rq->lock);
3092 return 0;
3093}
3094
3095#ifdef CONFIG_NO_HZ
3096static struct {
3097 atomic_t load_balancer;
3098 cpumask_var_t cpu_mask;
3099 cpumask_var_t ilb_grp_nohz_mask;
3100} nohz ____cacheline_aligned = {
3101 .load_balancer = ATOMIC_INIT(-1),
3102};
3103
3104int get_nohz_load_balancer(void)
3105{
3106 return atomic_read(&nohz.load_balancer);
3107}
3108
3109#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3110/**
3111 * lowest_flag_domain - Return lowest sched_domain containing flag.
3112 * @cpu: The cpu whose lowest level of sched domain is to
3113 * be returned.
3114 * @flag: The flag to check for the lowest sched_domain
3115 * for the given cpu.
3116 *
3117 * Returns the lowest sched_domain of a cpu which contains the given flag.
3118 */
3119static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3120{
3121 struct sched_domain *sd;
3122
3123 for_each_domain(cpu, sd)
3124 if (sd && (sd->flags & flag))
3125 break;
3126
3127 return sd;
3128}
3129
3130/**
3131 * for_each_flag_domain - Iterates over sched_domains containing the flag.
3132 * @cpu: The cpu whose domains we're iterating over.
3133 * @sd: variable holding the value of the power_savings_sd
3134 * for cpu.
3135 * @flag: The flag to filter the sched_domains to be iterated.
3136 *
3137 * Iterates over all the scheduler domains for a given cpu that have the 'flag'
3138 * set, starting from the lowest sched_domain to the highest.
3139 */
3140#define for_each_flag_domain(cpu, sd, flag) \
3141 for (sd = lowest_flag_domain(cpu, flag); \
3142 (sd && (sd->flags & flag)); sd = sd->parent)
3143
3144/**
3145 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3146 * @ilb_group: group to be checked for semi-idleness
3147 *
3148 * Returns: 1 if the group is semi-idle. 0 otherwise.
3149 *
3150 * We define a sched_group to be semi-idle if it has at least one idle CPU
3151 * and at least one non-idle CPU. This helper function checks if the given
3152 * sched_group is semi-idle or not.
3153 */
3154static inline int is_semi_idle_group(struct sched_group *ilb_group)
3155{
3156 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
3157 sched_group_cpus(ilb_group));
3158
3159 /*
3160	 * A sched_group is semi-idle when it has at least one busy cpu
3161	 * and at least one idle cpu.
3162 */
3163 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
3164 return 0;
3165
3166 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
3167 return 0;
3168
3169 return 1;
3170}
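The semi-idle test above is just two cpumask operations; a user-space sketch with plain 64-bit masks (standing in for cpumask_t, which this is not) makes the invariant explicit: the intersection of the group with the idle mask must be neither empty nor the whole group.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* cpumasks modelled as 64-bit words, for illustration only */
static bool is_semi_idle(uint64_t group_cpus, uint64_t nohz_cpus)
{
	uint64_t idle_in_group = group_cpus & nohz_cpus;

	if (idle_in_group == 0)			/* no idle cpu in the group */
		return false;
	if (idle_in_group == group_cpus)	/* every cpu in the group is idle */
		return false;
	return true;				/* at least one idle and one busy */
}

int main(void)
{
	uint64_t group = 0x0f;			/* cpus 0-3 */

	printf("all busy : %d\n", is_semi_idle(group, 0x00));
	printf("semi-idle: %d\n", is_semi_idle(group, 0x03));	/* cpus 0-1 idle */
	printf("all idle : %d\n", is_semi_idle(group, 0x0f));
	return 0;
}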
3171/**
3172 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3173 * @cpu: The cpu which is nominating a new idle_load_balancer.
3174 *
3175 * Returns: the id of the idle load balancer if one exists;
3176 * otherwise, a value >= nr_cpu_ids.
3177 *
3178 * This algorithm picks the idle load balancer such that it belongs to a
3179 * semi-idle powersavings sched_domain. The idea is to avoid waking up
3180 * completely idle packages/cores just for the purpose of idle load balancing
3181 * when there are other idle cpus which are better suited for that job.
3182 */
3183static int find_new_ilb(int cpu)
3184{
3185 struct sched_domain *sd;
3186 struct sched_group *ilb_group;
3187
3188 /*
3189	 * Select the idle load balancer from semi-idle packages only
3190	 * when power-aware load balancing is enabled.
3191 */
3192 if (!(sched_smt_power_savings || sched_mc_power_savings))
3193 goto out_done;
3194
3195 /*
3196 * Optimize for the case when we have no idle CPUs or only one
3197 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3198 */
3199 if (cpumask_weight(nohz.cpu_mask) < 2)
3200 goto out_done;
3201
3202 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3203 ilb_group = sd->groups;
3204
3205 do {
3206 if (is_semi_idle_group(ilb_group))
3207 return cpumask_first(nohz.ilb_grp_nohz_mask);
3208
3209 ilb_group = ilb_group->next;
3210
3211 } while (ilb_group != sd->groups);
1950 } 3212 }
1951 3213
3214out_done:
3215 return cpumask_first(nohz.cpu_mask);
3216}
3217#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3218static inline int find_new_ilb(int call_cpu)
3219{
3220 return cpumask_first(nohz.cpu_mask);
3221}
3222#endif
3223
3224/*
3225 * This routine tries to nominate an ilb (idle load balancing) owner
3226 * among the cpus whose ticks are stopped. The ilb owner does the idle
3227 * load balancing on behalf of all those cpus. If all the cpus in the system
3228 * go into this tickless mode, then there will be no ilb owner (as there is
3229 * no need for one) and all the cpus will sleep until the next wakeup event
3230 * arrives.
3231 *
3232 * For the ilb owner the tick is not stopped, and this tick will be used
3233 * for idle load balancing. The ilb owner will still be part of
3234 * nohz.cpu_mask.
3235 *
3236 * While stopping the tick, this cpu will become the ilb owner if there
3237 * is no other owner, and it will remain the owner until it becomes busy
3238 * or until all cpus in the system stop their ticks, at which point
3239 * there is no need for an ilb owner.
3240 *
3241 * When the ilb owner becomes busy, it nominates another owner during the
3242 * next busy scheduler_tick().
3243 */
3244int select_nohz_load_balancer(int stop_tick)
3245{
3246 int cpu = smp_processor_id();
3247
3248 if (stop_tick) {
3249 cpu_rq(cpu)->in_nohz_recently = 1;
3250
3251 if (!cpu_active(cpu)) {
3252 if (atomic_read(&nohz.load_balancer) != cpu)
3253 return 0;
3254
3255 /*
3256 * If we are going offline and still the leader,
3257 * give up!
3258 */
3259 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3260 BUG();
3261
3262 return 0;
3263 }
3264
3265 cpumask_set_cpu(cpu, nohz.cpu_mask);
3266
3267 /* time for ilb owner also to sleep */
3268 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
3269 if (atomic_read(&nohz.load_balancer) == cpu)
3270 atomic_set(&nohz.load_balancer, -1);
3271 return 0;
3272 }
3273
3274 if (atomic_read(&nohz.load_balancer) == -1) {
3275 /* make me the ilb owner */
3276 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3277 return 1;
3278 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3279 int new_ilb;
3280
3281 if (!(sched_smt_power_savings ||
3282 sched_mc_power_savings))
3283 return 1;
3284 /*
3285 * Check to see if there is a more power-efficient
3286 * ilb.
3287 */
3288 new_ilb = find_new_ilb(cpu);
3289 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3290 atomic_set(&nohz.load_balancer, -1);
3291 resched_cpu(new_ilb);
3292 return 0;
3293 }
3294 return 1;
3295 }
3296 } else {
3297 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3298 return 0;
3299
3300 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3301
3302 if (atomic_read(&nohz.load_balancer) == cpu)
3303 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3304 BUG();
3305 }
1952 return 0; 3306 return 0;
1953} 3307}
3308#endif
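The nomination in select_nohz_load_balancer() hinges on atomic_cmpxchg() against nohz.load_balancer; a user-space approximation with C11 atomics (not the kernel API) shows the election in isolation.

#include <stdio.h>
#include <stdatomic.h>

/* -1 means "no idle load balancer", mirroring nohz.load_balancer */
static atomic_int load_balancer = ATOMIC_VAR_INIT(-1);

/* Returns 1 if @cpu won the election and became the owner. */
static int try_become_owner(int cpu)
{
	int expected = -1;

	/* only one contender can swing -1 -> cpu */
	return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

/* Owner gives up the role; the BUG()-style check is omitted in this sketch. */
static void resign_owner(int cpu)
{
	int expected = cpu;

	atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}

int main(void)
{
	printf("cpu2 elected: %d\n", try_become_owner(2));	/* 1: won */
	printf("cpu5 elected: %d\n", try_become_owner(5));	/* 0: already owned */
	resign_owner(2);
	printf("cpu5 elected: %d\n", try_become_owner(5));	/* 1: now wins */
	return 0;
}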
3309
3310static DEFINE_SPINLOCK(balancing);
3311
3312/*
3313 * It checks each scheduling domain to see if it is due to be balanced,
3314 * and initiates a balancing operation if so.
3315 *
3316 * Balancing parameters are set up in arch_init_sched_domains.
3317 */
3318static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3319{
3320 int balance = 1;
3321 struct rq *rq = cpu_rq(cpu);
3322 unsigned long interval;
3323 struct sched_domain *sd;
3324 /* Earliest time when we have to do rebalance again */
3325 unsigned long next_balance = jiffies + 60*HZ;
3326 int update_next_balance = 0;
3327 int need_serialize;
3328
3329 for_each_domain(cpu, sd) {
3330 if (!(sd->flags & SD_LOAD_BALANCE))
3331 continue;
3332
3333 interval = sd->balance_interval;
3334 if (idle != CPU_IDLE)
3335 interval *= sd->busy_factor;
3336
3337 /* scale ms to jiffies */
3338 interval = msecs_to_jiffies(interval);
3339 if (unlikely(!interval))
3340 interval = 1;
3341 if (interval > HZ*NR_CPUS/10)
3342 interval = HZ*NR_CPUS/10;
3343
3344 need_serialize = sd->flags & SD_SERIALIZE;
3345
3346 if (need_serialize) {
3347 if (!spin_trylock(&balancing))
3348 goto out;
3349 }
3350
3351 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3352 if (load_balance(cpu, rq, sd, idle, &balance)) {
3353 /*
3354 * We've pulled tasks over so either we're no
3355 * longer idle, or one of our SMT siblings is
3356 * not idle.
3357 */
3358 idle = CPU_NOT_IDLE;
3359 }
3360 sd->last_balance = jiffies;
3361 }
3362 if (need_serialize)
3363 spin_unlock(&balancing);
3364out:
3365 if (time_after(next_balance, sd->last_balance + interval)) {
3366 next_balance = sd->last_balance + interval;
3367 update_next_balance = 1;
3368 }
3369
3370 /*
3371 * Stop the load balance at this level. There is another
3372 * CPU in our sched group which is doing load balancing more
3373 * actively.
3374 */
3375 if (!balance)
3376 break;
3377 }
3378
3379 /*
3380 * next_balance will be updated only when there is a need.
3381	 * When the cpu is attached to the null domain, for example, it will not be
3382 * updated.
3383 */
3384 if (likely(update_next_balance))
3385 rq->next_balance = next_balance;
3386}
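A small sketch of the per-domain interval computed at the top of the loop above; the HZ and NR_CPUS values and the msecs_to_jiffies() stand-in are assumptions for illustration, not the kernel's.

#include <stdio.h>

#define HZ	250	/* assumed tick rate */
#define NR_CPUS	8	/* assumed for this sketch */

/* crude msecs_to_jiffies() stand-in (rounds up), for illustration only */
static unsigned long msecs_to_jiffies(unsigned long ms)
{
	return (ms * HZ + 999) / 1000;
}

static unsigned long effective_interval(unsigned long interval_ms,
					unsigned int busy_factor, int cpu_idle)
{
	unsigned long interval = interval_ms;

	if (!cpu_idle)
		interval *= busy_factor;	/* balance less often when busy */

	interval = msecs_to_jiffies(interval);
	if (!interval)
		interval = 1;			/* never drop below one jiffy */
	if (interval > HZ * NR_CPUS / 10)
		interval = HZ * NR_CPUS / 10;	/* global upper clamp */

	return interval;
}

int main(void)
{
	printf("idle cpu: %lu jiffies\n", effective_interval(64, 32, 1));
	printf("busy cpu: %lu jiffies\n", effective_interval(64, 32, 0));
	return 0;
}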
3387
3388/*
3389 * run_rebalance_domains is triggered when needed from the scheduler tick.
3390 * In the CONFIG_NO_HZ case, the idle load balance owner will do the
3391 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3392 */
3393static void run_rebalance_domains(struct softirq_action *h)
3394{
3395 int this_cpu = smp_processor_id();
3396 struct rq *this_rq = cpu_rq(this_cpu);
3397 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3398 CPU_IDLE : CPU_NOT_IDLE;
3399
3400 rebalance_domains(this_cpu, idle);
3401
3402#ifdef CONFIG_NO_HZ
3403 /*
3404 * If this cpu is the owner for idle load balancing, then do the
3405 * balancing on behalf of the other idle cpus whose ticks are
3406 * stopped.
3407 */
3408 if (this_rq->idle_at_tick &&
3409 atomic_read(&nohz.load_balancer) == this_cpu) {
3410 struct rq *rq;
3411 int balance_cpu;
3412
3413 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3414 if (balance_cpu == this_cpu)
3415 continue;
3416
3417 /*
3418 * If this cpu gets work to do, stop the load balancing
3419 * work being done for other cpus. Next load
3420 * balancing owner will pick it up.
3421 */
3422 if (need_resched())
3423 break;
3424
3425 rebalance_domains(balance_cpu, CPU_IDLE);
3426
3427 rq = cpu_rq(balance_cpu);
3428 if (time_after(this_rq->next_balance, rq->next_balance))
3429 this_rq->next_balance = rq->next_balance;
3430 }
3431 }
3432#endif
3433}
3434
3435static inline int on_null_domain(int cpu)
3436{
3437 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
3438}
3439
3440/*
3441 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3442 *
3443 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3444 * idle load balancing owner or decide to stop the periodic load balancing,
3445 * if the whole system is idle.
3446 */
3447static inline void trigger_load_balance(struct rq *rq, int cpu)
3448{
3449#ifdef CONFIG_NO_HZ
3450 /*
3451 * If we were in the nohz mode recently and busy at the current
3452	 * scheduler tick, then check if we need to nominate a new idle
3453 * load balancer.
3454 */
3455 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3456 rq->in_nohz_recently = 0;
3457
3458 if (atomic_read(&nohz.load_balancer) == cpu) {
3459 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3460 atomic_set(&nohz.load_balancer, -1);
3461 }
3462
3463 if (atomic_read(&nohz.load_balancer) == -1) {
3464 int ilb = find_new_ilb(cpu);
3465
3466 if (ilb < nr_cpu_ids)
3467 resched_cpu(ilb);
3468 }
3469 }
3470
3471 /*
3472 * If this cpu is idle and doing idle load balancing for all the
3473 * cpus with ticks stopped, is it time for that to stop?
3474 */
3475 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3476 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3477 resched_cpu(cpu);
3478 return;
3479 }
3480
3481 /*
3482 * If this cpu is idle and the idle load balancing is done by
3483	 * someone else, then there is no need to raise the SCHED_SOFTIRQ.
3484 */
3485 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3486 cpumask_test_cpu(cpu, nohz.cpu_mask))
3487 return;
3488#endif
3489 /* Don't need to rebalance while attached to NULL domain */
3490 if (time_after_eq(jiffies, rq->next_balance) &&
3491 likely(!on_null_domain(cpu)))
3492 raise_softirq(SCHED_SOFTIRQ);
3493}
1954 3494
1955static void rq_online_fair(struct rq *rq) 3495static void rq_online_fair(struct rq *rq)
1956{ 3496{
@@ -1962,6 +3502,15 @@ static void rq_offline_fair(struct rq *rq)
1962 update_sysctl(); 3502 update_sysctl();
1963} 3503}
1964 3504
3505#else /* CONFIG_SMP */
3506
3507/*
3508 * on UP we do not need to balance between CPUs:
3509 */
3510static inline void idle_balance(int cpu, struct rq *rq)
3511{
3512}
3513
1965#endif /* CONFIG_SMP */ 3514#endif /* CONFIG_SMP */
1966 3515
1967/* 3516/*
@@ -2076,7 +3625,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq)
2076} 3625}
2077#endif 3626#endif
2078 3627
2079unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 3628static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2080{ 3629{
2081 struct sched_entity *se = &task->se; 3630 struct sched_entity *se = &task->se;
2082 unsigned int rr_interval = 0; 3631 unsigned int rr_interval = 0;
@@ -2108,8 +3657,6 @@ static const struct sched_class fair_sched_class = {
2108#ifdef CONFIG_SMP 3657#ifdef CONFIG_SMP
2109 .select_task_rq = select_task_rq_fair, 3658 .select_task_rq = select_task_rq_fair,
2110 3659
2111 .load_balance = load_balance_fair,
2112 .move_one_task = move_one_task_fair,
2113 .rq_online = rq_online_fair, 3660 .rq_online = rq_online_fair,
2114 .rq_offline = rq_offline_fair, 3661 .rq_offline = rq_offline_fair,
2115 3662
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d5059fd761d9..83c66e8ad3ee 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,11 +1,4 @@
1/* 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows 2 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart. 4 * rip the spread apart.
@@ -13,13 +6,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1)
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14 7
15/* 8/*
16 * By not normalizing the sleep time, heavy tasks get an effective
17 * longer period, and lighter task an effective shorter period they
18 * are considered running.
19 */
20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
21
22/*
23 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
24 * tasks 10 * tasks
25 */ 11 */
@@ -31,37 +17,6 @@ SCHED_FEAT(START_DEBIT, 1)
31SCHED_FEAT(WAKEUP_PREEMPT, 1) 17SCHED_FEAT(WAKEUP_PREEMPT, 1)
32 18
33/* 19/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
58 * the remote end is likely to consume the data we just wrote, and
59 * therefore has cache benefit from being placed on the same cpu, see
60 * also AFFINE_WAKEUPS.
61 */
62SCHED_FEAT(SYNC_WAKEUPS, 1)
63
64/*
65 * Based on load and program behaviour, see if it makes sense to place 20 * Based on load and program behaviour, see if it makes sense to place
66 * a newly woken task on the same cpu as the task that woke it -- 21 * a newly woken task on the same cpu as the task that woke it --
67 * improve cache locality. Typically used with SYNC wakeups as 22 * improve cache locality. Typically used with SYNC wakeups as
@@ -70,16 +25,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
70SCHED_FEAT(AFFINE_WAKEUPS, 1) 25SCHED_FEAT(AFFINE_WAKEUPS, 1)
71 26
72/* 27/*
73 * Weaken SYNC hint based on overlap
74 */
75SCHED_FEAT(SYNC_LESS, 1)
76
77/*
78 * Add SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_MORE, 0)
81
82/*
83 * Prefer to schedule the task we woke last (assuming it failed 28 * Prefer to schedule the task we woke last (assuming it failed
84 * wakeup-preemption), since its likely going to consume data we 29 * wakeup-preemption), since its likely going to consume data we
85 * touched, increases cache locality. 30 * touched, increases cache locality.
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 5f93b570d383..9fa0f402c87c 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,8 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
10{ 11{
11 return task_cpu(p); /* IDLE tasks as never migrated */ 12 return task_cpu(p); /* IDLE tasks as never migrated */
12} 13}
@@ -22,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
22static struct task_struct *pick_next_task_idle(struct rq *rq) 23static struct task_struct *pick_next_task_idle(struct rq *rq)
23{ 24{
24 schedstat_inc(rq, sched_goidle); 25 schedstat_inc(rq, sched_goidle);
25 /* adjust the active tasks as we might go into a long sleep */ 26 calc_load_account_idle(rq);
26 calc_load_account_active(rq);
27 return rq->idle; 27 return rq->idle;
28} 28}
29 29
@@ -32,7 +32,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
32 * message if some code attempts to do it: 32 * message if some code attempts to do it:
33 */ 33 */
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 raw_spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
@@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
44{ 44{
45} 45}
46 46
47#ifdef CONFIG_SMP
48static unsigned long
49load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
50 unsigned long max_load_move,
51 struct sched_domain *sd, enum cpu_idle_type idle,
52 int *all_pinned, int *this_best_prio)
53{
54 return 0;
55}
56
57static int
58move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
59 struct sched_domain *sd, enum cpu_idle_type idle)
60{
61 return 0;
62}
63#endif
64
65static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
66{ 48{
67} 49}
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
98} 80}
99 81
100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 83{
102 return 0; 84 return 0;
103} 85}
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
119 101
120#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
121 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
122
123 .load_balance = load_balance_idle,
124 .move_one_task = move_one_task_idle,
125#endif 104#endif
126 105
127 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f48328ac216f..8afb953e31c6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
194 return rt_se->my_q; 194 return rt_se->my_q;
195} 195}
196 196
197static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 197static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
198static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 198static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 199
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 int this_cpu = smp_processor_id();
202 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
203 struct sched_rt_entity *rt_se = rt_rq->rt_se; 204 struct sched_rt_entity *rt_se;
205
206 rt_se = rt_rq->tg->rt_se[this_cpu];
204 207
205 if (rt_rq->rt_nr_running) { 208 if (rt_rq->rt_nr_running) {
206 if (rt_se && !on_rt_rq(rt_se)) 209 if (rt_se && !on_rt_rq(rt_se))
207 enqueue_rt_entity(rt_se); 210 enqueue_rt_entity(rt_se, false);
208 if (rt_rq->highest_prio.curr < curr->prio) 211 if (rt_rq->highest_prio.curr < curr->prio)
209 resched_task(curr); 212 resched_task(curr);
210 } 213 }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212 215
213static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
214{ 217{
215 struct sched_rt_entity *rt_se = rt_rq->rt_se; 218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se;
220
221 rt_se = rt_rq->tg->rt_se[this_cpu];
216 222
217 if (rt_se && on_rt_rq(rt_se)) 223 if (rt_se && on_rt_rq(rt_se))
218 dequeue_rt_entity(rt_se); 224 dequeue_rt_entity(rt_se);
@@ -607,7 +613,7 @@ static void update_curr_rt(struct rq *rq)
607 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
608 delta_exec = 0; 614 delta_exec = 0;
609 615
610 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 616 schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
611 617
612 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
613 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
803 dec_rt_group(rt_se, rt_rq); 809 dec_rt_group(rt_se, rt_rq);
804} 810}
805 811
806static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 812static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
807{ 813{
808 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 814 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
809 struct rt_prio_array *array = &rt_rq->active; 815 struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
819 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
820 return; 826 return;
821 827
822 list_add_tail(&rt_se->run_list, queue); 828 if (head)
829 list_add(&rt_se->run_list, queue);
830 else
831 list_add_tail(&rt_se->run_list, queue);
823 __set_bit(rt_se_prio(rt_se), array->bitmap); 832 __set_bit(rt_se_prio(rt_se), array->bitmap);
824 833
825 inc_rt_tasks(rt_se, rt_rq); 834 inc_rt_tasks(rt_se, rt_rq);
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
856 } 865 }
857} 866}
858 867
859static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 868static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
860{ 869{
861 dequeue_rt_stack(rt_se); 870 dequeue_rt_stack(rt_se);
862 for_each_sched_rt_entity(rt_se) 871 for_each_sched_rt_entity(rt_se)
863 __enqueue_rt_entity(rt_se); 872 __enqueue_rt_entity(rt_se, head);
864} 873}
865 874
866static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 875static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,27 +880,28 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
871 struct rt_rq *rt_rq = group_rt_rq(rt_se); 880 struct rt_rq *rt_rq = group_rt_rq(rt_se);
872 881
873 if (rt_rq && rt_rq->rt_nr_running) 882 if (rt_rq && rt_rq->rt_nr_running)
874 __enqueue_rt_entity(rt_se); 883 __enqueue_rt_entity(rt_se, false);
875 } 884 }
876} 885}
877 886
878/* 887/*
879 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
880 */ 889 */
881static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
882{ 892{
883 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
884 894
885 if (wakeup) 895 if (flags & ENQUEUE_WAKEUP)
886 rt_se->timeout = 0; 896 rt_se->timeout = 0;
887 897
888 enqueue_rt_entity(rt_se); 898 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
889 899
890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
891 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
892} 902}
893 903
894static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
895{ 905{
896 struct sched_rt_entity *rt_se = &p->rt; 906 struct sched_rt_entity *rt_se = &p->rt;
897 907
@@ -938,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
938#ifdef CONFIG_SMP 948#ifdef CONFIG_SMP
939static int find_lowest_rq(struct task_struct *task); 949static int find_lowest_rq(struct task_struct *task);
940 950
941static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 951static int
952select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
942{ 953{
943 struct rq *rq = task_rq(p);
944
945 if (sd_flag != SD_BALANCE_WAKE) 954 if (sd_flag != SD_BALANCE_WAKE)
946 return smp_processor_id(); 955 return smp_processor_id();
947 956
@@ -1136,7 +1145,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1136 if (next && next->prio < idx) 1145 if (next && next->prio < idx)
1137 continue; 1146 continue;
1138 list_for_each_entry(rt_se, array->queue + idx, run_list) { 1147 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1139 struct task_struct *p = rt_task_of(rt_se); 1148 struct task_struct *p;
1149
1150 if (!rt_entity_is_task(rt_se))
1151 continue;
1152
1153 p = rt_task_of(rt_se);
1140 if (pick_rt_task(rq, p, cpu)) { 1154 if (pick_rt_task(rq, p, cpu)) {
1141 next = p; 1155 next = p;
1142 break; 1156 break;
@@ -1481,24 +1495,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1481 push_rt_tasks(rq); 1495 push_rt_tasks(rq);
1482} 1496}
1483 1497
1484static unsigned long
1485load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1486 unsigned long max_load_move,
1487 struct sched_domain *sd, enum cpu_idle_type idle,
1488 int *all_pinned, int *this_best_prio)
1489{
1490 /* don't touch RT tasks */
1491 return 0;
1492}
1493
1494static int
1495move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1496 struct sched_domain *sd, enum cpu_idle_type idle)
1497{
1498 /* don't touch RT tasks */
1499 return 0;
1500}
1501
1502static void set_cpus_allowed_rt(struct task_struct *p, 1498static void set_cpus_allowed_rt(struct task_struct *p,
1503 const struct cpumask *new_mask) 1499 const struct cpumask *new_mask)
1504{ 1500{
@@ -1670,8 +1666,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1670 if (!p->signal) 1666 if (!p->signal)
1671 return; 1667 return;
1672 1668
1673 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; 1669 /* max may change after cur was read, this will be fixed next tick */
1674 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; 1670 soft = task_rlimit(p, RLIMIT_RTTIME);
1671 hard = task_rlimit_max(p, RLIMIT_RTTIME);
1675 1672
1676 if (soft != RLIM_INFINITY) { 1673 if (soft != RLIM_INFINITY) {
1677 unsigned long next; 1674 unsigned long next;
@@ -1721,7 +1718,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1718 dequeue_pushable_task(rq, p);
1722} 1719}
1723 1720
1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 1721static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1722{
1726 /* 1723 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1724 * Time slice is 0 for SCHED_FIFO tasks
@@ -1746,8 +1743,6 @@ static const struct sched_class rt_sched_class = {
1746#ifdef CONFIG_SMP 1743#ifdef CONFIG_SMP
1747 .select_task_rq = select_task_rq_rt, 1744 .select_task_rq = select_task_rq_rt,
1748 1745
1749 .load_balance = load_balance_rt,
1750 .move_one_task = move_one_task_rt,
1751 .set_cpus_allowed = set_cpus_allowed_rt, 1746 .set_cpus_allowed = set_cpus_allowed_rt,
1752 .rq_online = rq_online_rt, 1747 .rq_online = rq_online_rt,
1753 .rq_offline = rq_offline_rt, 1748 .rq_offline = rq_offline_rt,
diff --git a/kernel/signal.c b/kernel/signal.c
index 934ae5e687b9..906ae5a1779c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -159,6 +159,10 @@ void recalc_sigpending(void)
159 159
160/* Given the mask, find the first available signal that should be serviced. */ 160/* Given the mask, find the first available signal that should be serviced. */
161 161
162#define SYNCHRONOUS_MASK \
163 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
164 sigmask(SIGTRAP) | sigmask(SIGFPE))
165
162int next_signal(struct sigpending *pending, sigset_t *mask) 166int next_signal(struct sigpending *pending, sigset_t *mask)
163{ 167{
164 unsigned long i, *s, *m, x; 168 unsigned long i, *s, *m, x;
@@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
166 170
167 s = pending->signal.sig; 171 s = pending->signal.sig;
168 m = mask->sig; 172 m = mask->sig;
173
174 /*
175 * Handle the first word specially: it contains the
176 * synchronous signals that need to be dequeued first.
177 */
178 x = *s &~ *m;
179 if (x) {
180 if (x & SYNCHRONOUS_MASK)
181 x &= SYNCHRONOUS_MASK;
182 sig = ffz(~x) + 1;
183 return sig;
184 }
185
169 switch (_NSIG_WORDS) { 186 switch (_NSIG_WORDS) {
170 default: 187 default:
171 for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) 188 for (i = 1; i < _NSIG_WORDS; ++i) {
172 if ((x = *s &~ *m) != 0) { 189 x = *++s &~ *++m;
173 sig = ffz(~x) + i*_NSIG_BPW + 1; 190 if (!x)
174 break; 191 continue;
175 } 192 sig = ffz(~x) + i*_NSIG_BPW + 1;
193 break;
194 }
176 break; 195 break;
177 196
178 case 2: if ((x = s[0] &~ m[0]) != 0) 197 case 2:
179 sig = 1; 198 x = s[1] &~ m[1];
180 else if ((x = s[1] &~ m[1]) != 0) 199 if (!x)
181 sig = _NSIG_BPW + 1;
182 else
183 break; 200 break;
184 sig += ffz(~x); 201 sig = ffz(~x) + _NSIG_BPW + 1;
185 break; 202 break;
186 203
187 case 1: if ((x = *s &~ *m) != 0) 204 case 1:
188 sig = ffz(~x) + 1; 205 /* Nothing to do */
189 break; 206 break;
190 } 207 }
191 208
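An aside on the next_signal() change above: with the new SYNCHRONOUS_MASK, fault-generated signals in the first word are dequeued before anything else. A user-space sketch (signal numbers as on x86 Linux, ffz(~x) replaced by a GCC/Clang builtin) reproduces the selection order.

#include <stdio.h>

/* bit i set => signal (i + 1) pending */
#define SIGMASK(sig)	(1UL << ((sig) - 1))

/* synchronous (fault-generated) signals, as listed in the hunk above */
#define SIGILL	 4
#define SIGTRAP	 5
#define SIGBUS	 7
#define SIGFPE	 8
#define SIGSEGV	11

#define SYNCHRONOUS_MASK \
	(SIGMASK(SIGSEGV) | SIGMASK(SIGBUS) | SIGMASK(SIGILL) | \
	 SIGMASK(SIGTRAP) | SIGMASK(SIGFPE))

/* ffz(~x) in the kernel == index of the lowest set bit of x */
static int lowest_set_bit(unsigned long x)
{
	return __builtin_ctzl(x);
}

static int next_signal(unsigned long pending, unsigned long blocked)
{
	unsigned long x = pending & ~blocked;

	if (!x)
		return 0;
	if (x & SYNCHRONOUS_MASK)	/* deliver faults before anything else */
		x &= SYNCHRONOUS_MASK;
	return lowest_set_bit(x) + 1;
}

int main(void)
{
	unsigned long pending = SIGMASK(2 /* SIGINT */) | SIGMASK(SIGSEGV);

	printf("picked signal %d\n", next_signal(pending, 0));		/* 11: SIGSEGV wins */
	printf("picked signal %d\n", next_signal(SIGMASK(2), 0));	/* 2: SIGINT */
	return 0;
}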
@@ -228,7 +245,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
228 245
229 if (override_rlimit || 246 if (override_rlimit ||
230 atomic_read(&user->sigpending) <= 247 atomic_read(&user->sigpending) <=
231 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { 248 task_rlimit(t, RLIMIT_SIGPENDING)) {
232 q = kmem_cache_alloc(sigqueue_cachep, flags); 249 q = kmem_cache_alloc(sigqueue_cachep, flags);
233 } else { 250 } else {
234 print_dropped_signal(sig); 251 print_dropped_signal(sig);
@@ -625,7 +642,7 @@ static inline bool si_fromuser(const struct siginfo *info)
625static int check_kill_permission(int sig, struct siginfo *info, 642static int check_kill_permission(int sig, struct siginfo *info,
626 struct task_struct *t) 643 struct task_struct *t)
627{ 644{
628 const struct cred *cred = current_cred(), *tcred; 645 const struct cred *cred, *tcred;
629 struct pid *sid; 646 struct pid *sid;
630 int error; 647 int error;
631 648
@@ -639,8 +656,10 @@ static int check_kill_permission(int sig, struct siginfo *info,
639 if (error) 656 if (error)
640 return error; 657 return error;
641 658
659 cred = current_cred();
642 tcred = __task_cred(t); 660 tcred = __task_cred(t);
643 if ((cred->euid ^ tcred->suid) && 661 if (!same_thread_group(current, t) &&
662 (cred->euid ^ tcred->suid) &&
644 (cred->euid ^ tcred->uid) && 663 (cred->euid ^ tcred->uid) &&
645 (cred->uid ^ tcred->suid) && 664 (cred->uid ^ tcred->suid) &&
646 (cred->uid ^ tcred->uid) && 665 (cred->uid ^ tcred->uid) &&
@@ -1066,23 +1085,24 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1066/* 1085/*
1067 * Nuke all other threads in the group. 1086 * Nuke all other threads in the group.
1068 */ 1087 */
1069void zap_other_threads(struct task_struct *p) 1088int zap_other_threads(struct task_struct *p)
1070{ 1089{
1071 struct task_struct *t; 1090 struct task_struct *t = p;
1091 int count = 0;
1072 1092
1073 p->signal->group_stop_count = 0; 1093 p->signal->group_stop_count = 0;
1074 1094
1075 for (t = next_thread(p); t != p; t = next_thread(t)) { 1095 while_each_thread(p, t) {
1076 /* 1096 count++;
1077 * Don't bother with already dead threads 1097
1078 */ 1098 /* Don't bother with already dead threads */
1079 if (t->exit_state) 1099 if (t->exit_state)
1080 continue; 1100 continue;
1081
1082 /* SIGKILL will be handled before any pending SIGSTOP */
1083 sigaddset(&t->pending.signal, SIGKILL); 1101 sigaddset(&t->pending.signal, SIGKILL);
1084 signal_wake_up(t, 1); 1102 signal_wake_up(t, 1);
1085 } 1103 }
1104
1105 return count;
1086} 1106}
1087 1107
1088struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1108struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
@@ -2718,3 +2738,43 @@ void __init signals_init(void)
2718{ 2738{
2719 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); 2739 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
2720} 2740}
2741
2742#ifdef CONFIG_KGDB_KDB
2743#include <linux/kdb.h>
2744/*
2745 * kdb_send_sig_info - Allows kdb to send signals without exposing
2746 * signal internals. This function checks if the required locks are
2747 * available before calling the main signal code, to avoid kdb
2748 * deadlocks.
2749 */
2750void
2751kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
2752{
2753 static struct task_struct *kdb_prev_t;
2754 int sig, new_t;
2755 if (!spin_trylock(&t->sighand->siglock)) {
2756 kdb_printf("Can't do kill command now.\n"
2757 "The sigmask lock is held somewhere else in "
2758 "kernel, try again later\n");
2759 return;
2760 }
2761 spin_unlock(&t->sighand->siglock);
2762 new_t = kdb_prev_t != t;
2763 kdb_prev_t = t;
2764 if (t->state != TASK_RUNNING && new_t) {
2765 kdb_printf("Process is not RUNNING, sending a signal from "
2766 "kdb risks deadlock\n"
2767 "on the run queue locks. "
2768 "The signal has _not_ been sent.\n"
2769 "Reissue the kill command if you want to risk "
2770 "the deadlock.\n");
2771 return;
2772 }
2773 sig = info->si_signo;
2774 if (send_sig_info(sig, info, t))
2775 kdb_printf("Fail to deliver Signal %d to process %d.\n",
2776 sig, t->pid);
2777 else
2778 kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid);
2779}
2780#endif /* CONFIG_KGDB_KDB */
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 7494bbf5a270..7d3f4fa9ef4f 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -637,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
637 goto cancelled; 637 goto cancelled;
638 638
639 /* the timer holds a reference whilst it is pending */ 639 /* the timer holds a reference whilst it is pending */
640 ret = work->ops->get_ref(work); 640 ret = slow_work_get_ref(work);
641 if (ret < 0) 641 if (ret < 0)
642 goto cant_get_ref; 642 goto cant_get_ref;
643 643
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
index 321f3c59d732..a29ebd1ef41d 100644
--- a/kernel/slow-work.h
+++ b/kernel/slow-work.h
@@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
43 */ 43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid) 44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{ 45{
46#ifdef CONFIG_SLOW_WORK_PROC 46#ifdef CONFIG_SLOW_WORK_DEBUG
47 slow_work_pids[id] = pid; 47 slow_work_pids[id] = pid;
48#endif 48#endif
49} 49}
50 50
51static inline void slow_work_mark_time(struct slow_work *work) 51static inline void slow_work_mark_time(struct slow_work *work)
52{ 52{
53#ifdef CONFIG_SLOW_WORK_PROC 53#ifdef CONFIG_SLOW_WORK_DEBUG
54 work->mark = CURRENT_TIME; 54 work->mark = CURRENT_TIME;
55#endif 55#endif
56} 56}
57 57
58static inline void slow_work_begin_exec(int id, struct slow_work *work) 58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{ 59{
60#ifdef CONFIG_SLOW_WORK_PROC 60#ifdef CONFIG_SLOW_WORK_DEBUG
61 slow_work_execs[id] = work; 61 slow_work_execs[id] = work;
62#endif 62#endif
63} 63}
64 64
65static inline void slow_work_end_exec(int id, struct slow_work *work) 65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{ 66{
67#ifdef CONFIG_SLOW_WORK_PROC 67#ifdef CONFIG_SLOW_WORK_DEBUG
68 write_lock(&slow_work_execs_lock); 68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL; 69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock); 70 write_unlock(&slow_work_execs_lock);
diff --git a/kernel/smp.c b/kernel/smp.c
index f10408422444..75c970c715d3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -9,11 +9,10 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/gfp.h>
12#include <linux/smp.h> 13#include <linux/smp.h>
13#include <linux/cpu.h> 14#include <linux/cpu.h>
14 15
15static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16
17static struct { 16static struct {
18 struct list_head queue; 17 struct list_head queue;
19 raw_spinlock_t lock; 18 raw_spinlock_t lock;
@@ -33,12 +32,14 @@ struct call_function_data {
33 cpumask_var_t cpumask; 32 cpumask_var_t cpumask;
34}; 33};
35 34
35static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
36
36struct call_single_queue { 37struct call_single_queue {
37 struct list_head list; 38 struct list_head list;
38 raw_spinlock_t lock; 39 raw_spinlock_t lock;
39}; 40};
40 41
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 42static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
42 43
43static int 44static int
44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 45hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -51,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
51 case CPU_UP_PREPARE_FROZEN: 52 case CPU_UP_PREPARE_FROZEN:
52 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 53 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
53 cpu_to_node(cpu))) 54 cpu_to_node(cpu)))
54 return NOTIFY_BAD; 55 return notifier_from_errno(-ENOMEM);
55 break; 56 break;
56 57
57#ifdef CONFIG_HOTPLUG_CPU 58#ifdef CONFIG_HOTPLUG_CPU
@@ -256,7 +257,7 @@ void generic_smp_call_function_single_interrupt(void)
256 } 257 }
257} 258}
258 259
259static DEFINE_PER_CPU(struct call_single_data, csd_data); 260static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
260 261
261/* 262/*
262 * smp_call_function_single - Run a function on a specific CPU 263 * smp_call_function_single - Run a function on a specific CPU
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a09502e2ef75..07b4f1b1a73a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill);
500 */ 500 */
501 501
502/* 502/*
503 * The trampoline is called when the hrtimer expires. If this is 503 * The trampoline is called when the hrtimer expires. It schedules a tasklet
504 * called from the hrtimer interrupt then we schedule the tasklet as 504 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
505 * the timer callback function expects to run in softirq context. If 505 * hrtimer callback, but from softirq context.
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */ 506 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 507static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{ 508{
511 struct tasklet_hrtimer *ttimer = 509 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer); 510 container_of(timer, struct tasklet_hrtimer, timer);
513 511
514 if (hrtimer_is_hres_active(timer)) { 512 tasklet_hi_schedule(&ttimer->tasklet);
515 tasklet_hi_schedule(&ttimer->tasklet); 513 return HRTIMER_NORESTART;
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519} 514}
520 515
521/* 516/*
@@ -721,7 +716,7 @@ static int run_ksoftirqd(void * __bind_cpu)
721 preempt_enable_no_resched(); 716 preempt_enable_no_resched();
722 cond_resched(); 717 cond_resched();
723 preempt_disable(); 718 preempt_disable();
724 rcu_sched_qs((long)__bind_cpu); 719 rcu_note_context_switch((long)__bind_cpu);
725 } 720 }
726 preempt_enable(); 721 preempt_enable();
727 set_current_state(TASK_INTERRUPTIBLE); 722 set_current_state(TASK_INTERRUPTIBLE);
@@ -813,7 +808,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
813 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
814 if (IS_ERR(p)) { 809 if (IS_ERR(p)) {
815 printk("ksoftirqd for %i failed\n", hotcpu); 810 printk("ksoftirqd for %i failed\n", hotcpu);
816 return NOTIFY_BAD; 811 return notifier_from_errno(PTR_ERR(p));
817 } 812 }
818 kthread_bind(p, hotcpu); 813 kthread_bind(p, hotcpu);
819 per_cpu(ksoftirqd, hotcpu) = p; 814 per_cpu(ksoftirqd, hotcpu) = p;
@@ -855,7 +850,7 @@ static __init int spawn_ksoftirqd(void)
855 void *cpu = (void *)(long)smp_processor_id(); 850 void *cpu = (void *)(long)smp_processor_id();
856 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 851 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
857 852
858 BUG_ON(err == NOTIFY_BAD); 853 BUG_ON(err != NOTIFY_OK);
859 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 854 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
860 register_cpu_notifier(&cpu_nfb); 855 register_cpu_notifier(&cpu_nfb);
861 return 0; 856 return 0;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d22579087e27..4b493f67dcb5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock);
25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ 25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ 26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
28 29
29static int __read_mostly did_panic; 30static int __read_mostly did_panic;
30int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
@@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void)
79} 80}
80EXPORT_SYMBOL(touch_softlockup_watchdog); 81EXPORT_SYMBOL(touch_softlockup_watchdog);
81 82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
82void touch_all_softlockup_watchdogs(void) 89void touch_all_softlockup_watchdogs(void)
83{ 90{
84 int cpu; 91 int cpu;
@@ -118,6 +125,14 @@ void softlockup_tick(void)
118 } 125 }
119 126
120 if (touch_ts == 0) { 127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
121 __touch_softlockup_watchdog(); 136 __touch_softlockup_watchdog();
122 return; 137 return;
123 } 138 }
@@ -140,11 +155,11 @@ void softlockup_tick(void)
140 * Wake up the high-prio watchdog task twice per 155 * Wake up the high-prio watchdog task twice per
141 * threshold timespan. 156 * threshold timespan.
142 */ 157 */
143 if (now > touch_ts + softlockup_thresh/2) 158 if (time_after(now - softlockup_thresh/2, touch_ts))
144 wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); 159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
145 160
146 /* Warn about unreasonable delays: */ 161 /* Warn about unreasonable delays: */
147 if (now <= (touch_ts + softlockup_thresh)) 162 if (time_before_eq(now - softlockup_thresh, touch_ts))
148 return; 163 return;
149 164
150 per_cpu(softlockup_print_ts, this_cpu) = touch_ts; 165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
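The softlockup hunk above switches to wrap-safe comparisons; a user-space model of time_after()/time_before_eq() (simplified, not the kernel macros) shows why the signed-difference form survives counter wrap.

#include <stdio.h>

/* wrap-safe comparisons in the style of time_after()/time_before_eq() */
typedef unsigned long jiffies_t;

static int time_after(jiffies_t a, jiffies_t b)
{
	return (long)(b - a) < 0;
}

static int time_before_eq(jiffies_t a, jiffies_t b)
{
	return (long)(a - b) <= 0;
}

int main(void)
{
	jiffies_t now = (jiffies_t)-5;	/* just before the counter wraps */
	jiffies_t then = now + 10;	/* 10 ticks later, after the wrap */

	printf("time_after(then, now)     = %d\n", time_after(then, now));	/* 1 */
	printf("time_before_eq(now, then) = %d\n", time_before_eq(now, then));	/* 1 */
	return 0;
}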
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 818d7d9aa03c..2980da3fd509 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -30,10 +30,33 @@
30#include <linux/preempt.h> 30#include <linux/preempt.h>
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/slab.h>
34#include <linux/smp.h> 33#include <linux/smp.h>
35#include <linux/srcu.h> 34#include <linux/srcu.h>
36 35
36static int init_srcu_struct_fields(struct srcu_struct *sp)
37{
38 sp->completed = 0;
39 mutex_init(&sp->mutex);
40 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
41 return sp->per_cpu_ref ? 0 : -ENOMEM;
42}
43
44#ifdef CONFIG_DEBUG_LOCK_ALLOC
45
46int __init_srcu_struct(struct srcu_struct *sp, const char *name,
47 struct lock_class_key *key)
48{
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 /* Don't re-initialize a lock while it is held. */
51 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
52 lockdep_init_map(&sp->dep_map, name, key, 0);
53#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
54 return init_srcu_struct_fields(sp);
55}
56EXPORT_SYMBOL_GPL(__init_srcu_struct);
57
58#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
59
37/** 60/**
38 * init_srcu_struct - initialize a sleep-RCU structure 61 * init_srcu_struct - initialize a sleep-RCU structure
39 * @sp: structure to initialize. 62 * @sp: structure to initialize.
@@ -44,13 +67,12 @@
44 */ 67 */
45int init_srcu_struct(struct srcu_struct *sp) 68int init_srcu_struct(struct srcu_struct *sp)
46{ 69{
47 sp->completed = 0; 70 return init_srcu_struct_fields(sp);
48 mutex_init(&sp->mutex);
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 71}
52EXPORT_SYMBOL_GPL(init_srcu_struct); 72EXPORT_SYMBOL_GPL(init_srcu_struct);
53 73
74#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
75
54/* 76/*
55 * srcu_readers_active_idx -- returns approximate number of readers 77 * srcu_readers_active_idx -- returns approximate number of readers
56 * active on the specified rank of per-CPU counters. 78 * active on the specified rank of per-CPU counters.
@@ -100,15 +122,12 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
100} 122}
101EXPORT_SYMBOL_GPL(cleanup_srcu_struct); 123EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
102 124
103/** 125/*
104 * srcu_read_lock - register a new reader for an SRCU-protected structure.
105 * @sp: srcu_struct in which to register the new reader.
106 *
107 * Counts the new reader in the appropriate per-CPU element of the 126 * Counts the new reader in the appropriate per-CPU element of the
108 * srcu_struct. Must be called from process context. 127 * srcu_struct. Must be called from process context.
109 * Returns an index that must be passed to the matching srcu_read_unlock(). 128 * Returns an index that must be passed to the matching srcu_read_unlock().
110 */ 129 */
111int srcu_read_lock(struct srcu_struct *sp) 130int __srcu_read_lock(struct srcu_struct *sp)
112{ 131{
113 int idx; 132 int idx;
114 133
@@ -120,31 +139,27 @@ int srcu_read_lock(struct srcu_struct *sp)
120 preempt_enable(); 139 preempt_enable();
121 return idx; 140 return idx;
122} 141}
123EXPORT_SYMBOL_GPL(srcu_read_lock); 142EXPORT_SYMBOL_GPL(__srcu_read_lock);
124 143
125/** 144/*
126 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
127 * @sp: srcu_struct in which to unregister the old reader.
128 * @idx: return value from corresponding srcu_read_lock().
129 *
130 * Removes the count for the old reader from the appropriate per-CPU 145 * Removes the count for the old reader from the appropriate per-CPU
131 * element of the srcu_struct. Note that this may well be a different 146 * element of the srcu_struct. Note that this may well be a different
132 * CPU than that which was incremented by the corresponding srcu_read_lock(). 147 * CPU than that which was incremented by the corresponding srcu_read_lock().
133 * Must be called from process context. 148 * Must be called from process context.
134 */ 149 */
135void srcu_read_unlock(struct srcu_struct *sp, int idx) 150void __srcu_read_unlock(struct srcu_struct *sp, int idx)
136{ 151{
137 preempt_disable(); 152 preempt_disable();
138 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 153 srcu_barrier(); /* ensure compiler won't misorder critical section. */
139 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 154 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
140 preempt_enable(); 155 preempt_enable();
141} 156}
142EXPORT_SYMBOL_GPL(srcu_read_unlock); 157EXPORT_SYMBOL_GPL(__srcu_read_unlock);
143 158
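A sketch of the read side served by __srcu_read_lock()/__srcu_read_unlock(); the srcu_read_lock()/srcu_read_unlock() wrappers that add the lockdep acquire/release annotations live in the header (not shown here), and my_data/my_ptr/read_value are illustrative names:

#include <linux/srcu.h>

struct my_data {
	int value;
};

static struct my_data *my_ptr;	/* published elsewhere with rcu_assign_pointer() */

static int read_value(struct srcu_struct *sp)
{
	struct my_data *p;
	int idx, val = -1;

	idx = srcu_read_lock(sp);	/* ends up in __srcu_read_lock() */
	p = srcu_dereference(my_ptr, sp);
	if (p)
		val = p->value;
	srcu_read_unlock(sp, idx);	/* ends up in __srcu_read_unlock() */
	return val;
}

The read-side critical section may block, which is the point of SRCU, but the index returned by srcu_read_lock() must be passed unchanged to the matching srcu_read_unlock().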
144/* 159/*
145 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 160 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
146 */ 161 */
147void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 162static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
148{ 163{
149 int idx; 164 int idx;
150 165
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 912823e2a11b..70f8d90331e9 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,17 +1,384 @@
1/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. 1/*
2 * GPL v2 and any later version. 2 * kernel/stop_machine.c
3 *
4 * Copyright (C) 2008, 2005 IBM Corporation.
5 * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
6 * Copyright (C) 2010 SUSE Linux Products GmbH
7 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
8 *
9 * This file is released under the GPLv2 and any later version.
3 */ 10 */
11#include <linux/completion.h>
4#include <linux/cpu.h> 12#include <linux/cpu.h>
5#include <linux/err.h> 13#include <linux/init.h>
6#include <linux/kthread.h> 14#include <linux/kthread.h>
7#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/percpu.h>
8#include <linux/sched.h> 17#include <linux/sched.h>
9#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
10#include <linux/syscalls.h>
11#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h>
12 21
13#include <asm/atomic.h> 22#include <asm/atomic.h>
14#include <asm/uaccess.h> 23
24/*
25 * Structure to determine completion condition and record errors. May
26 * be shared by works on different cpus.
27 */
28struct cpu_stop_done {
29 atomic_t nr_todo; /* nr left to execute */
30 bool executed; /* actually executed? */
31 int ret; /* collected return value */
32 struct completion completion; /* fired if nr_todo reaches 0 */
33};
34
35/* the actual stopper, one per every possible cpu, enabled on online cpus */
36struct cpu_stopper {
37 spinlock_t lock;
38 struct list_head works; /* list of pending works */
39 struct task_struct *thread; /* stopper thread */
40 bool enabled; /* is this stopper enabled? */
41};
42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44
45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
46{
47 memset(done, 0, sizeof(*done));
48 atomic_set(&done->nr_todo, nr_todo);
49 init_completion(&done->completion);
50}
51
52/* signal completion unless @done is NULL */
53static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
54{
55 if (done) {
56 if (executed)
57 done->executed = true;
58 if (atomic_dec_and_test(&done->nr_todo))
59 complete(&done->completion);
60 }
61}
62
63/* queue @work to @stopper. if offline, @work is completed immediately */
64static void cpu_stop_queue_work(struct cpu_stopper *stopper,
65 struct cpu_stop_work *work)
66{
67 unsigned long flags;
68
69 spin_lock_irqsave(&stopper->lock, flags);
70
71 if (stopper->enabled) {
72 list_add_tail(&work->list, &stopper->works);
73 wake_up_process(stopper->thread);
74 } else
75 cpu_stop_signal_done(work->done, false);
76
77 spin_unlock_irqrestore(&stopper->lock, flags);
78}
79
80/**
81 * stop_one_cpu - stop a cpu
82 * @cpu: cpu to stop
83 * @fn: function to execute
84 * @arg: argument to @fn
85 *
86 * Execute @fn(@arg) on @cpu. @fn is run in a process context with
87 * the highest priority preempting any task on the cpu and
88 * monopolizing it. This function returns after the execution is
89 * complete.
90 *
91 * This function doesn't guarantee @cpu stays online till @fn
92 * completes. If @cpu goes down in the middle, execution may happen
93 * partially or fully on different cpus. @fn should either be ready
94 * for that or the caller should ensure that @cpu stays online until
95 * this function completes.
96 *
97 * CONTEXT:
98 * Might sleep.
99 *
100 * RETURNS:
101 * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
102 * otherwise, the return value of @fn.
103 */
104int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
105{
106 struct cpu_stop_done done;
107 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
108
109 cpu_stop_init_done(&done, 1);
110 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
111 wait_for_completion(&done.completion);
112 return done.executed ? done.ret : -ENOENT;
113}
114
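As a usage illustration for the interface documented above, a minimal sketch with hypothetical names (drain_local_cache, drain_cpu2) and an arbitrary CPU number:

#include <linux/kernel.h>
#include <linux/stop_machine.h>

/* Runs on the stopper thread of the target CPU, at the highest priority
 * and with preemption disabled around the call, so it must not sleep. */
static int drain_local_cache(void *arg)
{
	unsigned long *counter = arg;

	(*counter)++;
	return 0;
}

static int drain_cpu2(void)
{
	unsigned long drained = 0;
	int ret;

	ret = stop_one_cpu(2, drain_local_cache, &drained);
	if (ret == -ENOENT)
		printk(KERN_WARNING "cpu 2 went offline before the callback ran\n");
	return ret;
}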
115/**
116 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
117 * @cpu: cpu to stop
118 * @fn: function to execute
119 * @arg: argument to @fn
120 *
121 * Similar to stop_one_cpu() but doesn't wait for completion. The
122 * caller is responsible for ensuring @work_buf is currently unused
123 * and will remain untouched until stopper starts executing @fn.
124 *
125 * CONTEXT:
126 * Don't care.
127 */
128void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
129 struct cpu_stop_work *work_buf)
130{
131 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
132 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
133}
134
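The same illustration for the fire-and-forget variant; kick_work and kick_cpu are hypothetical, and the per-CPU buffer exists precisely so the caller can guarantee @work_buf stays untouched until the stopper picks it up:

#include <linux/percpu.h>
#include <linux/stop_machine.h>

static DEFINE_PER_CPU(struct cpu_stop_work, kick_work);

static void kick_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
	/* Returns immediately; completion and the return value of @fn
	 * are not reported back to the caller. */
	stop_one_cpu_nowait(cpu, fn, arg, &per_cpu(kick_work, cpu));
}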
135/* static data for stop_cpus */
136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
140{
141 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu;
144
145 /* initialize works and done */
146 for_each_cpu(cpu, cpumask) {
147 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn;
149 work->arg = arg;
150 work->done = &done;
151 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153
154 /*
155 * Disable preemption while queueing to avoid getting
156 * preempted by a stopper which might wait for other stoppers
157 * to enter @fn which can lead to deadlock.
158 */
159 preempt_disable();
160 for_each_cpu(cpu, cpumask)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable();
164
165 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT;
167}
168
169/**
170 * stop_cpus - stop multiple cpus
171 * @cpumask: cpus to stop
172 * @fn: function to execute
173 * @arg: argument to @fn
174 *
175 * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu,
176 * @fn is run in a process context with the highest priority
177 * preempting any task on the cpu and monopolizing it. This function
178 * returns after all executions are complete.
179 *
180 * This function doesn't guarantee the cpus in @cpumask stay online
181 * till @fn completes. If some cpus go down in the middle, execution
182 * on the cpu may happen partially or fully on different cpus. @fn
183 * should either be ready for that or the caller should ensure that
184 * the cpus stay online until this function completes.
185 *
186 * All stop_cpus() calls are serialized making it safe for @fn to wait
187 * for all cpus to start executing it.
188 *
189 * CONTEXT:
190 * Might sleep.
191 *
192 * RETURNS:
193 * -ENOENT if @fn(@arg) was not executed at all because all cpus in
194 * @cpumask were offline; otherwise, 0 if all executions of @fn
195 * returned 0, any non zero return value if any returned non zero.
196 */
197int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
198{
199 int ret;
200
201 /* static works are used, process one request at a time */
202 mutex_lock(&stop_cpus_mutex);
203 ret = __stop_cpus(cpumask, fn, arg);
204 mutex_unlock(&stop_cpus_mutex);
205 return ret;
206}
207
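A hedged sketch of the multi-CPU case; sync_counters() and sync_all_cpus() are invented names:

#include <linux/cpumask.h>
#include <linux/stop_machine.h>

/* Runs concurrently on every targeted online CPU; must not sleep. */
static int sync_counters(void *arg)
{
	return 0;
}

static int sync_all_cpus(void *arg)
{
	/*
	 * -ENOENT only if every CPU in the mask was offline; otherwise 0,
	 * or a non-zero value if any invocation of sync_counters() failed.
	 */
	return stop_cpus(cpu_online_mask, sync_counters, arg);
}

Because all stop_cpus() calls are serialized, sync_counters() could in principle wait for its peers on the other CPUs without risking a deadlock against another stop_cpus() user.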
208/**
209 * try_stop_cpus - try to stop multiple cpus
210 * @cpumask: cpus to stop
211 * @fn: function to execute
212 * @arg: argument to @fn
213 *
214 * Identical to stop_cpus() except that it fails with -EAGAIN if
215 * someone else is already using the facility.
216 *
217 * CONTEXT:
218 * Might sleep.
219 *
220 * RETURNS:
221 * -EAGAIN if someone else is already stopping cpus, -ENOENT if
222 * @fn(@arg) was not executed at all because all cpus in @cpumask were
223 * offline; otherwise, 0 if all executions of @fn returned 0, any non
224 * zero return value if any returned non zero.
225 */
226int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
227{
228 int ret;
229
230 /* static works are used, process one request at a time */
231 if (!mutex_trylock(&stop_cpus_mutex))
232 return -EAGAIN;
233 ret = __stop_cpus(cpumask, fn, arg);
234 mutex_unlock(&stop_cpus_mutex);
235 return ret;
236}
237
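And a small sketch of the non-blocking variant, reusing the hypothetical sync_counters() above; the only new behaviour is the -EAGAIN path:

#include <linux/delay.h>
#include <linux/stop_machine.h>

static int try_sync_all_cpus(void *arg)
{
	int ret;

	/* Back off while another stop_cpus() user holds the facility. */
	while ((ret = try_stop_cpus(cpu_online_mask, sync_counters, arg)) == -EAGAIN)
		msleep(1);

	return ret;
}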
238static int cpu_stopper_thread(void *data)
239{
240 struct cpu_stopper *stopper = data;
241 struct cpu_stop_work *work;
242 int ret;
243
244repeat:
245 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
246
247 if (kthread_should_stop()) {
248 __set_current_state(TASK_RUNNING);
249 return 0;
250 }
251
252 work = NULL;
253 spin_lock_irq(&stopper->lock);
254 if (!list_empty(&stopper->works)) {
255 work = list_first_entry(&stopper->works,
256 struct cpu_stop_work, list);
257 list_del_init(&work->list);
258 }
259 spin_unlock_irq(&stopper->lock);
260
261 if (work) {
262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN];
266
267 __set_current_state(TASK_RUNNING);
268
269 /* cpu stop callbacks are not allowed to sleep */
270 preempt_disable();
271
272 ret = fn(arg);
273 if (ret)
274 done->ret = ret;
275
276 /* restore preemption and check it's still balanced */
277 preempt_enable();
278 WARN_ONCE(preempt_count(),
279 "cpu_stop: %s(%p) leaked preempt count\n",
280 kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
281 ksym_buf), arg);
282
283 cpu_stop_signal_done(done, true);
284 } else
285 schedule();
286
287 goto repeat;
288}
289
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu)
293{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p;
298
299 switch (action & ~CPU_TASKS_FROZEN) {
300 case CPU_UP_PREPARE:
301 BUG_ON(stopper->thread || stopper->enabled ||
302 !list_empty(&stopper->works));
303 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
304 cpu);
305 if (IS_ERR(p))
306 return NOTIFY_BAD;
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p);
309 stopper->thread = p;
310 break;
311
312 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread);
316 /* mark enabled */
317 spin_lock_irq(&stopper->lock);
318 stopper->enabled = true;
319 spin_unlock_irq(&stopper->lock);
320 break;
321
322#ifdef CONFIG_HOTPLUG_CPU
323 case CPU_UP_CANCELED:
324 case CPU_POST_DEAD:
325 {
326 struct cpu_stop_work *work;
327
328 /* kill the stopper */
329 kthread_stop(stopper->thread);
330 /* drain remaining works */
331 spin_lock_irq(&stopper->lock);
332 list_for_each_entry(work, &stopper->works, list)
333 cpu_stop_signal_done(work->done, false);
334 stopper->enabled = false;
335 spin_unlock_irq(&stopper->lock);
336 /* release the stopper */
337 put_task_struct(stopper->thread);
338 stopper->thread = NULL;
339 break;
340 }
341#endif
342 }
343
344 return NOTIFY_OK;
345}
346
347/*
348 * Give it a higher priority so that cpu stopper is available to other
349 * cpu notifiers. It currently shares the same priority as sched
350 * migration_notifier.
351 */
352static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
353 .notifier_call = cpu_stop_cpu_callback,
354 .priority = 10,
355};
356
357static int __init cpu_stop_init(void)
358{
359 void *bcpu = (void *)(long)smp_processor_id();
360 unsigned int cpu;
361 int err;
362
363 for_each_possible_cpu(cpu) {
364 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
365
366 spin_lock_init(&stopper->lock);
367 INIT_LIST_HEAD(&stopper->works);
368 }
369
370 /* start one for the boot cpu */
371 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
372 bcpu);
373 BUG_ON(err == NOTIFY_BAD);
374 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
375 register_cpu_notifier(&cpu_stop_cpu_notifier);
376
377 return 0;
378}
379early_initcall(cpu_stop_init);
380
381#ifdef CONFIG_STOP_MACHINE
15 382
16/* This controls the threads on each CPU. */ 383/* This controls the threads on each CPU. */
17enum stopmachine_state { 384enum stopmachine_state {
@@ -26,174 +393,94 @@ enum stopmachine_state {
26 /* Exit */ 393 /* Exit */
27 STOPMACHINE_EXIT, 394 STOPMACHINE_EXIT,
28}; 395};
29static enum stopmachine_state state;
30 396
31struct stop_machine_data { 397struct stop_machine_data {
32 int (*fn)(void *); 398 int (*fn)(void *);
33 void *data; 399 void *data;
34 int fnret; 400 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
401 unsigned int num_threads;
402 const struct cpumask *active_cpus;
403
404 enum stopmachine_state state;
405 atomic_t thread_ack;
35}; 406};
36 407
37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ 408static void set_state(struct stop_machine_data *smdata,
38static unsigned int num_threads; 409 enum stopmachine_state newstate)
39static atomic_t thread_ack;
40static DEFINE_MUTEX(lock);
41/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
42static DEFINE_MUTEX(setup_lock);
43/* Users of stop_machine. */
44static int refcount;
45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus;
48static void *stop_machine_work;
49
50static void set_state(enum stopmachine_state newstate)
51{ 410{
52 /* Reset ack counter. */ 411 /* Reset ack counter. */
53 atomic_set(&thread_ack, num_threads); 412 atomic_set(&smdata->thread_ack, smdata->num_threads);
54 smp_wmb(); 413 smp_wmb();
55 state = newstate; 414 smdata->state = newstate;
56} 415}
57 416
58/* Last one to ack a state moves to the next state. */ 417/* Last one to ack a state moves to the next state. */
59static void ack_state(void) 418static void ack_state(struct stop_machine_data *smdata)
60{ 419{
61 if (atomic_dec_and_test(&thread_ack)) 420 if (atomic_dec_and_test(&smdata->thread_ack))
62 set_state(state + 1); 421 set_state(smdata, smdata->state + 1);
63} 422}
64 423
65/* This is the actual function which stops the CPU. It runs 424/* This is the cpu_stop function which stops the CPU. */
66 * in the context of a dedicated stopmachine workqueue. */ 425static int stop_machine_cpu_stop(void *data)
67static void stop_cpu(struct work_struct *unused)
68{ 426{
427 struct stop_machine_data *smdata = data;
69 enum stopmachine_state curstate = STOPMACHINE_NONE; 428 enum stopmachine_state curstate = STOPMACHINE_NONE;
70 struct stop_machine_data *smdata = &idle; 429 int cpu = smp_processor_id(), err = 0;
71 int cpu = smp_processor_id(); 430 bool is_active;
72 int err; 431
432 if (!smdata->active_cpus)
433 is_active = cpu == cpumask_first(cpu_online_mask);
434 else
435 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
73 436
74 if (!active_cpus) {
75 if (cpu == cpumask_first(cpu_online_mask))
76 smdata = &active;
77 } else {
78 if (cpumask_test_cpu(cpu, active_cpus))
79 smdata = &active;
80 }
81 /* Simple state machine */ 437 /* Simple state machine */
82 do { 438 do {
83 /* Chill out and ensure we re-read stopmachine_state. */ 439 /* Chill out and ensure we re-read stopmachine_state. */
84 cpu_relax(); 440 cpu_relax();
85 if (state != curstate) { 441 if (smdata->state != curstate) {
86 curstate = state; 442 curstate = smdata->state;
87 switch (curstate) { 443 switch (curstate) {
88 case STOPMACHINE_DISABLE_IRQ: 444 case STOPMACHINE_DISABLE_IRQ:
89 local_irq_disable(); 445 local_irq_disable();
90 hard_irq_disable(); 446 hard_irq_disable();
91 break; 447 break;
92 case STOPMACHINE_RUN: 448 case STOPMACHINE_RUN:
93 /* On multiple CPUs only a single error code 449 if (is_active)
94 * is needed to tell that something failed. */ 450 err = smdata->fn(smdata->data);
95 err = smdata->fn(smdata->data);
96 if (err)
97 smdata->fnret = err;
98 break; 451 break;
99 default: 452 default:
100 break; 453 break;
101 } 454 }
102 ack_state(); 455 ack_state(smdata);
103 } 456 }
104 } while (curstate != STOPMACHINE_EXIT); 457 } while (curstate != STOPMACHINE_EXIT);
105 458
106 local_irq_enable(); 459 local_irq_enable();
460 return err;
107} 461}
108 462
109/* Callback for CPUs which aren't supposed to do anything. */
110static int chill(void *unused)
111{
112 return 0;
113}
114
115int stop_machine_create(void)
116{
117 mutex_lock(&setup_lock);
118 if (refcount)
119 goto done;
120 stop_machine_wq = create_rt_workqueue("kstop");
121 if (!stop_machine_wq)
122 goto err_out;
123 stop_machine_work = alloc_percpu(struct work_struct);
124 if (!stop_machine_work)
125 goto err_out;
126done:
127 refcount++;
128 mutex_unlock(&setup_lock);
129 return 0;
130
131err_out:
132 if (stop_machine_wq)
133 destroy_workqueue(stop_machine_wq);
134 mutex_unlock(&setup_lock);
135 return -ENOMEM;
136}
137EXPORT_SYMBOL_GPL(stop_machine_create);
138
139void stop_machine_destroy(void)
140{
141 mutex_lock(&setup_lock);
142 refcount--;
143 if (refcount)
144 goto done;
145 destroy_workqueue(stop_machine_wq);
146 free_percpu(stop_machine_work);
147done:
148 mutex_unlock(&setup_lock);
149}
150EXPORT_SYMBOL_GPL(stop_machine_destroy);
151
152int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 463int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
153{ 464{
154 struct work_struct *sm_work; 465 struct stop_machine_data smdata = { .fn = fn, .data = data,
155 int i, ret; 466 .num_threads = num_online_cpus(),
156 467 .active_cpus = cpus };
157 /* Set up initial state. */ 468
158 mutex_lock(&lock); 469 /* Set the initial state and stop all online cpus. */
159 num_threads = num_online_cpus(); 470 set_state(&smdata, STOPMACHINE_PREPARE);
160 active_cpus = cpus; 471 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
161 active.fn = fn;
162 active.data = data;
163 active.fnret = 0;
164 idle.fn = chill;
165 idle.data = NULL;
166
167 set_state(STOPMACHINE_PREPARE);
168
169 /* Schedule the stop_cpu work on all cpus: hold this CPU so one
170 * doesn't hit this CPU until we're ready. */
171 get_cpu();
172 for_each_online_cpu(i) {
173 sm_work = per_cpu_ptr(stop_machine_work, i);
174 INIT_WORK(sm_work, stop_cpu);
175 queue_work_on(i, stop_machine_wq, sm_work);
176 }
177 /* This will release the thread on our CPU. */
178 put_cpu();
179 flush_workqueue(stop_machine_wq);
180 ret = active.fnret;
181 mutex_unlock(&lock);
182 return ret;
183} 472}
184 473
185int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 474int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
186{ 475{
187 int ret; 476 int ret;
188 477
189 ret = stop_machine_create();
190 if (ret)
191 return ret;
192 /* No CPUs can come up or down during this. */ 478 /* No CPUs can come up or down during this. */
193 get_online_cpus(); 479 get_online_cpus();
194 ret = __stop_machine(fn, data, cpus); 480 ret = __stop_machine(fn, data, cpus);
195 put_online_cpus(); 481 put_online_cpus();
196 stop_machine_destroy();
197 return ret; 482 return ret;
198} 483}
199EXPORT_SYMBOL_GPL(stop_machine); 484EXPORT_SYMBOL_GPL(stop_machine);
485
486#endif /* CONFIG_STOP_MACHINE */
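For completeness, a sketch of how a caller would use the reworked stop_machine(); apply_patch() and do_patch() are hypothetical:

#include <linux/stop_machine.h>

/* Runs with interrupts hard-disabled while every other online CPU spins
 * in stop_machine_cpu_stop(); must not sleep or take sleeping locks. */
static int apply_patch(void *arg)
{
	return 0;
}

static int do_patch(void *arg)
{
	/*
	 * With a NULL cpumask only the first online CPU executes
	 * apply_patch(); the remaining CPUs just hold still until the
	 * state machine reaches STOPMACHINE_EXIT.
	 */
	return stop_machine(apply_patch, arg, NULL);
}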
diff --git a/kernel/sys.c b/kernel/sys.c
index 26a6b73a6b85..e83ddbbaf89d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,8 +33,10 @@
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/personality.h>
36#include <linux/ptrace.h> 37#include <linux/ptrace.h>
37#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/gfp.h>
38 40
39#include <linux/compat.h> 41#include <linux/compat.h>
40#include <linux/syscalls.h> 42#include <linux/syscalls.h>
@@ -222,6 +224,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
222 if (which > PRIO_USER || which < PRIO_PROCESS) 224 if (which > PRIO_USER || which < PRIO_PROCESS)
223 return -EINVAL; 225 return -EINVAL;
224 226
227 rcu_read_lock();
225 read_lock(&tasklist_lock); 228 read_lock(&tasklist_lock);
226 switch (which) { 229 switch (which) {
227 case PRIO_PROCESS: 230 case PRIO_PROCESS:
@@ -267,6 +270,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
267 } 270 }
268out_unlock: 271out_unlock:
269 read_unlock(&tasklist_lock); 272 read_unlock(&tasklist_lock);
273 rcu_read_unlock();
270 274
271 return retval; 275 return retval;
272} 276}
@@ -488,10 +492,6 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
488 return -ENOMEM; 492 return -ENOMEM;
489 old = current_cred(); 493 old = current_cred();
490 494
491 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
492 if (retval)
493 goto error;
494
495 retval = -EPERM; 495 retval = -EPERM;
496 if (rgid != (gid_t) -1) { 496 if (rgid != (gid_t) -1) {
497 if (old->gid == rgid || 497 if (old->gid == rgid ||
@@ -539,10 +539,6 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
539 return -ENOMEM; 539 return -ENOMEM;
540 old = current_cred(); 540 old = current_cred();
541 541
542 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
543 if (retval)
544 goto error;
545
546 retval = -EPERM; 542 retval = -EPERM;
547 if (capable(CAP_SETGID)) 543 if (capable(CAP_SETGID))
548 new->gid = new->egid = new->sgid = new->fsgid = gid; 544 new->gid = new->egid = new->sgid = new->fsgid = gid;
@@ -569,13 +565,7 @@ static int set_user(struct cred *new)
569 if (!new_user) 565 if (!new_user)
570 return -EAGAIN; 566 return -EAGAIN;
571 567
572 if (!task_can_switch_user(new_user, current)) { 568 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
573 free_uid(new_user);
574 return -EINVAL;
575 }
576
577 if (atomic_read(&new_user->processes) >=
578 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
579 new_user != INIT_USER) { 569 new_user != INIT_USER) {
580 free_uid(new_user); 570 free_uid(new_user);
581 return -EAGAIN; 571 return -EAGAIN;
@@ -612,10 +602,6 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
612 return -ENOMEM; 602 return -ENOMEM;
613 old = current_cred(); 603 old = current_cred();
614 604
615 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
616 if (retval)
617 goto error;
618
619 retval = -EPERM; 605 retval = -EPERM;
620 if (ruid != (uid_t) -1) { 606 if (ruid != (uid_t) -1) {
621 new->uid = ruid; 607 new->uid = ruid;
@@ -677,10 +663,6 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
677 return -ENOMEM; 663 return -ENOMEM;
678 old = current_cred(); 664 old = current_cred();
679 665
680 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
681 if (retval)
682 goto error;
683
684 retval = -EPERM; 666 retval = -EPERM;
685 if (capable(CAP_SETUID)) { 667 if (capable(CAP_SETUID)) {
686 new->suid = new->uid = uid; 668 new->suid = new->uid = uid;
@@ -721,9 +703,6 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
721 if (!new) 703 if (!new)
722 return -ENOMEM; 704 return -ENOMEM;
723 705
724 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
725 if (retval)
726 goto error;
727 old = current_cred(); 706 old = current_cred();
728 707
729 retval = -EPERM; 708 retval = -EPERM;
@@ -790,10 +769,6 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
790 return -ENOMEM; 769 return -ENOMEM;
791 old = current_cred(); 770 old = current_cred();
792 771
793 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
794 if (retval)
795 goto error;
796
797 retval = -EPERM; 772 retval = -EPERM;
798 if (!capable(CAP_SETGID)) { 773 if (!capable(CAP_SETGID)) {
799 if (rgid != (gid_t) -1 && rgid != old->gid && 774 if (rgid != (gid_t) -1 && rgid != old->gid &&
@@ -853,9 +828,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
853 old = current_cred(); 828 old = current_cred();
854 old_fsuid = old->fsuid; 829 old_fsuid = old->fsuid;
855 830
856 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
857 goto error;
858
859 if (uid == old->uid || uid == old->euid || 831 if (uid == old->uid || uid == old->euid ||
860 uid == old->suid || uid == old->fsuid || 832 uid == old->suid || uid == old->fsuid ||
861 capable(CAP_SETUID)) { 833 capable(CAP_SETUID)) {
@@ -866,7 +838,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
866 } 838 }
867 } 839 }
868 840
869error:
870 abort_creds(new); 841 abort_creds(new);
871 return old_fsuid; 842 return old_fsuid;
872 843
@@ -890,9 +861,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
890 old = current_cred(); 861 old = current_cred();
891 old_fsgid = old->fsgid; 862 old_fsgid = old->fsgid;
892 863
893 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
894 goto error;
895
896 if (gid == old->gid || gid == old->egid || 864 if (gid == old->gid || gid == old->egid ||
897 gid == old->sgid || gid == old->fsgid || 865 gid == old->sgid || gid == old->fsgid ||
898 capable(CAP_SETGID)) { 866 capable(CAP_SETGID)) {
@@ -902,7 +870,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
902 } 870 }
903 } 871 }
904 872
905error:
906 abort_creds(new); 873 abort_creds(new);
907 return old_fsgid; 874 return old_fsgid;
908 875
@@ -1118,6 +1085,15 @@ out:
1118 1085
1119DECLARE_RWSEM(uts_sem); 1086DECLARE_RWSEM(uts_sem);
1120 1087
1088#ifdef COMPAT_UTS_MACHINE
1089#define override_architecture(name) \
1090 (personality(current->personality) == PER_LINUX32 && \
1091 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
1092 sizeof(COMPAT_UTS_MACHINE)))
1093#else
1094#define override_architecture(name) 0
1095#endif
1096
1121SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1097SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1122{ 1098{
1123 int errno = 0; 1099 int errno = 0;
@@ -1126,9 +1102,66 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1126 if (copy_to_user(name, utsname(), sizeof *name)) 1102 if (copy_to_user(name, utsname(), sizeof *name))
1127 errno = -EFAULT; 1103 errno = -EFAULT;
1128 up_read(&uts_sem); 1104 up_read(&uts_sem);
1105
1106 if (!errno && override_architecture(name))
1107 errno = -EFAULT;
1129 return errno; 1108 return errno;
1130} 1109}
1131 1110
1111#ifdef __ARCH_WANT_SYS_OLD_UNAME
1112/*
1113 * Old cruft
1114 */
1115SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
1116{
1117 int error = 0;
1118
1119 if (!name)
1120 return -EFAULT;
1121
1122 down_read(&uts_sem);
1123 if (copy_to_user(name, utsname(), sizeof(*name)))
1124 error = -EFAULT;
1125 up_read(&uts_sem);
1126
1127 if (!error && override_architecture(name))
1128 error = -EFAULT;
1129 return error;
1130}
1131
1132SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
1133{
1134 int error;
1135
1136 if (!name)
1137 return -EFAULT;
1138 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
1139 return -EFAULT;
1140
1141 down_read(&uts_sem);
1142 error = __copy_to_user(&name->sysname, &utsname()->sysname,
1143 __OLD_UTS_LEN);
1144 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
1145 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
1146 __OLD_UTS_LEN);
1147 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
1148 error |= __copy_to_user(&name->release, &utsname()->release,
1149 __OLD_UTS_LEN);
1150 error |= __put_user(0, name->release + __OLD_UTS_LEN);
1151 error |= __copy_to_user(&name->version, &utsname()->version,
1152 __OLD_UTS_LEN);
1153 error |= __put_user(0, name->version + __OLD_UTS_LEN);
1154 error |= __copy_to_user(&name->machine, &utsname()->machine,
1155 __OLD_UTS_LEN);
1156 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
1157 up_read(&uts_sem);
1158
1159 if (!error && override_architecture(name))
1160 error = -EFAULT;
1161 return error ? -EFAULT : 0;
1162}
1163#endif
1164
1132SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) 1165SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1133{ 1166{
1134 int errno; 1167 int errno;
@@ -1599,9 +1632,9 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
1599 1632
1600char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; 1633char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
1601 1634
1602static void argv_cleanup(char **argv, char **envp) 1635static void argv_cleanup(struct subprocess_info *info)
1603{ 1636{
1604 argv_free(argv); 1637 argv_free(info->argv);
1605} 1638}
1606 1639
1607/** 1640/**
@@ -1635,7 +1668,7 @@ int orderly_poweroff(bool force)
1635 goto out; 1668 goto out;
1636 } 1669 }
1637 1670
1638 call_usermodehelper_setcleanup(info, argv_cleanup); 1671 call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL);
1639 1672
1640 ret = call_usermodehelper_exec(info, UMH_NO_WAIT); 1673 ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
1641 1674
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 695384f12a7d..70f2ea758ffe 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -126,6 +126,7 @@ cond_syscall(sys_setreuid16);
126cond_syscall(sys_setuid16); 126cond_syscall(sys_setuid16);
127cond_syscall(sys_vm86old); 127cond_syscall(sys_vm86old);
128cond_syscall(sys_vm86); 128cond_syscall(sys_vm86);
129cond_syscall(sys_ipc);
129cond_syscall(compat_sys_ipc); 130cond_syscall(compat_sys_ipc);
130cond_syscall(compat_sys_sysctl); 131cond_syscall(compat_sys_sysctl);
131cond_syscall(sys_flock); 132cond_syscall(sys_flock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8a68b2448468..d24f761f4876 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h>
26#include <linux/proc_fs.h> 27#include <linux/proc_fs.h>
27#include <linux/security.h> 28#include <linux/security.h>
28#include <linux/ctype.h> 29#include <linux/ctype.h>
@@ -36,6 +37,7 @@
36#include <linux/highuid.h> 37#include <linux/highuid.h>
37#include <linux/writeback.h> 38#include <linux/writeback.h>
38#include <linux/ratelimit.h> 39#include <linux/ratelimit.h>
40#include <linux/compaction.h>
39#include <linux/hugetlb.h> 41#include <linux/hugetlb.h>
40#include <linux/initrd.h> 42#include <linux/initrd.h>
41#include <linux/key.h> 43#include <linux/key.h>
@@ -50,6 +52,8 @@
50#include <linux/ftrace.h> 52#include <linux/ftrace.h>
51#include <linux/slow-work.h> 53#include <linux/slow-work.h>
52#include <linux/perf_event.h> 54#include <linux/perf_event.h>
55#include <linux/kprobes.h>
56#include <linux/pipe_fs_i.h>
53 57
54#include <asm/uaccess.h> 58#include <asm/uaccess.h>
55#include <asm/processor.h> 59#include <asm/processor.h>
@@ -59,13 +63,23 @@
59#include <asm/stacktrace.h> 63#include <asm/stacktrace.h>
60#include <asm/io.h> 64#include <asm/io.h>
61#endif 65#endif
66#ifdef CONFIG_BSD_PROCESS_ACCT
67#include <linux/acct.h>
68#endif
69#ifdef CONFIG_RT_MUTEXES
70#include <linux/rtmutex.h>
71#endif
72#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
73#include <linux/lockdep.h>
74#endif
75#ifdef CONFIG_CHR_DEV_SG
76#include <scsi/sg.h>
77#endif
62 78
63 79
64#if defined(CONFIG_SYSCTL) 80#if defined(CONFIG_SYSCTL)
65 81
66/* External variables not in a header file. */ 82/* External variables not in a header file. */
67extern int C_A_D;
68extern int print_fatal_signals;
69extern int sysctl_overcommit_memory; 83extern int sysctl_overcommit_memory;
70extern int sysctl_overcommit_ratio; 84extern int sysctl_overcommit_ratio;
71extern int sysctl_panic_on_oom; 85extern int sysctl_panic_on_oom;
@@ -87,9 +101,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
87#ifndef CONFIG_MMU 101#ifndef CONFIG_MMU
88extern int sysctl_nr_trim_pages; 102extern int sysctl_nr_trim_pages;
89#endif 103#endif
90#ifdef CONFIG_RCU_TORTURE_TEST
91extern int rcutorture_runnable;
92#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
93#ifdef CONFIG_BLOCK 104#ifdef CONFIG_BLOCK
94extern int blk_iopoll_enabled; 105extern int blk_iopoll_enabled;
95#endif 106#endif
@@ -119,14 +130,6 @@ static int min_percpu_pagelist_fract = 8;
119 130
120static int ngroups_max = NGROUPS_MAX; 131static int ngroups_max = NGROUPS_MAX;
121 132
122#ifdef CONFIG_MODULES
123extern char modprobe_path[];
124extern int modules_disabled;
125#endif
126#ifdef CONFIG_CHR_DEV_SG
127extern int sg_big_buff;
128#endif
129
130#ifdef CONFIG_SPARC 133#ifdef CONFIG_SPARC
131#include <asm/system.h> 134#include <asm/system.h>
132#endif 135#endif
@@ -148,10 +151,6 @@ extern int sysctl_userprocess_debug;
148extern int spin_retry; 151extern int spin_retry;
149#endif 152#endif
150 153
151#ifdef CONFIG_BSD_PROCESS_ACCT
152extern int acct_parm[];
153#endif
154
155#ifdef CONFIG_IA64 154#ifdef CONFIG_IA64
156extern int no_unaligned_warning; 155extern int no_unaligned_warning;
157extern int unaligned_dump_stack; 156extern int unaligned_dump_stack;
@@ -159,10 +158,6 @@ extern int unaligned_dump_stack;
159 158
160extern struct ratelimit_state printk_ratelimit_state; 159extern struct ratelimit_state printk_ratelimit_state;
161 160
162#ifdef CONFIG_RT_MUTEXES
163extern int max_lock_depth;
164#endif
165
166#ifdef CONFIG_PROC_SYSCTL 161#ifdef CONFIG_PROC_SYSCTL
167static int proc_do_cad_pid(struct ctl_table *table, int write, 162static int proc_do_cad_pid(struct ctl_table *table, int write,
168 void __user *buffer, size_t *lenp, loff_t *ppos); 163 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -170,6 +165,27 @@ static int proc_taint(struct ctl_table *table, int write,
170 void __user *buffer, size_t *lenp, loff_t *ppos); 165 void __user *buffer, size_t *lenp, loff_t *ppos);
171#endif 166#endif
172 167
168#ifdef CONFIG_MAGIC_SYSRQ
 169static int __sysrq_enabled; /* Note: sysrq code uses its own private copy */
170
171static int sysrq_sysctl_handler(ctl_table *table, int write,
172 void __user *buffer, size_t *lenp,
173 loff_t *ppos)
174{
175 int error;
176
177 error = proc_dointvec(table, write, buffer, lenp, ppos);
178 if (error)
179 return error;
180
181 if (write)
182 sysrq_toggle_support(__sysrq_enabled);
183
184 return 0;
185}
186
187#endif
188
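The sysrq handler above follows a reusable pattern: let proc_dointvec() parse the value, then push it into the subsystem only on writes. A hedged sketch of the same shape for an invented knob (my_knob and my_subsystem_update() are hypothetical):

static int my_knob;

static int my_knob_sysctl_handler(ctl_table *table, int write,
				  void __user *buffer, size_t *lenp,
				  loff_t *ppos)
{
	int error;

	error = proc_dointvec(table, write, buffer, lenp, ppos);
	if (error)
		return error;

	if (write)
		my_subsystem_update(my_knob);	/* hypothetical notification hook */

	return 0;
}

The corresponding table entry would point .data at my_knob and .proc_handler at my_knob_sysctl_handler, exactly as the sysrq entry is rewired later in this patch.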
173static struct ctl_table root_table[]; 189static struct ctl_table root_table[];
174static struct ctl_table_root sysctl_table_root; 190static struct ctl_table_root sysctl_table_root;
175static struct ctl_table_header root_table_header = { 191static struct ctl_table_header root_table_header = {
@@ -201,9 +217,6 @@ extern struct ctl_table epoll_table[];
201int sysctl_legacy_va_layout; 217int sysctl_legacy_va_layout;
202#endif 218#endif
203 219
204extern int prove_locking;
205extern int lock_stat;
206
207/* The default sysctl tables: */ 220/* The default sysctl tables: */
208 221
209static struct ctl_table root_table[] = { 222static struct ctl_table root_table[] = {
@@ -250,6 +263,11 @@ static int min_sched_shares_ratelimit = 100000; /* 100 usec */
250static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ 263static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
251#endif 264#endif
252 265
266#ifdef CONFIG_COMPACTION
267static int min_extfrag_threshold;
268static int max_extfrag_threshold = 1000;
269#endif
270
253static struct ctl_table kern_table[] = { 271static struct ctl_table kern_table[] = {
254 { 272 {
255 .procname = "sched_child_runs_first", 273 .procname = "sched_child_runs_first",
@@ -577,7 +595,7 @@ static struct ctl_table kern_table[] = {
577 .data = &__sysrq_enabled, 595 .data = &__sysrq_enabled,
578 .maxlen = sizeof (int), 596 .maxlen = sizeof (int),
579 .mode = 0644, 597 .mode = 0644,
580 .proc_handler = proc_dointvec, 598 .proc_handler = sysrq_sysctl_handler,
581 }, 599 },
582#endif 600#endif
583#ifdef CONFIG_PROC_SYSCTL 601#ifdef CONFIG_PROC_SYSCTL
@@ -631,7 +649,7 @@ static struct ctl_table kern_table[] = {
631#endif 649#endif
632 { 650 {
633 .procname = "userprocess_debug", 651 .procname = "userprocess_debug",
634 .data = &sysctl_userprocess_debug, 652 .data = &show_unhandled_signals,
635 .maxlen = sizeof(int), 653 .maxlen = sizeof(int),
636 .mode = 0644, 654 .mode = 0644,
637 .proc_handler = proc_dointvec, 655 .proc_handler = proc_dointvec,
@@ -1109,6 +1127,25 @@ static struct ctl_table vm_table[] = {
1109 .mode = 0644, 1127 .mode = 0644,
1110 .proc_handler = drop_caches_sysctl_handler, 1128 .proc_handler = drop_caches_sysctl_handler,
1111 }, 1129 },
1130#ifdef CONFIG_COMPACTION
1131 {
1132 .procname = "compact_memory",
1133 .data = &sysctl_compact_memory,
1134 .maxlen = sizeof(int),
1135 .mode = 0200,
1136 .proc_handler = sysctl_compaction_handler,
1137 },
1138 {
1139 .procname = "extfrag_threshold",
1140 .data = &sysctl_extfrag_threshold,
1141 .maxlen = sizeof(int),
1142 .mode = 0644,
1143 .proc_handler = sysctl_extfrag_handler,
1144 .extra1 = &min_extfrag_threshold,
1145 .extra2 = &max_extfrag_threshold,
1146 },
1147
1148#endif /* CONFIG_COMPACTION */
1112 { 1149 {
1113 .procname = "min_free_kbytes", 1150 .procname = "min_free_kbytes",
1114 .data = &min_free_kbytes, 1151 .data = &min_free_kbytes,
@@ -1433,6 +1470,14 @@ static struct ctl_table fs_table[] = {
1433 .child = binfmt_misc_table, 1470 .child = binfmt_misc_table,
1434 }, 1471 },
1435#endif 1472#endif
1473 {
1474 .procname = "pipe-max-size",
1475 .data = &pipe_max_size,
1476 .maxlen = sizeof(int),
1477 .mode = 0644,
1478 .proc_handler = &pipe_proc_fn,
1479 .extra1 = &pipe_min_size,
1480 },
1436/* 1481/*
1437 * NOTE: do not add new entries to this table unless you have read 1482 * NOTE: do not add new entries to this table unless you have read
1438 * Documentation/sysctl/ctl_unnumbered.txt 1483 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1441,7 +1486,8 @@ static struct ctl_table fs_table[] = {
1441}; 1486};
1442 1487
1443static struct ctl_table debug_table[] = { 1488static struct ctl_table debug_table[] = {
1444#if defined(CONFIG_X86) || defined(CONFIG_PPC) 1489#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
1490 defined(CONFIG_S390)
1445 { 1491 {
1446 .procname = "exception-trace", 1492 .procname = "exception-trace",
1447 .data = &show_unhandled_signals, 1493 .data = &show_unhandled_signals,
@@ -1450,6 +1496,17 @@ static struct ctl_table debug_table[] = {
1450 .proc_handler = proc_dointvec 1496 .proc_handler = proc_dointvec
1451 }, 1497 },
1452#endif 1498#endif
1499#if defined(CONFIG_OPTPROBES)
1500 {
1501 .procname = "kprobes-optimization",
1502 .data = &sysctl_kprobes_optimization,
1503 .maxlen = sizeof(int),
1504 .mode = 0644,
1505 .proc_handler = proc_kprobes_optimization_handler,
1506 .extra1 = &zero,
1507 .extra2 = &one,
1508 },
1509#endif
1453 { } 1510 { }
1454}; 1511};
1455 1512
@@ -2039,8 +2096,132 @@ int proc_dostring(struct ctl_table *table, int write,
2039 buffer, lenp, ppos); 2096 buffer, lenp, ppos);
2040} 2097}
2041 2098
2099static size_t proc_skip_spaces(char **buf)
2100{
2101 size_t ret;
2102 char *tmp = skip_spaces(*buf);
2103 ret = tmp - *buf;
2104 *buf = tmp;
2105 return ret;
2106}
2107
2108static void proc_skip_char(char **buf, size_t *size, const char v)
2109{
2110 while (*size) {
2111 if (**buf != v)
2112 break;
2113 (*size)--;
2114 (*buf)++;
2115 }
2116}
2117
2118#define TMPBUFLEN 22
2119/**
2120 * proc_get_long - reads an ASCII formatted integer from a user buffer
2121 *
2122 * @buf: a kernel buffer
2123 * @size: size of the kernel buffer
2124 * @val: this is where the number will be stored
2125 * @neg: set to %TRUE if number is negative
2126 * @perm_tr: a vector which contains the allowed trailers
2127 * @perm_tr_len: size of the perm_tr vector
2128 * @tr: pointer to store the trailer character
2129 *
2130 * In case of success %0 is returned and @buf and @size are updated with
2131 * the amount of bytes read. If @tr is non-NULL and a trailing
2132 * character exists (size is non-zero after returning from this
2133 * function), @tr is updated with the trailing character.
2134 */
2135static int proc_get_long(char **buf, size_t *size,
2136 unsigned long *val, bool *neg,
2137 const char *perm_tr, unsigned perm_tr_len, char *tr)
2138{
2139 int len;
2140 char *p, tmp[TMPBUFLEN];
2141
2142 if (!*size)
2143 return -EINVAL;
2144
2145 len = *size;
2146 if (len > TMPBUFLEN - 1)
2147 len = TMPBUFLEN - 1;
2148
2149 memcpy(tmp, *buf, len);
2150
2151 tmp[len] = 0;
2152 p = tmp;
2153 if (*p == '-' && *size > 1) {
2154 *neg = true;
2155 p++;
2156 } else
2157 *neg = false;
2158 if (!isdigit(*p))
2159 return -EINVAL;
2160
2161 *val = simple_strtoul(p, &p, 0);
2162
2163 len = p - tmp;
2164
 2165 /* We don't know if the next char is whitespace, thus we may accept
 2166 * invalid integers (e.g. 1234...a) or two integers instead of one
 2167 * (e.g. 123...1). So let's not allow such large numbers. */
2168 if (len == TMPBUFLEN - 1)
2169 return -EINVAL;
2170
2171 if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
2172 return -EINVAL;
2173
2174 if (tr && (len < *size))
2175 *tr = *p;
2176
2177 *buf += len;
2178 *size -= len;
2179
2180 return 0;
2181}
2182
2183/**
2184 * proc_put_long - converts an integer to a decimal ASCII formatted string
2185 *
2186 * @buf: the user buffer
2187 * @size: the size of the user buffer
2188 * @val: the integer to be converted
2189 * @neg: sign of the number, %TRUE for negative
2190 *
2191 * In case of success %0 is returned and @buf and @size are updated with
2192 * the amount of bytes written.
2193 */
2194static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
2195 bool neg)
2196{
2197 int len;
2198 char tmp[TMPBUFLEN], *p = tmp;
2199
2200 sprintf(p, "%s%lu", neg ? "-" : "", val);
2201 len = strlen(tmp);
2202 if (len > *size)
2203 len = *size;
2204 if (copy_to_user(*buf, tmp, len))
2205 return -EFAULT;
2206 *size -= len;
2207 *buf += len;
2208 return 0;
2209}
2210#undef TMPBUFLEN
2211
2212static int proc_put_char(void __user **buf, size_t *size, char c)
2213{
2214 if (*size) {
2215 char __user **buffer = (char __user **)buf;
2216 if (put_user(c, *buffer))
2217 return -EFAULT;
2218 (*size)--, (*buffer)++;
2219 *buf = *buffer;
2220 }
2221 return 0;
2222}
2042 2223
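These helpers are static to kernel/sysctl.c, so the following is illustrative only; it mirrors the write-path shape used by the rewritten __do_proc_dointvec() below (parse_one() is an invented name):

static int parse_one(char *kbuf, size_t left, int *valp)
{
	unsigned long lval;
	bool neg;
	int err;

	left -= proc_skip_spaces(&kbuf);
	if (!left)
		return -EINVAL;

	err = proc_get_long(&kbuf, &left, &lval, &neg,
			    proc_wspace_sep, sizeof(proc_wspace_sep), NULL);
	if (err)
		return err;

	*valp = neg ? -(int)lval : (int)lval;
	return 0;
}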
2043static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, 2224static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
2044 int *valp, 2225 int *valp,
2045 int write, void *data) 2226 int write, void *data)
2046{ 2227{
@@ -2049,33 +2230,31 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
2049 } else { 2230 } else {
2050 int val = *valp; 2231 int val = *valp;
2051 if (val < 0) { 2232 if (val < 0) {
2052 *negp = -1; 2233 *negp = true;
2053 *lvalp = (unsigned long)-val; 2234 *lvalp = (unsigned long)-val;
2054 } else { 2235 } else {
2055 *negp = 0; 2236 *negp = false;
2056 *lvalp = (unsigned long)val; 2237 *lvalp = (unsigned long)val;
2057 } 2238 }
2058 } 2239 }
2059 return 0; 2240 return 0;
2060} 2241}
2061 2242
2243static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
2244
2062static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, 2245static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2063 int write, void __user *buffer, 2246 int write, void __user *buffer,
2064 size_t *lenp, loff_t *ppos, 2247 size_t *lenp, loff_t *ppos,
2065 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2248 int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
2066 int write, void *data), 2249 int write, void *data),
2067 void *data) 2250 void *data)
2068{ 2251{
2069#define TMPBUFLEN 21 2252 int *i, vleft, first = 1, err = 0;
2070 int *i, vleft, first = 1, neg; 2253 unsigned long page = 0;
2071 unsigned long lval; 2254 size_t left;
2072 size_t left, len; 2255 char *kbuf;
2073
2074 char buf[TMPBUFLEN], *p;
2075 char __user *s = buffer;
2076 2256
2077 if (!tbl_data || !table->maxlen || !*lenp || 2257 if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
2078 (*ppos && !write)) {
2079 *lenp = 0; 2258 *lenp = 0;
2080 return 0; 2259 return 0;
2081 } 2260 }
@@ -2087,89 +2266,71 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2087 if (!conv) 2266 if (!conv)
2088 conv = do_proc_dointvec_conv; 2267 conv = do_proc_dointvec_conv;
2089 2268
2269 if (write) {
2270 if (left > PAGE_SIZE - 1)
2271 left = PAGE_SIZE - 1;
2272 page = __get_free_page(GFP_TEMPORARY);
2273 kbuf = (char *) page;
2274 if (!kbuf)
2275 return -ENOMEM;
2276 if (copy_from_user(kbuf, buffer, left)) {
2277 err = -EFAULT;
2278 goto free;
2279 }
2280 kbuf[left] = 0;
2281 }
2282
2090 for (; left && vleft--; i++, first=0) { 2283 for (; left && vleft--; i++, first=0) {
2284 unsigned long lval;
2285 bool neg;
2286
2091 if (write) { 2287 if (write) {
2092 while (left) { 2288 left -= proc_skip_spaces(&kbuf);
2093 char c; 2289
2094 if (get_user(c, s))
2095 return -EFAULT;
2096 if (!isspace(c))
2097 break;
2098 left--;
2099 s++;
2100 }
2101 if (!left) 2290 if (!left)
2102 break; 2291 break;
2103 neg = 0; 2292 err = proc_get_long(&kbuf, &left, &lval, &neg,
2104 len = left; 2293 proc_wspace_sep,
2105 if (len > sizeof(buf) - 1) 2294 sizeof(proc_wspace_sep), NULL);
2106 len = sizeof(buf) - 1; 2295 if (err)
2107 if (copy_from_user(buf, s, len))
2108 return -EFAULT;
2109 buf[len] = 0;
2110 p = buf;
2111 if (*p == '-' && left > 1) {
2112 neg = 1;
2113 p++;
2114 }
2115 if (*p < '0' || *p > '9')
2116 break;
2117
2118 lval = simple_strtoul(p, &p, 0);
2119
2120 len = p-buf;
2121 if ((len < left) && *p && !isspace(*p))
2122 break; 2296 break;
2123 s += len; 2297 if (conv(&neg, &lval, i, 1, data)) {
2124 left -= len; 2298 err = -EINVAL;
2125
2126 if (conv(&neg, &lval, i, 1, data))
2127 break; 2299 break;
2300 }
2128 } else { 2301 } else {
2129 p = buf; 2302 if (conv(&neg, &lval, i, 0, data)) {
2303 err = -EINVAL;
2304 break;
2305 }
2130 if (!first) 2306 if (!first)
2131 *p++ = '\t'; 2307 err = proc_put_char(&buffer, &left, '\t');
2132 2308 if (err)
2133 if (conv(&neg, &lval, i, 0, data)) 2309 break;
2310 err = proc_put_long(&buffer, &left, lval, neg);
2311 if (err)
2134 break; 2312 break;
2135
2136 sprintf(p, "%s%lu", neg ? "-" : "", lval);
2137 len = strlen(buf);
2138 if (len > left)
2139 len = left;
2140 if(copy_to_user(s, buf, len))
2141 return -EFAULT;
2142 left -= len;
2143 s += len;
2144 } 2313 }
2145 } 2314 }
2146 2315
2147 if (!write && !first && left) { 2316 if (!write && !first && left && !err)
2148 if(put_user('\n', s)) 2317 err = proc_put_char(&buffer, &left, '\n');
2149 return -EFAULT; 2318 if (write && !err && left)
2150 left--, s++; 2319 left -= proc_skip_spaces(&kbuf);
2151 } 2320free:
2152 if (write) { 2321 if (write) {
2153 while (left) { 2322 free_page(page);
2154 char c; 2323 if (first)
2155 if (get_user(c, s++)) 2324 return err ? : -EINVAL;
2156 return -EFAULT;
2157 if (!isspace(c))
2158 break;
2159 left--;
2160 }
2161 } 2325 }
2162 if (write && first)
2163 return -EINVAL;
2164 *lenp -= left; 2326 *lenp -= left;
2165 *ppos += *lenp; 2327 *ppos += *lenp;
2166 return 0; 2328 return err;
2167#undef TMPBUFLEN
2168} 2329}
2169 2330
2170static int do_proc_dointvec(struct ctl_table *table, int write, 2331static int do_proc_dointvec(struct ctl_table *table, int write,
2171 void __user *buffer, size_t *lenp, loff_t *ppos, 2332 void __user *buffer, size_t *lenp, loff_t *ppos,
2172 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2333 int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
2173 int write, void *data), 2334 int write, void *data),
2174 void *data) 2335 void *data)
2175{ 2336{
@@ -2237,8 +2398,8 @@ struct do_proc_dointvec_minmax_conv_param {
2237 int *max; 2398 int *max;
2238}; 2399};
2239 2400
2240static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, 2401static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
2241 int *valp, 2402 int *valp,
2242 int write, void *data) 2403 int write, void *data)
2243{ 2404{
2244 struct do_proc_dointvec_minmax_conv_param *param = data; 2405 struct do_proc_dointvec_minmax_conv_param *param = data;
@@ -2251,10 +2412,10 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2251 } else { 2412 } else {
2252 int val = *valp; 2413 int val = *valp;
2253 if (val < 0) { 2414 if (val < 0) {
2254 *negp = -1; 2415 *negp = true;
2255 *lvalp = (unsigned long)-val; 2416 *lvalp = (unsigned long)-val;
2256 } else { 2417 } else {
2257 *negp = 0; 2418 *negp = false;
2258 *lvalp = (unsigned long)val; 2419 *lvalp = (unsigned long)val;
2259 } 2420 }
2260 } 2421 }
@@ -2294,102 +2455,78 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2294 unsigned long convmul, 2455 unsigned long convmul,
2295 unsigned long convdiv) 2456 unsigned long convdiv)
2296{ 2457{
2297#define TMPBUFLEN 21 2458 unsigned long *i, *min, *max;
2298 unsigned long *i, *min, *max, val; 2459 int vleft, first = 1, err = 0;
2299 int vleft, first=1, neg; 2460 unsigned long page = 0;
2300 size_t len, left; 2461 size_t left;
2301 char buf[TMPBUFLEN], *p; 2462 char *kbuf;
2302 char __user *s = buffer; 2463
2303 2464 if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
2304 if (!data || !table->maxlen || !*lenp ||
2305 (*ppos && !write)) {
2306 *lenp = 0; 2465 *lenp = 0;
2307 return 0; 2466 return 0;
2308 } 2467 }
2309 2468
2310 i = (unsigned long *) data; 2469 i = (unsigned long *) data;
2311 min = (unsigned long *) table->extra1; 2470 min = (unsigned long *) table->extra1;
2312 max = (unsigned long *) table->extra2; 2471 max = (unsigned long *) table->extra2;
2313 vleft = table->maxlen / sizeof(unsigned long); 2472 vleft = table->maxlen / sizeof(unsigned long);
2314 left = *lenp; 2473 left = *lenp;
2315 2474
2475 if (write) {
2476 if (left > PAGE_SIZE - 1)
2477 left = PAGE_SIZE - 1;
2478 page = __get_free_page(GFP_TEMPORARY);
2479 kbuf = (char *) page;
2480 if (!kbuf)
2481 return -ENOMEM;
2482 if (copy_from_user(kbuf, buffer, left)) {
2483 err = -EFAULT;
2484 goto free;
2485 }
2486 kbuf[left] = 0;
2487 }
2488
2316 for (; left && vleft--; i++, min++, max++, first=0) { 2489 for (; left && vleft--; i++, min++, max++, first=0) {
2490 unsigned long val;
2491
2317 if (write) { 2492 if (write) {
2318 while (left) { 2493 bool neg;
2319 char c; 2494
2320 if (get_user(c, s)) 2495 left -= proc_skip_spaces(&kbuf);
2321 return -EFAULT; 2496
2322 if (!isspace(c)) 2497 err = proc_get_long(&kbuf, &left, &val, &neg,
2323 break; 2498 proc_wspace_sep,
2324 left--; 2499 sizeof(proc_wspace_sep), NULL);
2325 s++; 2500 if (err)
2326 }
2327 if (!left)
2328 break;
2329 neg = 0;
2330 len = left;
2331 if (len > TMPBUFLEN-1)
2332 len = TMPBUFLEN-1;
2333 if (copy_from_user(buf, s, len))
2334 return -EFAULT;
2335 buf[len] = 0;
2336 p = buf;
2337 if (*p == '-' && left > 1) {
2338 neg = 1;
2339 p++;
2340 }
2341 if (*p < '0' || *p > '9')
2342 break;
2343 val = simple_strtoul(p, &p, 0) * convmul / convdiv ;
2344 len = p-buf;
2345 if ((len < left) && *p && !isspace(*p))
2346 break; 2501 break;
2347 if (neg) 2502 if (neg)
2348 val = -val;
2349 s += len;
2350 left -= len;
2351
2352 if(neg)
2353 continue; 2503 continue;
2354 if ((min && val < *min) || (max && val > *max)) 2504 if ((min && val < *min) || (max && val > *max))
2355 continue; 2505 continue;
2356 *i = val; 2506 *i = val;
2357 } else { 2507 } else {
2358 p = buf; 2508 val = convdiv * (*i) / convmul;
2359 if (!first) 2509 if (!first)
2360 *p++ = '\t'; 2510 err = proc_put_char(&buffer, &left, '\t');
2361 sprintf(p, "%lu", convdiv * (*i) / convmul); 2511 err = proc_put_long(&buffer, &left, val, false);
2362 len = strlen(buf); 2512 if (err)
2363 if (len > left) 2513 break;
2364 len = left;
2365 if(copy_to_user(s, buf, len))
2366 return -EFAULT;
2367 left -= len;
2368 s += len;
2369 } 2514 }
2370 } 2515 }
2371 2516
2372 if (!write && !first && left) { 2517 if (!write && !first && left && !err)
2373 if(put_user('\n', s)) 2518 err = proc_put_char(&buffer, &left, '\n');
2374 return -EFAULT; 2519 if (write && !err)
2375 left--, s++; 2520 left -= proc_skip_spaces(&kbuf);
2376 } 2521free:
2377 if (write) { 2522 if (write) {
2378 while (left) { 2523 free_page(page);
2379 char c; 2524 if (first)
2380 if (get_user(c, s++)) 2525 return err ? : -EINVAL;
2381 return -EFAULT;
2382 if (!isspace(c))
2383 break;
2384 left--;
2385 }
2386 } 2526 }
2387 if (write && first)
2388 return -EINVAL;
2389 *lenp -= left; 2527 *lenp -= left;
2390 *ppos += *lenp; 2528 *ppos += *lenp;
2391 return 0; 2529 return err;
2392#undef TMPBUFLEN
2393} 2530}
2394 2531
2395static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, 2532static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
@@ -2450,7 +2587,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2450} 2587}
2451 2588
2452 2589
2453static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, 2590static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
2454 int *valp, 2591 int *valp,
2455 int write, void *data) 2592 int write, void *data)
2456{ 2593{
@@ -2462,10 +2599,10 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2462 int val = *valp; 2599 int val = *valp;
2463 unsigned long lval; 2600 unsigned long lval;
2464 if (val < 0) { 2601 if (val < 0) {
2465 *negp = -1; 2602 *negp = true;
2466 lval = (unsigned long)-val; 2603 lval = (unsigned long)-val;
2467 } else { 2604 } else {
2468 *negp = 0; 2605 *negp = false;
2469 lval = (unsigned long)val; 2606 lval = (unsigned long)val;
2470 } 2607 }
2471 *lvalp = lval / HZ; 2608 *lvalp = lval / HZ;
@@ -2473,7 +2610,7 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2473 return 0; 2610 return 0;
2474} 2611}
2475 2612
2476static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, 2613static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
2477 int *valp, 2614 int *valp,
2478 int write, void *data) 2615 int write, void *data)
2479{ 2616{
@@ -2485,10 +2622,10 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2485 int val = *valp; 2622 int val = *valp;
2486 unsigned long lval; 2623 unsigned long lval;
2487 if (val < 0) { 2624 if (val < 0) {
2488 *negp = -1; 2625 *negp = true;
2489 lval = (unsigned long)-val; 2626 lval = (unsigned long)-val;
2490 } else { 2627 } else {
2491 *negp = 0; 2628 *negp = false;
2492 lval = (unsigned long)val; 2629 lval = (unsigned long)val;
2493 } 2630 }
2494 *lvalp = jiffies_to_clock_t(lval); 2631 *lvalp = jiffies_to_clock_t(lval);
@@ -2496,7 +2633,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2496 return 0; 2633 return 0;
2497} 2634}
2498 2635
2499static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, 2636static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
2500 int *valp, 2637 int *valp,
2501 int write, void *data) 2638 int write, void *data)
2502{ 2639{
@@ -2506,10 +2643,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2506 int val = *valp; 2643 int val = *valp;
2507 unsigned long lval; 2644 unsigned long lval;
2508 if (val < 0) { 2645 if (val < 0) {
2509 *negp = -1; 2646 *negp = true;
2510 lval = (unsigned long)-val; 2647 lval = (unsigned long)-val;
2511 } else { 2648 } else {
2512 *negp = 0; 2649 *negp = false;
2513 lval = (unsigned long)val; 2650 lval = (unsigned long)val;
2514 } 2651 }
2515 *lvalp = jiffies_to_msecs(lval); 2652 *lvalp = jiffies_to_msecs(lval);
@@ -2606,6 +2743,157 @@ static int proc_do_cad_pid(struct ctl_table *table, int write,
2606 return 0; 2743 return 0;
2607} 2744}
2608 2745
2746/**
2747 * proc_do_large_bitmap - read/write from/to a large bitmap
2748 * @table: the sysctl table
2749 * @write: %TRUE if this is a write to the sysctl file
2750 * @buffer: the user buffer
2751 * @lenp: the size of the user buffer
2752 * @ppos: file position
2753 *
2754 * The bitmap is stored at table->data and the bitmap length (in bits)
2755 * in table->maxlen.
2756 *
2757 * We use a comma-separated range format (e.g. 1,3-4,10-10) so that
2758 * large bitmaps may be represented in a compact manner. Writing into
2759 * the file will clear the bitmap then update it with the given input.
2760 *
2761 * Returns 0 on success.
2762 */
2763int proc_do_large_bitmap(struct ctl_table *table, int write,
2764 void __user *buffer, size_t *lenp, loff_t *ppos)
2765{
2766 int err = 0;
2767 bool first = 1;
2768 size_t left = *lenp;
2769 unsigned long bitmap_len = table->maxlen;
2770 unsigned long *bitmap = (unsigned long *) table->data;
2771 unsigned long *tmp_bitmap = NULL;
2772 char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
2773
2774 if (!bitmap_len || !left || (*ppos && !write)) {
2775 *lenp = 0;
2776 return 0;
2777 }
2778
2779 if (write) {
2780 unsigned long page = 0;
2781 char *kbuf;
2782
2783 if (left > PAGE_SIZE - 1)
2784 left = PAGE_SIZE - 1;
2785
2786 page = __get_free_page(GFP_TEMPORARY);
2787 kbuf = (char *) page;
2788 if (!kbuf)
2789 return -ENOMEM;
2790 if (copy_from_user(kbuf, buffer, left)) {
2791 free_page(page);
2792 return -EFAULT;
2793 }
2794 kbuf[left] = 0;
2795
2796 tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
2797 GFP_KERNEL);
2798 if (!tmp_bitmap) {
2799 free_page(page);
2800 return -ENOMEM;
2801 }
2802 proc_skip_char(&kbuf, &left, '\n');
2803 while (!err && left) {
2804 unsigned long val_a, val_b;
2805 bool neg;
2806
2807 err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a,
2808 sizeof(tr_a), &c);
2809 if (err)
2810 break;
2811 if (val_a >= bitmap_len || neg) {
2812 err = -EINVAL;
2813 break;
2814 }
2815
2816 val_b = val_a;
2817 if (left) {
2818 kbuf++;
2819 left--;
2820 }
2821
2822 if (c == '-') {
2823 err = proc_get_long(&kbuf, &left, &val_b,
2824 &neg, tr_b, sizeof(tr_b),
2825 &c);
2826 if (err)
2827 break;
2828 if (val_b >= bitmap_len || neg ||
2829 val_a > val_b) {
2830 err = -EINVAL;
2831 break;
2832 }
2833 if (left) {
2834 kbuf++;
2835 left--;
2836 }
2837 }
2838
2839 while (val_a <= val_b)
2840 set_bit(val_a++, tmp_bitmap);
2841
2842 first = 0;
2843 proc_skip_char(&kbuf, &left, '\n');
2844 }
2845 free_page(page);
2846 } else {
2847 unsigned long bit_a, bit_b = 0;
2848
2849 while (left) {
2850 bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
2851 if (bit_a >= bitmap_len)
2852 break;
2853 bit_b = find_next_zero_bit(bitmap, bitmap_len,
2854 bit_a + 1) - 1;
2855
2856 if (!first) {
2857 err = proc_put_char(&buffer, &left, ',');
2858 if (err)
2859 break;
2860 }
2861 err = proc_put_long(&buffer, &left, bit_a, false);
2862 if (err)
2863 break;
2864 if (bit_a != bit_b) {
2865 err = proc_put_char(&buffer, &left, '-');
2866 if (err)
2867 break;
2868 err = proc_put_long(&buffer, &left, bit_b, false);
2869 if (err)
2870 break;
2871 }
2872
2873 first = 0; bit_b++;
2874 }
2875 if (!err)
2876 err = proc_put_char(&buffer, &left, '\n');
2877 }
2878
2879 if (!err) {
2880 if (write) {
2881 if (*ppos)
2882 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
2883 else
2884 memcpy(bitmap, tmp_bitmap,
2885 BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
2886 }
2887 kfree(tmp_bitmap);
2888 *lenp -= left;
2889 *ppos += *lenp;
2890 return 0;
2891 } else {
2892 kfree(tmp_bitmap);
2893 return err;
2894 }
2895}
2896
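A minimal sketch of how a sysctl entry might be wired to proc_do_large_bitmap(), reconstructed from the docstring above; the bitmap, its size and the procname are hypothetical and not taken from this patch:

	/* Sketch only: hypothetical bitmap sysctl using proc_do_large_bitmap().
	 * table->data points at the bitmap, table->maxlen is its length in bits. */
	static DECLARE_BITMAP(example_bitmap, 65536);

	static struct ctl_table example_table[] = {
		{
			.procname	= "example_bitmap",	/* hypothetical name */
			.data		= example_bitmap,
			.maxlen		= 65536,		/* bits, not bytes */
			.mode		= 0644,
			.proc_handler	= proc_do_large_bitmap,
		},
		{ }
	};

	/* Reading such a file prints ranges like "1,3-4,10"; writing "100-200\n"
	 * replaces the bitmap (or ORs into it on a continued write). */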
2609#else /* CONFIG_PROC_FS */ 2897#else /* CONFIG_PROC_FS */
2610 2898
2611int proc_dostring(struct ctl_table *table, int write, 2899int proc_dostring(struct ctl_table *table, int write,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 8f5d16e0707a..1357c5786064 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -13,6 +13,8 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/netdevice.h> 15#include <linux/netdevice.h>
16#include <linux/kernel.h>
17#include <linux/slab.h>
16 18
17#ifdef CONFIG_SYSCTL_SYSCALL 19#ifdef CONFIG_SYSCTL_SYSCALL
18 20
@@ -223,7 +225,6 @@ static const struct bin_table bin_net_ipv4_route_table[] = {
223 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, 225 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
224 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, 226 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
225 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, 227 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
226 { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
227 {} 228 {}
228}; 229};
229 230
@@ -1124,11 +1125,6 @@ out:
1124 return result; 1125 return result;
1125} 1126}
1126 1127
1127static unsigned hex_value(int ch)
1128{
1129 return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10;
1130}
1131
1132static ssize_t bin_uuid(struct file *file, 1128static ssize_t bin_uuid(struct file *file,
1133 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1129 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1134{ 1130{
@@ -1156,7 +1152,8 @@ static ssize_t bin_uuid(struct file *file,
1156 if (!isxdigit(str[0]) || !isxdigit(str[1])) 1152 if (!isxdigit(str[0]) || !isxdigit(str[1]))
1157 goto out; 1153 goto out;
1158 1154
1159 uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]); 1155 uuid[i] = (hex_to_bin(str[0]) << 4) |
1156 hex_to_bin(str[1]);
1160 str += 2; 1157 str += 2;
1161 if (*str == '-') 1158 if (*str == '-')
1162 str++; 1159 str++;
@@ -1331,7 +1328,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1331 ssize_t result; 1328 ssize_t result;
1332 char *pathname; 1329 char *pathname;
1333 int flags; 1330 int flags;
1334 int acc_mode, fmode; 1331 int acc_mode;
1335 1332
1336 pathname = sysctl_getname(name, nlen, &table); 1333 pathname = sysctl_getname(name, nlen, &table);
1337 result = PTR_ERR(pathname); 1334 result = PTR_ERR(pathname);
@@ -1342,15 +1339,12 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1342 if (oldval && oldlen && newval && newlen) { 1339 if (oldval && oldlen && newval && newlen) {
1343 flags = O_RDWR; 1340 flags = O_RDWR;
1344 acc_mode = MAY_READ | MAY_WRITE; 1341 acc_mode = MAY_READ | MAY_WRITE;
1345 fmode = FMODE_READ | FMODE_WRITE;
1346 } else if (newval && newlen) { 1342 } else if (newval && newlen) {
1347 flags = O_WRONLY; 1343 flags = O_WRONLY;
1348 acc_mode = MAY_WRITE; 1344 acc_mode = MAY_WRITE;
1349 fmode = FMODE_WRITE;
1350 } else if (oldval && oldlen) { 1345 } else if (oldval && oldlen) {
1351 flags = O_RDONLY; 1346 flags = O_RDONLY;
1352 acc_mode = MAY_READ; 1347 acc_mode = MAY_READ;
1353 fmode = FMODE_READ;
1354 } else { 1348 } else {
1355 result = 0; 1349 result = 0;
1356 goto out_putname; 1350 goto out_putname;
@@ -1361,7 +1355,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1361 if (result) 1355 if (result)
1362 goto out_putname; 1356 goto out_putname;
1363 1357
1364 result = may_open(&nd.path, acc_mode, fmode); 1358 result = may_open(&nd.path, acc_mode, flags);
1365 if (result) 1359 if (result)
1366 goto out_putpath; 1360 goto out_putpath;
1367 1361
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea8384d3caa7..11281d5792bd 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -22,6 +22,7 @@
22#include <linux/delayacct.h> 22#include <linux/delayacct.h>
23#include <linux/cpumask.h> 23#include <linux/cpumask.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/slab.h>
25#include <linux/cgroupstats.h> 26#include <linux/cgroupstats.h>
26#include <linux/cgroup.h> 27#include <linux/cgroup.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
@@ -46,15 +47,13 @@ static struct genl_family family = {
46 .maxattr = TASKSTATS_CMD_ATTR_MAX, 47 .maxattr = TASKSTATS_CMD_ATTR_MAX,
47}; 48};
48 49
49static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] 50static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
50__read_mostly = {
51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
55 55
56static struct nla_policy 56static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
57cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 57 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
59}; 58};
60 59
diff --git a/kernel/time.c b/kernel/time.c
index 804798005d19..848b1c2ab09a 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -35,7 +35,6 @@
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h>
39#include <linux/math64.h> 38#include <linux/math64.h>
40#include <linux/ptrace.h> 39#include <linux/ptrace.h>
41 40
@@ -133,12 +132,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
133 */ 132 */
134static inline void warp_clock(void) 133static inline void warp_clock(void)
135{ 134{
136 write_seqlock_irq(&xtime_lock); 135 struct timespec adjust;
137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 136
138 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 137 adjust = current_kernel_time();
139 update_xtime_cache(0); 138 adjust.tv_sec += sys_tz.tz_minuteswest * 60;
140 write_sequnlock_irq(&xtime_lock); 139 do_settimeofday(&adjust);
141 clock_was_set();
142} 140}
143 141
144/* 142/*
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e85c23404d34..f08e99c1d561 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void)
343{ 343{
344 unsigned long flags; 344 unsigned long flags;
345 345
346 spin_lock_irqsave(&watchdog_lock, flags); 346 /*
347 * We use trylock here to avoid a potential dead lock when
348 * kgdb calls this code after the kernel has been stopped with
349 * watchdog_lock held. When watchdog_lock is held we just
350 * return and accept, that the watchdog might trigger and mark
351 * the monitored clock source (usually TSC) unstable.
352 *
353 * This does not affect the other caller clocksource_resume()
354 * because at this point the kernel is UP, interrupts are
355 * disabled and nothing can hold watchdog_lock.
356 */
357 if (!spin_trylock_irqsave(&watchdog_lock, flags))
358 return;
347 clocksource_reset_watchdog(); 359 clocksource_reset_watchdog();
348 spin_unlock_irqrestore(&watchdog_lock, flags); 360 spin_unlock_irqrestore(&watchdog_lock, flags);
349} 361}
@@ -441,6 +453,18 @@ static inline int clocksource_watchdog_kthread(void *data) { return 0; }
441#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 453#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
442 454
443/** 455/**
456 * clocksource_suspend - suspend the clocksource(s)
457 */
458void clocksource_suspend(void)
459{
460 struct clocksource *cs;
461
462 list_for_each_entry_reverse(cs, &clocksource_list, list)
463 if (cs->suspend)
464 cs->suspend(cs);
465}
466
467/**
444 * clocksource_resume - resume the clocksource(s) 468 * clocksource_resume - resume the clocksource(s)
445 */ 469 */
446void clocksource_resume(void) 470void clocksource_resume(void)
@@ -449,7 +473,7 @@ void clocksource_resume(void)
449 473
450 list_for_each_entry(cs, &clocksource_list, list) 474 list_for_each_entry(cs, &clocksource_list, list)
451 if (cs->resume) 475 if (cs->resume)
452 cs->resume(); 476 cs->resume(cs);
453 477
454 clocksource_resume_watchdog(); 478 clocksource_resume_watchdog();
455} 479}
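A sketch (not from this patch) of a clocksource that uses the per-device hooks walked above, with the new signatures that take the struct clocksource pointer; the device name and callback bodies are hypothetical:

	static cycle_t example_cs_read(struct clocksource *cs)
	{
		return 0;	/* read the hardware counter here */
	}

	static void example_cs_suspend(struct clocksource *cs)
	{
		/* quiesce the counter before the system sleeps */
	}

	static void example_cs_resume(struct clocksource *cs)
	{
		/* reprogram the counter after wakeup */
	}

	static struct clocksource example_cs = {
		.name		= "example",
		.rating		= 200,
		.read		= example_cs_read,
		.mask		= CLOCKSOURCE_MASK(32),
		.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
		.suspend	= example_cs_suspend,
		.resume		= example_cs_resume,
	};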
@@ -458,8 +482,8 @@ void clocksource_resume(void)
458 * clocksource_touch_watchdog - Update watchdog 482 * clocksource_touch_watchdog - Update watchdog
459 * 483 *
460 * Update the watchdog after exception contexts such as kgdb so as not 484 * Update the watchdog after exception contexts such as kgdb so as not
461 * to incorrectly trip the watchdog. 485 * to incorrectly trip the watchdog. This might fail when the kernel
462 * 486 * was stopped in code which holds watchdog_lock.
463 */ 487 */
464void clocksource_touch_watchdog(void) 488void clocksource_touch_watchdog(void)
465{ 489{
@@ -568,6 +592,10 @@ static inline void clocksource_select(void) { }
568 */ 592 */
569static int __init clocksource_done_booting(void) 593static int __init clocksource_done_booting(void)
570{ 594{
595 mutex_lock(&clocksource_mutex);
596 curr_clocksource = clocksource_default_clock();
597 mutex_unlock(&clocksource_mutex);
598
571 finished_booting = 1; 599 finished_booting = 1;
572 600
573 /* 601 /*
@@ -597,6 +625,54 @@ static void clocksource_enqueue(struct clocksource *cs)
597 list_add(&cs->list, entry); 625 list_add(&cs->list, entry);
598} 626}
599 627
628
629/*
630 * Maximum time we expect to go between ticks. This includes idle
631 * tickless time. It provides the trade off between selecting a
632 * mult/shift pair that is very precise but can only handle a short
633 * period of time, vs. a mult/shift pair that can handle long periods
634 * of time but isn't as precise.
635 *
636 * This is a subsystem constant, and actual hardware limitations
637 * may override it (ie: clocksources that wrap every 3 seconds).
638 */
639#define MAX_UPDATE_LENGTH 5 /* Seconds */
640
641/**
642 * __clocksource_register_scale - Used to install new clocksources
643 * @t: clocksource to be registered
644 * @scale: Scale factor multiplied against freq to get clocksource hz
645 * @freq: clocksource frequency (cycles per second) divided by scale
646 *
647 * Returns -EBUSY if registration fails, zero otherwise.
648 *
649 * This *SHOULD NOT* be called directly! Please use the
650 * clocksource_register_hz() or clocksource_register_khz helper functions.
651 */
652int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
653{
654
655 /*
656 * Ideally we want to use some of the limits used in
657 * clocksource_max_deferment, to provide a more informed
658 * MAX_UPDATE_LENGTH. But for now this just gets the
659 * register interface working properly.
660 */
661 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
662 NSEC_PER_SEC/scale,
663 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs);
665
666 mutex_lock(&clocksource_mutex);
667 clocksource_enqueue(cs);
668 clocksource_select();
669 clocksource_enqueue_watchdog(cs);
670 mutex_unlock(&clocksource_mutex);
671 return 0;
672}
673EXPORT_SYMBOL_GPL(__clocksource_register_scale);
674
675
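The clocksource_register_hz()/clocksource_register_khz() helpers referred to in the comment live in the header and are not shown in this diff; from the docstring they reduce to picking the scale factor, roughly:

	static inline int clocksource_register_hz(struct clocksource *cs, u32 hz)
	{
		return __clocksource_register_scale(cs, 1, hz);
	}

	static inline int clocksource_register_khz(struct clocksource *cs, u32 khz)
	{
		return __clocksource_register_scale(cs, 1000, khz);
	}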
600/** 676/**
601 * clocksource_register - Used to install new clocksources 677 * clocksource_register - Used to install new clocksources
602 * @t: clocksource to be registered 678 * @t: clocksource to be registered
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4800f933910e..c63116863a80 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -58,10 +58,10 @@ static s64 time_offset;
58static long time_constant = 2; 58static long time_constant = 2;
59 59
60/* maximum error (usecs): */ 60/* maximum error (usecs): */
61long time_maxerror = NTP_PHASE_LIMIT; 61static long time_maxerror = NTP_PHASE_LIMIT;
62 62
63/* estimated error (usecs): */ 63/* estimated error (usecs): */
64long time_esterror = NTP_PHASE_LIMIT; 64static long time_esterror = NTP_PHASE_LIMIT;
65 65
66/* frequency offset (scaled nsecs/secs): */ 66/* frequency offset (scaled nsecs/secs): */
67static s64 time_freq; 67static s64 time_freq;
@@ -69,7 +69,7 @@ static s64 time_freq;
69/* time at last adjustment (secs): */ 69/* time at last adjustment (secs): */
70static long time_reftime; 70static long time_reftime;
71 71
72long time_adjust; 72static long time_adjust;
73 73
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ 74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj; 75static s64 ntp_tick_adj;
@@ -142,11 +142,11 @@ static void ntp_update_offset(long offset)
142 * Select how the frequency is to be controlled 142 * Select how the frequency is to be controlled
143 * and in which mode (PLL or FLL). 143 * and in which mode (PLL or FLL).
144 */ 144 */
145 secs = xtime.tv_sec - time_reftime; 145 secs = get_seconds() - time_reftime;
146 if (unlikely(time_status & STA_FREQHOLD)) 146 if (unlikely(time_status & STA_FREQHOLD))
147 secs = 0; 147 secs = 0;
148 148
149 time_reftime = xtime.tv_sec; 149 time_reftime = get_seconds();
150 150
151 offset64 = offset; 151 offset64 = offset;
152 freq_adj = (offset64 * secs) << 152 freq_adj = (offset64 * secs) <<
@@ -368,7 +368,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
368 * reference time to current time. 368 * reference time to current time.
369 */ 369 */
370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) 370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
371 time_reftime = xtime.tv_sec; 371 time_reftime = get_seconds();
372 372
373 /* only set allowed bits */ 373 /* only set allowed bits */
374 time_status &= STA_RONLY; 374 time_status &= STA_RONLY;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 0a8a213016f0..aada0e52680a 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -22,6 +22,29 @@
22 22
23#include "tick-internal.h" 23#include "tick-internal.h"
24 24
25/* Limit min_delta to a jiffie */
26#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
27
28static int tick_increase_min_delta(struct clock_event_device *dev)
29{
30 /* Nothing to do if we already reached the limit */
31 if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
32 return -ETIME;
33
34 if (dev->min_delta_ns < 5000)
35 dev->min_delta_ns = 5000;
36 else
37 dev->min_delta_ns += dev->min_delta_ns >> 1;
38
39 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
40 dev->min_delta_ns = MIN_DELTA_LIMIT;
41
42 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
43 dev->name ? dev->name : "?",
44 (unsigned long long) dev->min_delta_ns);
45 return 0;
46}
47
25/** 48/**
26 * tick_program_event internal worker function 49 * tick_program_event internal worker function
27 */ 50 */
@@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
37 if (!ret || !force) 60 if (!ret || !force)
38 return ret; 61 return ret;
39 62
63 dev->retries++;
40 /* 64 /*
41 * We tried 2 times to program the device with the given 65 * We tried 3 times to program the device with the given
42 * min_delta_ns. If that's not working then we double it 66 * min_delta_ns. If that's not working then we increase it
43 * and emit a warning. 67 * and emit a warning.
44 */ 68 */
45 if (++i > 2) { 69 if (++i > 2) {
46 /* Increase the min. delta and try again */ 70 /* Increase the min. delta and try again */
47 if (!dev->min_delta_ns) 71 if (tick_increase_min_delta(dev)) {
48 dev->min_delta_ns = 5000; 72 /*
49 else 73 * Get out of the loop if min_delta_ns
50 dev->min_delta_ns += dev->min_delta_ns >> 1; 74 * hit the limit already. That's
51 75 * better than staying here forever.
52 printk(KERN_WARNING 76 *
53 "CE: %s increasing min_delta_ns to %llu nsec\n", 77 * We clear next_event so we have a
54 dev->name ? dev->name : "?", 78 * chance that the box survives.
55 (unsigned long long) dev->min_delta_ns << 1); 79 */
56 80 printk(KERN_WARNING
81 "CE: Reprogramming failure. Giving up\n");
82 dev->next_event.tv64 = KTIME_MAX;
83 return -ETIME;
84 }
57 i = 0; 85 i = 0;
58 } 86 }
59 87
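A stand-alone illustration (not kernel code) of how min_delta_ns grows under repeated programming failures: it jumps to 5000 ns, then grows by 50% per failure until clamped at MIN_DELTA_LIMIT, after which tick_increase_min_delta() returns -ETIME and the loop above gives up. HZ=250 (a 4 ms limit) is assumed here:

	#include <stdio.h>

	int main(void)
	{
		/* MIN_DELTA_LIMIT = NSEC_PER_SEC / HZ; assume HZ = 250 */
		unsigned long long min_delta_ns = 0, limit = 1000000000ULL / 250;
		int failures = 0;

		while (min_delta_ns < limit) {
			if (min_delta_ns < 5000)
				min_delta_ns = 5000;
			else
				min_delta_ns += min_delta_ns >> 1;
			if (min_delta_ns > limit)
				min_delta_ns = limit;
			printf("failure %2d: min_delta_ns = %llu\n",
			       ++failures, min_delta_ns);
		}
		return 0;
	}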
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f992762d7f51..813993b5fb61 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,35 +150,65 @@ static void tick_nohz_update_jiffies(ktime_t now)
150 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
151} 151}
152 152
153/*
154 * Updates the per cpu time idle statistics counters
155 */
156static void
157update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
158{
159 ktime_t delta;
160
161 if (ts->idle_active) {
162 delta = ktime_sub(now, ts->idle_entrytime);
163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
164 if (nr_iowait_cpu(cpu) > 0)
165 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
166 ts->idle_entrytime = now;
167 }
168
169 if (last_update_time)
170 *last_update_time = ktime_to_us(now);
171
172}
173
153static void tick_nohz_stop_idle(int cpu, ktime_t now) 174static void tick_nohz_stop_idle(int cpu, ktime_t now)
154{ 175{
155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 176 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
157 177
158 delta = ktime_sub(now, ts->idle_entrytime); 178 update_ts_time_stats(cpu, ts, now, NULL);
159 ts->idle_lastupdate = now;
160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
161 ts->idle_active = 0; 179 ts->idle_active = 0;
162 180
163 sched_clock_idle_wakeup_event(0); 181 sched_clock_idle_wakeup_event(0);
164} 182}
165 183
166static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 184static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
167{ 185{
168 ktime_t now, delta; 186 ktime_t now;
169 187
170 now = ktime_get(); 188 now = ktime_get();
171 if (ts->idle_active) { 189
172 delta = ktime_sub(now, ts->idle_entrytime); 190 update_ts_time_stats(cpu, ts, now, NULL);
173 ts->idle_lastupdate = now; 191
174 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
175 }
176 ts->idle_entrytime = now; 192 ts->idle_entrytime = now;
177 ts->idle_active = 1; 193 ts->idle_active = 1;
178 sched_clock_idle_sleep_event(); 194 sched_clock_idle_sleep_event();
179 return now; 195 return now;
180} 196}
181 197
198/**
199 * get_cpu_idle_time_us - get the total idle time of a cpu
200 * @cpu: CPU number to query
201 * @last_update_time: variable to store update time in
202 *
203 * Return the cumulative idle time (since boot) for a given
204 * CPU, in microseconds. The idle time returned includes
205 * the iowait time (unlike what "top" and co report).
206 *
207 * This time is measured via accounting rather than sampling,
208 * and is as accurate as ktime_get() is.
209 *
210 * This function returns -1 if NOHZ is not enabled.
211 */
182u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 212u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
183{ 213{
184 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 214 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -186,15 +216,38 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
186 if (!tick_nohz_enabled) 216 if (!tick_nohz_enabled)
187 return -1; 217 return -1;
188 218
189 if (ts->idle_active) 219 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
190 *last_update_time = ktime_to_us(ts->idle_lastupdate);
191 else
192 *last_update_time = ktime_to_us(ktime_get());
193 220
194 return ktime_to_us(ts->idle_sleeptime); 221 return ktime_to_us(ts->idle_sleeptime);
195} 222}
196EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 223EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
197 224
225/*
226 * get_cpu_iowait_time_us - get the total iowait time of a cpu
227 * @cpu: CPU number to query
228 * @last_update_time: variable to store update time in
229 *
230 * Return the cumulative iowait time (since boot) for a given
231 * CPU, in microseconds.
232 *
233 * This time is measured via accounting rather than sampling,
234 * and is as accurate as ktime_get() is.
235 *
236 * This function returns -1 if NOHZ is not enabled.
237 */
238u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
239{
240 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
241
242 if (!tick_nohz_enabled)
243 return -1;
244
245 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
246
247 return ktime_to_us(ts->iowait_sleeptime);
248}
249EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
250
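Sketch of a typical consumer of the two accessors above (for example a cpufreq governor computing CPU load over an interval); the function and its bookkeeping arguments are hypothetical:

	/* Returns busy time as a percentage of the elapsed wall time. */
	static unsigned int example_cpu_load(int cpu, u64 *prev_wall, u64 *prev_idle)
	{
		u64 wall, idle, wall_delta, idle_delta;

		idle = get_cpu_idle_time_us(cpu, &wall);
		if (idle == (u64)-1)
			return 0;	/* NOHZ disabled, sample some other way */

		wall_delta = wall - *prev_wall;
		idle_delta = idle - *prev_idle;
		*prev_wall = wall;
		*prev_idle = idle;

		if (!wall_delta || wall_delta < idle_delta)
			return 0;

		return div64_u64(100 * (wall_delta - idle_delta), wall_delta);
	}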
198/** 251/**
199 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 252 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
200 * 253 *
@@ -231,7 +284,7 @@ void tick_nohz_stop_sched_tick(int inidle)
231 */ 284 */
232 ts->inidle = 1; 285 ts->inidle = 1;
233 286
234 now = tick_nohz_start_idle(ts); 287 now = tick_nohz_start_idle(cpu, ts);
235 288
236 /* 289 /*
237 * If this cpu is offline and it is the one which updates 290 * If this cpu is offline and it is the one which updates
@@ -272,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
272 } while (read_seqretry(&xtime_lock, seq)); 325 } while (read_seqretry(&xtime_lock, seq));
273 326
274 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
275 arch_needs_cpu(cpu)) { 328 arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) {
276 next_jiffies = last_jiffies + 1; 329 next_jiffies = last_jiffies + 1;
277 delta_jiffies = 1; 330 delta_jiffies = 1;
278 } else { 331 } else {
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 12f5c55090be..ac38fbb176cc 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/timecompare.h> 20#include <linux/timecompare.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/slab.h>
22#include <linux/math64.h> 23#include <linux/math64.h>
23 24
24/* 25/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7faaa32fbf4f..caf8d4d4f5c8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -165,13 +165,6 @@ struct timespec raw_time;
165/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
167 167
168static struct timespec xtime_cache __attribute__ ((aligned (16)));
169void update_xtime_cache(u64 nsec)
170{
171 xtime_cache = xtime;
172 timespec_add_ns(&xtime_cache, nsec);
173}
174
175/* must hold xtime_lock */ 168/* must hold xtime_lock */
176void timekeeping_leap_insert(int leapsecond) 169void timekeeping_leap_insert(int leapsecond)
177{ 170{
@@ -332,8 +325,6 @@ int do_settimeofday(struct timespec *tv)
332 325
333 xtime = *tv; 326 xtime = *tv;
334 327
335 update_xtime_cache(0);
336
337 timekeeper.ntp_error = 0; 328 timekeeper.ntp_error = 0;
338 ntp_clear(); 329 ntp_clear();
339 330
@@ -559,7 +550,6 @@ void __init timekeeping_init(void)
559 } 550 }
560 set_normalized_timespec(&wall_to_monotonic, 551 set_normalized_timespec(&wall_to_monotonic,
561 -boot.tv_sec, -boot.tv_nsec); 552 -boot.tv_sec, -boot.tv_nsec);
562 update_xtime_cache(0);
563 total_sleep_time.tv_sec = 0; 553 total_sleep_time.tv_sec = 0;
564 total_sleep_time.tv_nsec = 0; 554 total_sleep_time.tv_nsec = 0;
565 write_sequnlock_irqrestore(&xtime_lock, flags); 555 write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -593,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev)
593 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
594 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 584 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
595 } 585 }
596 update_xtime_cache(0);
597 /* re-base the last cycle value */ 586 /* re-base the last cycle value */
598 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
599 timekeeper.ntp_error = 0; 588 timekeeper.ntp_error = 0;
@@ -622,6 +611,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
622 write_sequnlock_irqrestore(&xtime_lock, flags); 611 write_sequnlock_irqrestore(&xtime_lock, flags);
623 612
624 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 613 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
614 clocksource_suspend();
625 615
626 return 0; 616 return 0;
627} 617}
@@ -787,7 +777,6 @@ void update_wall_time(void)
787{ 777{
788 struct clocksource *clock; 778 struct clocksource *clock;
789 cycle_t offset; 779 cycle_t offset;
790 u64 nsecs;
791 int shift = 0, maxshift; 780 int shift = 0, maxshift;
792 781
793 /* Make sure we're fully resumed: */ 782 /* Make sure we're fully resumed: */
@@ -817,7 +806,8 @@ void update_wall_time(void)
817 shift = min(shift, maxshift); 806 shift = min(shift, maxshift);
818 while (offset >= timekeeper.cycle_interval) { 807 while (offset >= timekeeper.cycle_interval) {
819 offset = logarithmic_accumulation(offset, shift); 808 offset = logarithmic_accumulation(offset, shift);
820 shift--; 809 if(offset < timekeeper.cycle_interval<<shift)
810 shift--;
821 } 811 }
822 812
823 /* correct the clock when NTP error is too big */ 813 /* correct the clock when NTP error is too big */
@@ -845,7 +835,9 @@ void update_wall_time(void)
845 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; 835 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
846 } 836 }
847 837
848 /* store full nanoseconds into xtime after rounding it up and 838
839 /*
840 * Store full nanoseconds into xtime after rounding it up and
849 * add the remainder to the error difference. 841 * add the remainder to the error difference.
850 */ 842 */
851 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; 843 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
@@ -853,8 +845,15 @@ void update_wall_time(void)
853 timekeeper.ntp_error += timekeeper.xtime_nsec << 845 timekeeper.ntp_error += timekeeper.xtime_nsec <<
854 timekeeper.ntp_error_shift; 846 timekeeper.ntp_error_shift;
855 847
856 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); 848 /*
857 update_xtime_cache(nsecs); 849 * Finally, make sure that after the rounding
850 * xtime.tv_nsec isn't larger than NSEC_PER_SEC
851 */
852 if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
853 xtime.tv_nsec -= NSEC_PER_SEC;
854 xtime.tv_sec++;
855 second_overflow();
856 }
858 857
859 /* check to see if there is a new clocksource to use */ 858 /* check to see if there is a new clocksource to use */
860 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 859 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
@@ -880,6 +879,7 @@ void getboottime(struct timespec *ts)
880 879
881 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 880 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
882} 881}
882EXPORT_SYMBOL_GPL(getboottime);
883 883
884/** 884/**
885 * monotonic_to_bootbased - Convert the monotonic time to boot based. 885 * monotonic_to_bootbased - Convert the monotonic time to boot based.
@@ -889,16 +889,17 @@ void monotonic_to_bootbased(struct timespec *ts)
889{ 889{
890 *ts = timespec_add_safe(*ts, total_sleep_time); 890 *ts = timespec_add_safe(*ts, total_sleep_time);
891} 891}
892EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
892 893
893unsigned long get_seconds(void) 894unsigned long get_seconds(void)
894{ 895{
895 return xtime_cache.tv_sec; 896 return xtime.tv_sec;
896} 897}
897EXPORT_SYMBOL(get_seconds); 898EXPORT_SYMBOL(get_seconds);
898 899
899struct timespec __current_kernel_time(void) 900struct timespec __current_kernel_time(void)
900{ 901{
901 return xtime_cache; 902 return xtime;
902} 903}
903 904
904struct timespec current_kernel_time(void) 905struct timespec current_kernel_time(void)
@@ -909,7 +910,7 @@ struct timespec current_kernel_time(void)
909 do { 910 do {
910 seq = read_seqbegin(&xtime_lock); 911 seq = read_seqbegin(&xtime_lock);
911 912
912 now = xtime_cache; 913 now = xtime;
913 } while (read_seqretry(&xtime_lock, seq)); 914 } while (read_seqretry(&xtime_lock, seq));
914 915
915 return now; 916 return now;
@@ -924,7 +925,7 @@ struct timespec get_monotonic_coarse(void)
924 do { 925 do {
925 seq = read_seqbegin(&xtime_lock); 926 seq = read_seqbegin(&xtime_lock);
926 927
927 now = xtime_cache; 928 now = xtime;
928 mono = wall_to_monotonic; 929 mono = wall_to_monotonic;
929 } while (read_seqretry(&xtime_lock, seq)); 930 } while (read_seqretry(&xtime_lock, seq));
930 931
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index bdfb8dd1050c..ab8f5e33fa92 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
176 P_ns(idle_waketime); 176 P_ns(idle_waketime);
177 P_ns(idle_exittime); 177 P_ns(idle_exittime);
178 P_ns(idle_sleeptime); 178 P_ns(idle_sleeptime);
179 P_ns(iowait_sleeptime);
179 P(last_jiffies); 180 P(last_jiffies);
180 P(next_jiffies); 181 P(next_jiffies);
181 P_ns(idle_expires); 182 P_ns(idle_expires);
@@ -228,6 +229,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
228 SEQ_printf(m, " event_handler: "); 229 SEQ_printf(m, " event_handler: ");
229 print_name_offset(m, dev->event_handler); 230 print_name_offset(m, dev->event_handler);
230 SEQ_printf(m, "\n"); 231 SEQ_printf(m, "\n");
232 SEQ_printf(m, " retries: %lu\n", dev->retries);
231} 233}
232 234
233static void timer_list_show_tickdevices(struct seq_file *m) 235static void timer_list_show_tickdevices(struct seq_file *m)
@@ -257,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v)
257 u64 now = ktime_to_ns(ktime_get()); 259 u64 now = ktime_to_ns(ktime_get());
258 int cpu; 260 int cpu;
259 261
260 SEQ_printf(m, "Timer List Version: v0.5\n"); 262 SEQ_printf(m, "Timer List Version: v0.6\n");
261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 263 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 264 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
263 265
diff --git a/kernel/timer.c b/kernel/timer.c
index c61a7949387f..ee305c8d4e18 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_event.h> 40#include <linux/perf_event.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/slab.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44#include <asm/unistd.h> 45#include <asm/unistd.h>
@@ -318,6 +319,24 @@ unsigned long round_jiffies_up_relative(unsigned long j)
318} 319}
319EXPORT_SYMBOL_GPL(round_jiffies_up_relative); 320EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
320 321
322/**
323 * set_timer_slack - set the allowed slack for a timer
324 * @slack_hz: the amount of time (in jiffies) allowed for rounding
325 *
326 * Set the amount of time, in jiffies, that a certain timer has
327 * in terms of slack. By setting this value, the timer subsystem
328 * will schedule the actual timer somewhere between
329 * the time mod_timer() asks for, and that time plus the slack.
330 *
331 * By setting the slack to -1, a percentage of the delay is used
332 * instead.
333 */
334void set_timer_slack(struct timer_list *timer, int slack_hz)
335{
336 timer->slack = slack_hz;
337}
338EXPORT_SYMBOL_GPL(set_timer_slack);
339
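A sketch of how a driver might use the new hook (all names below are hypothetical): a low-priority polling timer grants a full second of slack so its wakeups can be coalesced with others:

	static struct timer_list example_poll_timer;

	static void example_poll(unsigned long data)
	{
		/* ... periodic housekeeping ... */
		mod_timer(&example_poll_timer, jiffies + 10 * HZ);
	}

	static void example_start_polling(void)
	{
		setup_timer(&example_poll_timer, example_poll, 0);
		/* firing up to one second late is acceptable here */
		set_timer_slack(&example_poll_timer, HZ);
		mod_timer(&example_poll_timer, jiffies + 10 * HZ);
	}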
321 340
322static inline void set_running_timer(struct tvec_base *base, 341static inline void set_running_timer(struct tvec_base *base,
323 struct timer_list *timer) 342 struct timer_list *timer)
@@ -549,6 +568,7 @@ static void __init_timer(struct timer_list *timer,
549{ 568{
550 timer->entry.next = NULL; 569 timer->entry.next = NULL;
551 timer->base = __raw_get_cpu_var(tvec_bases); 570 timer->base = __raw_get_cpu_var(tvec_bases);
571 timer->slack = -1;
552#ifdef CONFIG_TIMER_STATS 572#ifdef CONFIG_TIMER_STATS
553 timer->start_site = NULL; 573 timer->start_site = NULL;
554 timer->start_pid = -1; 574 timer->start_pid = -1;
@@ -714,6 +734,46 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires)
714} 734}
715EXPORT_SYMBOL(mod_timer_pending); 735EXPORT_SYMBOL(mod_timer_pending);
716 736
737/*
738 * Decide where to put the timer while taking the slack into account
739 *
740 * Algorithm:
741 * 1) calculate the maximum (absolute) time
742 * 2) calculate the highest bit where the expires and new max are different
743 * 3) use this bit to make a mask
744 * 4) use the bitmask to round down the maximum time, so that all last
745 * bits are zeros
746 */
747static inline
748unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
749{
750 unsigned long expires_limit, mask;
751 int bit;
752
753 expires_limit = expires;
754
755 if (timer->slack >= 0) {
756 expires_limit = expires + timer->slack;
757 } else {
758 unsigned long now = jiffies;
759
760 /* No slack, if already expired else auto slack 0.4% */
761 if (time_after(expires, now))
762 expires_limit = expires + (expires - now)/256;
763 }
764 mask = expires ^ expires_limit;
765 if (mask == 0)
766 return expires;
767
768 bit = find_last_bit(&mask, BITS_PER_LONG);
769
770 mask = (1 << bit) - 1;
771
772 expires_limit = expires_limit & ~(mask);
773
774 return expires_limit;
775}
776
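Worked example (stand-alone, not kernel code) of the rounding above with the default slack of -1: a timer 10000 ticks out gets roughly 0.4% (39 ticks) of slack, and the bits below the highest bit where expires and the limit differ are cleared:

	#include <stdio.h>

	int main(void)
	{
		unsigned long now = 1000000, expires = 1010000;
		unsigned long limit = expires + (expires - now) / 256;	/* 1010039 */
		unsigned long mask = expires ^ limit;
		int bit = 0;

		while (mask >> (bit + 1))	/* index of the highest differing bit */
			bit++;
		mask = (1UL << bit) - 1;

		/* prints: expires=1010000 limit=1010039 rounded=1010016 */
		printf("expires=%lu limit=%lu rounded=%lu\n",
		       expires, limit, limit & ~mask);
		return 0;
	}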
717/** 777/**
718 * mod_timer - modify a timer's timeout 778 * mod_timer - modify a timer's timeout
719 * @timer: the timer to be modified 779 * @timer: the timer to be modified
@@ -744,6 +804,8 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
744 if (timer_pending(timer) && timer->expires == expires) 804 if (timer_pending(timer) && timer->expires == expires)
745 return 1; 805 return 1;
746 806
807 expires = apply_slack(timer, expires);
808
747 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 809 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
748} 810}
749EXPORT_SYMBOL(mod_timer); 811EXPORT_SYMBOL(mod_timer);
@@ -880,6 +942,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
880 if (base->running_timer == timer) 942 if (base->running_timer == timer)
881 goto out; 943 goto out;
882 944
945 timer_stats_timer_clear_start_info(timer);
883 ret = 0; 946 ret = 0;
884 if (timer_pending(timer)) { 947 if (timer_pending(timer)) {
885 detach_timer(timer, 1); 948 detach_timer(timer, 1);
@@ -953,6 +1016,47 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
953 return index; 1016 return index;
954} 1017}
955 1018
1019static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1020 unsigned long data)
1021{
1022 int preempt_count = preempt_count();
1023
1024#ifdef CONFIG_LOCKDEP
1025 /*
1026 * It is permissible to free the timer from inside the
1027 * function that is called from it, this we need to take into
1028 * account for lockdep too. To avoid bogus "held lock freed"
1029 * warnings as well as problems when looking into
1030 * timer->lockdep_map, make a copy and use that here.
1031 */
1032 struct lockdep_map lockdep_map = timer->lockdep_map;
1033#endif
1034 /*
1035 * Couple the lock chain with the lock chain at
1036 * del_timer_sync() by acquiring the lock_map around the fn()
1037 * call here and in del_timer_sync().
1038 */
1039 lock_map_acquire(&lockdep_map);
1040
1041 trace_timer_expire_entry(timer);
1042 fn(data);
1043 trace_timer_expire_exit(timer);
1044
1045 lock_map_release(&lockdep_map);
1046
1047 if (preempt_count != preempt_count()) {
1048 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1049 fn, preempt_count, preempt_count());
1050 /*
1051 * Restore the preempt count. That gives us a decent
1052 * chance to survive and extract information. If the
1053 * callback kept a lock held, bad luck, but not worse
1054 * than the BUG() we had.
1055 */
1056 preempt_count() = preempt_count;
1057 }
1058}
1059
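The preempt_count check above catches callbacks that return with a different preemption depth than they entered with; a hypothetical offender would look like this:

	/* Sketch of the bug class call_timer_fn() now survives and reports:
	 * the callback disables preemption and forgets to re-enable it,
	 * so preempt_count() differs across fn(data) and WARN_ONCE fires. */
	static void buggy_timer_fn(unsigned long data)
	{
		preempt_disable();
		/* ... work ... */
		/* missing preempt_enable() */
	}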
956#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) 1060#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
957 1061
958/** 1062/**
@@ -996,45 +1100,7 @@ static inline void __run_timers(struct tvec_base *base)
996 detach_timer(timer, 1); 1100 detach_timer(timer, 1);
997 1101
998 spin_unlock_irq(&base->lock); 1102 spin_unlock_irq(&base->lock);
999 { 1103 call_timer_fn(timer, fn, data);
1000 int preempt_count = preempt_count();
1001
1002#ifdef CONFIG_LOCKDEP
1003 /*
1004 * It is permissible to free the timer from
1005 * inside the function that is called from
1006 * it, this we need to take into account for
1007 * lockdep too. To avoid bogus "held lock
1008 * freed" warnings as well as problems when
1009 * looking into timer->lockdep_map, make a
1010 * copy and use that here.
1011 */
1012 struct lockdep_map lockdep_map =
1013 timer->lockdep_map;
1014#endif
1015 /*
1016 * Couple the lock chain with the lock chain at
1017 * del_timer_sync() by acquiring the lock_map
1018 * around the fn() call here and in
1019 * del_timer_sync().
1020 */
1021 lock_map_acquire(&lockdep_map);
1022
1023 trace_timer_expire_entry(timer);
1024 fn(data);
1025 trace_timer_expire_exit(timer);
1026
1027 lock_map_release(&lockdep_map);
1028
1029 if (preempt_count != preempt_count()) {
1030 printk(KERN_ERR "huh, entered %p "
1031 "with preempt_count %08x, exited"
1032 " with %08x?\n",
1033 fn, preempt_count,
1034 preempt_count());
1035 BUG();
1036 }
1037 }
1038 spin_lock_irq(&base->lock); 1104 spin_lock_irq(&base->lock);
1039 } 1105 }
1040 } 1106 }
@@ -1618,11 +1684,14 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1618 unsigned long action, void *hcpu) 1684 unsigned long action, void *hcpu)
1619{ 1685{
1620 long cpu = (long)hcpu; 1686 long cpu = (long)hcpu;
1687 int err;
1688
1621 switch(action) { 1689 switch(action) {
1622 case CPU_UP_PREPARE: 1690 case CPU_UP_PREPARE:
1623 case CPU_UP_PREPARE_FROZEN: 1691 case CPU_UP_PREPARE_FROZEN:
1624 if (init_timers_cpu(cpu) < 0) 1692 err = init_timers_cpu(cpu);
1625 return NOTIFY_BAD; 1693 if (err < 0)
1694 return notifier_from_errno(err);
1626 break; 1695 break;
1627#ifdef CONFIG_HOTPLUG_CPU 1696#ifdef CONFIG_HOTPLUG_CPU
1628 case CPU_DEAD: 1697 case CPU_DEAD:
@@ -1648,7 +1717,7 @@ void __init init_timers(void)
1648 1717
1649 init_timer_stats(); 1718 init_timer_stats();
1650 1719
1651 BUG_ON(err == NOTIFY_BAD); 1720 BUG_ON(err != NOTIFY_OK);
1652 register_cpu_notifier(&timers_nb); 1721 register_cpu_notifier(&timers_nb);
1653 open_softirq(TIMER_SOFTIRQ, run_timer_softirq); 1722 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1654} 1723}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 6c22d8a2f289..8b1797c4545b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -27,9 +27,7 @@ config HAVE_FUNCTION_GRAPH_TRACER
27config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
28 bool 28 bool
29 help 29 help
30 An arch may pass in a unique value (frame pointer) to both the 30 See Documentation/trace/ftrace-design.txt
31 entering and exiting of a function. On exit, the value is compared
32 and if it does not match, then it will panic the kernel.
33 31
34config HAVE_FUNCTION_TRACE_MCOUNT_TEST 32config HAVE_FUNCTION_TRACE_MCOUNT_TEST
35 bool 33 bool
@@ -46,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD
46 help 44 help
47 See Documentation/trace/ftrace-design.txt 45 See Documentation/trace/ftrace-design.txt
48 46
49config HAVE_HW_BRANCH_TRACER
50 bool
51
52config HAVE_SYSCALL_TRACEPOINTS 47config HAVE_SYSCALL_TRACEPOINTS
53 bool 48 bool
54 help 49 help
@@ -330,15 +325,6 @@ config BRANCH_TRACER
330 325
331 Say N if unsure. 326 Say N if unsure.
332 327
333config POWER_TRACER
334 bool "Trace power consumption behavior"
335 depends on X86
336 select GENERIC_TRACER
337 help
338 This tracer helps developers to analyze and optimize the kernel's
339 power management decisions, specifically the C-state and P-state
340 behavior.
341
342config KSYM_TRACER 328config KSYM_TRACER
343 bool "Trace read and write access on kernel memory locations" 329 bool "Trace read and write access on kernel memory locations"
344 depends on HAVE_HW_BREAKPOINT 330 depends on HAVE_HW_BREAKPOINT
@@ -385,14 +371,6 @@ config STACK_TRACER
385 371
386 Say N if unsure. 372 Say N if unsure.
387 373
388config HW_BRANCH_TRACER
389 depends on HAVE_HW_BRANCH_TRACER
390 bool "Trace hw branches"
391 select GENERIC_TRACER
392 help
393 This tracer records all branches on the system in a circular
394 buffer, giving access to the last N branches for each cpu.
395
396config KMEMTRACE 374config KMEMTRACE
397 bool "Trace SLAB allocations" 375 bool "Trace SLAB allocations"
398 select GENERIC_TRACER 376 select GENERIC_TRACER
@@ -451,7 +429,7 @@ config BLK_DEV_IO_TRACE
451 429
452config KPROBE_EVENT 430config KPROBE_EVENT
453 depends on KPROBES 431 depends on KPROBES
454 depends on X86 432 depends on HAVE_REGS_AND_STACK_ACCESS_API
455 bool "Enable kprobes-based dynamic events" 433 bool "Enable kprobes-based dynamic events"
456 select TRACING 434 select TRACING
457 default y 435 default y
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cd9ecd89ec77..ffb1a5b0550e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o 41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
45obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 44obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
46obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 45obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
47obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 46obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
@@ -51,7 +50,9 @@ endif
51obj-$(CONFIG_EVENT_TRACING) += trace_events.o 50obj-$(CONFIG_EVENT_TRACING) += trace_events.o
52obj-$(CONFIG_EVENT_TRACING) += trace_export.o 51obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 52obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 53ifeq ($(CONFIG_PERF_EVENTS),y)
54obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
55endif
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 57obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o 58obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d9d6206e0b14..638711c17504 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -21,6 +21,7 @@
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h>
24#include <linux/debugfs.h> 25#include <linux/debugfs.h>
25#include <linux/smp_lock.h> 26#include <linux/smp_lock.h>
26#include <linux/time.h> 27#include <linux/time.h>
@@ -540,9 +541,10 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
540 if (ret) 541 if (ret)
541 return ret; 542 return ret;
542 543
543 if (copy_to_user(arg, &buts, sizeof(buts))) 544 if (copy_to_user(arg, &buts, sizeof(buts))) {
545 blk_trace_remove(q);
544 return -EFAULT; 546 return -EFAULT;
545 547 }
546 return 0; 548 return 0;
547} 549}
548EXPORT_SYMBOL_GPL(blk_trace_setup); 550EXPORT_SYMBOL_GPL(blk_trace_setup);
@@ -673,28 +675,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
673 } 675 }
674} 676}
675 677
676static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) 678static void blk_add_trace_rq_abort(void *ignore,
679 struct request_queue *q, struct request *rq)
677{ 680{
678 blk_add_trace_rq(q, rq, BLK_TA_ABORT); 681 blk_add_trace_rq(q, rq, BLK_TA_ABORT);
679} 682}
680 683
681static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) 684static void blk_add_trace_rq_insert(void *ignore,
685 struct request_queue *q, struct request *rq)
682{ 686{
683 blk_add_trace_rq(q, rq, BLK_TA_INSERT); 687 blk_add_trace_rq(q, rq, BLK_TA_INSERT);
684} 688}
685 689
686static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) 690static void blk_add_trace_rq_issue(void *ignore,
691 struct request_queue *q, struct request *rq)
687{ 692{
688 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 693 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
689} 694}
690 695
691static void blk_add_trace_rq_requeue(struct request_queue *q, 696static void blk_add_trace_rq_requeue(void *ignore,
697 struct request_queue *q,
692 struct request *rq) 698 struct request *rq)
693{ 699{
694 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 700 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
695} 701}
696 702
697static void blk_add_trace_rq_complete(struct request_queue *q, 703static void blk_add_trace_rq_complete(void *ignore,
704 struct request_queue *q,
698 struct request *rq) 705 struct request *rq)
699{ 706{
700 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 707 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
@@ -722,34 +729,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
722 !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 729 !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
723} 730}
724 731
725static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) 732static void blk_add_trace_bio_bounce(void *ignore,
733 struct request_queue *q, struct bio *bio)
726{ 734{
727 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); 735 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
728} 736}
729 737
730static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) 738static void blk_add_trace_bio_complete(void *ignore,
739 struct request_queue *q, struct bio *bio)
731{ 740{
732 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 741 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
733} 742}
734 743
735static void blk_add_trace_bio_backmerge(struct request_queue *q, 744static void blk_add_trace_bio_backmerge(void *ignore,
745 struct request_queue *q,
736 struct bio *bio) 746 struct bio *bio)
737{ 747{
738 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 748 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
739} 749}
740 750
741static void blk_add_trace_bio_frontmerge(struct request_queue *q, 751static void blk_add_trace_bio_frontmerge(void *ignore,
752 struct request_queue *q,
742 struct bio *bio) 753 struct bio *bio)
743{ 754{
744 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 755 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
745} 756}
746 757
747static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) 758static void blk_add_trace_bio_queue(void *ignore,
759 struct request_queue *q, struct bio *bio)
748{ 760{
749 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 761 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
750} 762}
751 763
752static void blk_add_trace_getrq(struct request_queue *q, 764static void blk_add_trace_getrq(void *ignore,
765 struct request_queue *q,
753 struct bio *bio, int rw) 766 struct bio *bio, int rw)
754{ 767{
755 if (bio) 768 if (bio)
@@ -763,7 +776,8 @@ static void blk_add_trace_getrq(struct request_queue *q,
763} 776}
764 777
765 778
766static void blk_add_trace_sleeprq(struct request_queue *q, 779static void blk_add_trace_sleeprq(void *ignore,
780 struct request_queue *q,
767 struct bio *bio, int rw) 781 struct bio *bio, int rw)
768{ 782{
769 if (bio) 783 if (bio)
@@ -777,7 +791,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q,
777 } 791 }
778} 792}
779 793
780static void blk_add_trace_plug(struct request_queue *q) 794static void blk_add_trace_plug(void *ignore, struct request_queue *q)
781{ 795{
782 struct blk_trace *bt = q->blk_trace; 796 struct blk_trace *bt = q->blk_trace;
783 797
@@ -785,7 +799,7 @@ static void blk_add_trace_plug(struct request_queue *q)
785 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 799 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
786} 800}
787 801
788static void blk_add_trace_unplug_io(struct request_queue *q) 802static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
789{ 803{
790 struct blk_trace *bt = q->blk_trace; 804 struct blk_trace *bt = q->blk_trace;
791 805
@@ -798,7 +812,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q)
798 } 812 }
799} 813}
800 814
801static void blk_add_trace_unplug_timer(struct request_queue *q) 815static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
802{ 816{
803 struct blk_trace *bt = q->blk_trace; 817 struct blk_trace *bt = q->blk_trace;
804 818
@@ -811,7 +825,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q)
811 } 825 }
812} 826}
813 827
814static void blk_add_trace_split(struct request_queue *q, struct bio *bio, 828static void blk_add_trace_split(void *ignore,
829 struct request_queue *q, struct bio *bio,
815 unsigned int pdu) 830 unsigned int pdu)
816{ 831{
817 struct blk_trace *bt = q->blk_trace; 832 struct blk_trace *bt = q->blk_trace;
@@ -827,6 +842,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
827 842
828/** 843/**
829 * blk_add_trace_remap - Add a trace for a remap operation 844 * blk_add_trace_remap - Add a trace for a remap operation
845 * @ignore: trace callback data parameter (not used)
830 * @q: queue the io is for 846 * @q: queue the io is for
831 * @bio: the source bio 847 * @bio: the source bio
832 * @dev: target device 848 * @dev: target device
@@ -837,8 +853,9 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
837 * it spans a stripe (or similar). Add a trace for that action. 853 * it spans a stripe (or similar). Add a trace for that action.
838 * 854 *
839 **/ 855 **/
840static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, 856static void blk_add_trace_remap(void *ignore,
841 dev_t dev, sector_t from) 857 struct request_queue *q, struct bio *bio,
858 dev_t dev, sector_t from)
842{ 859{
843 struct blk_trace *bt = q->blk_trace; 860 struct blk_trace *bt = q->blk_trace;
844 struct blk_io_trace_remap r; 861 struct blk_io_trace_remap r;
@@ -857,6 +874,7 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
857 874
858/** 875/**
859 * blk_add_trace_rq_remap - Add a trace for a request-remap operation 876 * blk_add_trace_rq_remap - Add a trace for a request-remap operation
877 * @ignore: trace callback data parameter (not used)
860 * @q: queue the io is for 878 * @q: queue the io is for
861 * @rq: the source request 879 * @rq: the source request
862 * @dev: target device 880 * @dev: target device
@@ -867,7 +885,8 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
867 * Add a trace for that action. 885 * Add a trace for that action.
868 * 886 *
869 **/ 887 **/
870static void blk_add_trace_rq_remap(struct request_queue *q, 888static void blk_add_trace_rq_remap(void *ignore,
889 struct request_queue *q,
871 struct request *rq, dev_t dev, 890 struct request *rq, dev_t dev,
872 sector_t from) 891 sector_t from)
873{ 892{
@@ -919,64 +938,64 @@ static void blk_register_tracepoints(void)
919{ 938{
920 int ret; 939 int ret;
921 940
922 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); 941 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
923 WARN_ON(ret); 942 WARN_ON(ret);
924 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); 943 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
925 WARN_ON(ret); 944 WARN_ON(ret);
926 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); 945 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
927 WARN_ON(ret); 946 WARN_ON(ret);
928 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); 947 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
929 WARN_ON(ret); 948 WARN_ON(ret);
930 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); 949 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
931 WARN_ON(ret); 950 WARN_ON(ret);
932 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); 951 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
933 WARN_ON(ret); 952 WARN_ON(ret);
934 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); 953 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
935 WARN_ON(ret); 954 WARN_ON(ret);
936 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); 955 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
937 WARN_ON(ret); 956 WARN_ON(ret);
938 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); 957 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
939 WARN_ON(ret); 958 WARN_ON(ret);
940 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); 959 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
941 WARN_ON(ret); 960 WARN_ON(ret);
942 ret = register_trace_block_getrq(blk_add_trace_getrq); 961 ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
943 WARN_ON(ret); 962 WARN_ON(ret);
944 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); 963 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
945 WARN_ON(ret); 964 WARN_ON(ret);
946 ret = register_trace_block_plug(blk_add_trace_plug); 965 ret = register_trace_block_plug(blk_add_trace_plug, NULL);
947 WARN_ON(ret); 966 WARN_ON(ret);
948 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); 967 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
949 WARN_ON(ret); 968 WARN_ON(ret);
950 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); 969 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
951 WARN_ON(ret); 970 WARN_ON(ret);
952 ret = register_trace_block_split(blk_add_trace_split); 971 ret = register_trace_block_split(blk_add_trace_split, NULL);
953 WARN_ON(ret); 972 WARN_ON(ret);
954 ret = register_trace_block_remap(blk_add_trace_remap); 973 ret = register_trace_block_remap(blk_add_trace_remap, NULL);
955 WARN_ON(ret); 974 WARN_ON(ret);
956 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); 975 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
957 WARN_ON(ret); 976 WARN_ON(ret);
958} 977}
959 978
960static void blk_unregister_tracepoints(void) 979static void blk_unregister_tracepoints(void)
961{ 980{
962 unregister_trace_block_rq_remap(blk_add_trace_rq_remap); 981 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
963 unregister_trace_block_remap(blk_add_trace_remap); 982 unregister_trace_block_remap(blk_add_trace_remap, NULL);
964 unregister_trace_block_split(blk_add_trace_split); 983 unregister_trace_block_split(blk_add_trace_split, NULL);
965 unregister_trace_block_unplug_io(blk_add_trace_unplug_io); 984 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
966 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); 985 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
967 unregister_trace_block_plug(blk_add_trace_plug); 986 unregister_trace_block_plug(blk_add_trace_plug, NULL);
968 unregister_trace_block_sleeprq(blk_add_trace_sleeprq); 987 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
969 unregister_trace_block_getrq(blk_add_trace_getrq); 988 unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
970 unregister_trace_block_bio_queue(blk_add_trace_bio_queue); 989 unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
971 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); 990 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
972 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); 991 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
973 unregister_trace_block_bio_complete(blk_add_trace_bio_complete); 992 unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
974 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); 993 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
975 unregister_trace_block_rq_complete(blk_add_trace_rq_complete); 994 unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
976 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); 995 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
977 unregister_trace_block_rq_issue(blk_add_trace_rq_issue); 996 unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
978 unregister_trace_block_rq_insert(blk_add_trace_rq_insert); 997 unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
979 unregister_trace_block_rq_abort(blk_add_trace_rq_abort); 998 unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
980 999
981 tracepoint_synchronize_unregister(); 1000 tracepoint_synchronize_unregister();
982} 1001}
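
The mechanical change running through this hunk and the rest of the file is the 2.6.35 tracepoint update: every probe gains a private data pointer as its first argument, and the register/unregister helpers take that pointer as a second parameter (NULL here, since blktrace keeps no per-registration state). A minimal sketch of the resulting pattern, with made-up names (my_probe, my_cookie) that are not part of the patch:

#include <trace/events/block.h>		/* declares register_trace_block_bio_queue() */

struct my_cookie { atomic_t hits; };	/* illustrative private state */
static struct my_cookie cookie;

/* the first argument is whatever pointer was passed at register time */
static void my_probe(void *data, struct request_queue *q, struct bio *bio)
{
	struct my_cookie *c = data;

	atomic_inc(&c->hits);
}

static int my_attach(void)
{
	/* the second argument comes back to the probe as "data" */
	return register_trace_block_bio_queue(my_probe, &cookie);
}

static void my_detach(void)
{
	unregister_trace_block_bio_queue(my_probe, &cookie);
	tracepoint_synchronize_unregister();	/* wait for in-flight probes */
}
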
@@ -1319,7 +1338,7 @@ out:
1319} 1338}
1320 1339
1321static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1340static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1322 int flags) 1341 int flags, struct trace_event *event)
1323{ 1342{
1324 return print_one_line(iter, false); 1343 return print_one_line(iter, false);
1325} 1344}
@@ -1341,7 +1360,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1341} 1360}
1342 1361
1343static enum print_line_t 1362static enum print_line_t
1344blk_trace_event_print_binary(struct trace_iterator *iter, int flags) 1363blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
1364 struct trace_event *event)
1345{ 1365{
1346 return blk_trace_synthesize_old_trace(iter) ? 1366 return blk_trace_synthesize_old_trace(iter) ?
1347 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1367 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
@@ -1379,12 +1399,16 @@ static struct tracer blk_tracer __read_mostly = {
1379 .set_flag = blk_tracer_set_flag, 1399 .set_flag = blk_tracer_set_flag,
1380}; 1400};
1381 1401
1382static struct trace_event trace_blk_event = { 1402static struct trace_event_functions trace_blk_event_funcs = {
1383 .type = TRACE_BLK,
1384 .trace = blk_trace_event_print, 1403 .trace = blk_trace_event_print,
1385 .binary = blk_trace_event_print_binary, 1404 .binary = blk_trace_event_print_binary,
1386}; 1405};
1387 1406
1407static struct trace_event trace_blk_event = {
1408 .type = TRACE_BLK,
1409 .funcs = &trace_blk_event_funcs,
1410};
1411
1388static int __init init_blk_tracer(void) 1412static int __init init_blk_tracer(void)
1389{ 1413{
1390 if (!register_ftrace_event(&trace_blk_event)) { 1414 if (!register_ftrace_event(&trace_blk_event)) {
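
The reshuffle above is the trace_event_functions split that lands in 2.6.35: the output callbacks move into a shared struct trace_event_functions that struct trace_event only points at through .funcs, and each callback now also receives the struct trace_event itself. A hedged sketch of how a print path is then expected to dispatch; the wrapper below is illustrative, not a quote of the tracer core:

static enum print_line_t print_blk_event(struct trace_iterator *iter,
					 struct trace_event *event, int flags)
{
	if (event->funcs->trace)
		return event->funcs->trace(iter, flags, event);

	return TRACE_TYPE_UNHANDLED;
}
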
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1e6640f80454..6d2cb14f9449 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,12 +22,13 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/kprobes.h>
26#include <linux/ftrace.h> 25#include <linux/ftrace.h>
27#include <linux/sysctl.h> 26#include <linux/sysctl.h>
27#include <linux/slab.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/hash.h> 30#include <linux/hash.h>
31#include <linux/rcupdate.h>
31 32
32#include <trace/events/sched.h> 33#include <trace/events/sched.h>
33 34
@@ -85,22 +86,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
85ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 86ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
86ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 87ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
87 88
88#ifdef CONFIG_FUNCTION_GRAPH_TRACER 89/*
89static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); 90 * Traverse the ftrace_list, invoking all entries. The reason that we
90#endif 91 * can use rcu_dereference_raw() is that elements removed from this list
91 92 * are simply leaked, so there is no need to interact with a grace-period
93 * mechanism. The rcu_dereference_raw() calls are needed to handle
94 * concurrent insertions into the ftrace_list.
95 *
96 * Silly Alpha and silly pointer-speculation compiler optimizations!
97 */
92static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 98static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
93{ 99{
94 struct ftrace_ops *op = ftrace_list; 100 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/
95
96 /* in case someone actually ports this to alpha! */
97 read_barrier_depends();
98 101
99 while (op != &ftrace_list_end) { 102 while (op != &ftrace_list_end) {
100 /* silly alpha */
101 read_barrier_depends();
102 op->func(ip, parent_ip); 103 op->func(ip, parent_ip);
103 op = op->next; 104 op = rcu_dereference_raw(op->next); /*see above*/
104 }; 105 };
105} 106}
106 107
@@ -155,8 +156,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
155 * the ops->next pointer is valid before another CPU sees 156 * the ops->next pointer is valid before another CPU sees
156 * the ops pointer included into the ftrace_list. 157 * the ops pointer included into the ftrace_list.
157 */ 158 */
158 smp_wmb(); 159 rcu_assign_pointer(ftrace_list, ops);
159 ftrace_list = ops;
160 160
161 if (ftrace_enabled) { 161 if (ftrace_enabled) {
162 ftrace_func_t func; 162 ftrace_func_t func;
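
Both hunks above trade open-coded barriers for the RCU list-publication idiom: the writer publishes a new ftrace_ops with rcu_assign_pointer(), and the lockless caller path walks the list through rcu_dereference_raw(). The raw variant is sufficient only because removed entries are deliberately leaked rather than freed, so the reader has no grace period to respect. A compressed sketch of the pairing, assuming the file-local ftrace_list and ftrace_list_end symbols:

static void demo_publish(struct ftrace_ops *ops)
{
	ops->next = ftrace_list;		/* link the new entry in first ... */
	rcu_assign_pointer(ftrace_list, ops);	/* ... then publish it */
}

static void demo_traverse(unsigned long ip, unsigned long parent_ip)
{
	struct ftrace_ops *op;

	for (op = rcu_dereference_raw(ftrace_list);
	     op != &ftrace_list_end;
	     op = rcu_dereference_raw(op->next))
		op->func(ip, parent_ip);	/* call every registered tracer */
}
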
@@ -264,6 +264,7 @@ struct ftrace_profile {
264 unsigned long counter; 264 unsigned long counter;
265#ifdef CONFIG_FUNCTION_GRAPH_TRACER 265#ifdef CONFIG_FUNCTION_GRAPH_TRACER
266 unsigned long long time; 266 unsigned long long time;
267 unsigned long long time_squared;
267#endif 268#endif
268}; 269};
269 270
@@ -366,9 +367,9 @@ static int function_stat_headers(struct seq_file *m)
366{ 367{
367#ifdef CONFIG_FUNCTION_GRAPH_TRACER 368#ifdef CONFIG_FUNCTION_GRAPH_TRACER
368 seq_printf(m, " Function " 369 seq_printf(m, " Function "
369 "Hit Time Avg\n" 370 "Hit Time Avg s^2\n"
370 " -------- " 371 " -------- "
371 "--- ---- ---\n"); 372 "--- ---- --- ---\n");
372#else 373#else
373 seq_printf(m, " Function Hit\n" 374 seq_printf(m, " Function Hit\n"
374 " -------- ---\n"); 375 " -------- ---\n");
@@ -384,6 +385,7 @@ static int function_stat_show(struct seq_file *m, void *v)
384 static DEFINE_MUTEX(mutex); 385 static DEFINE_MUTEX(mutex);
385 static struct trace_seq s; 386 static struct trace_seq s;
386 unsigned long long avg; 387 unsigned long long avg;
388 unsigned long long stddev;
387#endif 389#endif
388 390
389 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 391 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -394,11 +396,25 @@ static int function_stat_show(struct seq_file *m, void *v)
394 avg = rec->time; 396 avg = rec->time;
395 do_div(avg, rec->counter); 397 do_div(avg, rec->counter);
396 398
399 /* Sample standard deviation (s^2) */
400 if (rec->counter <= 1)
401 stddev = 0;
402 else {
403 stddev = rec->time_squared - rec->counter * avg * avg;
404 /*
405 * Divide only 1000 for ns^2 -> us^2 conversion.
406 * trace_print_graph_duration will divide 1000 again.
407 */
408 do_div(stddev, (rec->counter - 1) * 1000);
409 }
410
397 mutex_lock(&mutex); 411 mutex_lock(&mutex);
398 trace_seq_init(&s); 412 trace_seq_init(&s);
399 trace_print_graph_duration(rec->time, &s); 413 trace_print_graph_duration(rec->time, &s);
400 trace_seq_puts(&s, " "); 414 trace_seq_puts(&s, " ");
401 trace_print_graph_duration(avg, &s); 415 trace_print_graph_duration(avg, &s);
416 trace_seq_puts(&s, " ");
417 trace_print_graph_duration(stddev, &s);
402 trace_print_seq(m, &s); 418 trace_print_seq(m, &s);
403 mutex_unlock(&mutex); 419 mutex_unlock(&mutex);
404#endif 420#endif
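
For the record, the arithmetic behind the new column: with n = rec->counter samples t_1..t_n and avg = (t_1 + ... + t_n)/n, the sample variance is sum((t_i - avg)^2)/(n - 1), which expands to (sum(t_i^2) - n*avg^2)/(n - 1); that is exactly rec->time_squared minus counter*avg*avg, divided by counter - 1. The do_div() by 1000 here, combined with the second divide-by-1000 inside trace_print_graph_duration(), supplies the factor of 10^6 needed for the ns^2 to us^2 conversion. Note that the value printed under the s^2 header is the variance (the squared deviation), not its square root, despite the stddev variable name.
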
@@ -650,6 +666,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
650 if (!stat->hash || !ftrace_profile_enabled) 666 if (!stat->hash || !ftrace_profile_enabled)
651 goto out; 667 goto out;
652 668
669 /* If the calltime was zero'd ignore it */
670 if (!trace->calltime)
671 goto out;
672
653 calltime = trace->rettime - trace->calltime; 673 calltime = trace->rettime - trace->calltime;
654 674
655 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { 675 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
@@ -668,8 +688,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
668 } 688 }
669 689
670 rec = ftrace_find_profiled_func(stat, trace->func); 690 rec = ftrace_find_profiled_func(stat, trace->func);
671 if (rec) 691 if (rec) {
672 rec->time += calltime; 692 rec->time += calltime;
693 rec->time_squared += calltime * calltime;
694 }
673 695
674 out: 696 out:
675 local_irq_restore(flags); 697 local_irq_restore(flags);
@@ -898,36 +920,6 @@ static struct dyn_ftrace *ftrace_free_records;
898 } \ 920 } \
899 } 921 }
900 922
901#ifdef CONFIG_KPROBES
902
903static int frozen_record_count;
904
905static inline void freeze_record(struct dyn_ftrace *rec)
906{
907 if (!(rec->flags & FTRACE_FL_FROZEN)) {
908 rec->flags |= FTRACE_FL_FROZEN;
909 frozen_record_count++;
910 }
911}
912
913static inline void unfreeze_record(struct dyn_ftrace *rec)
914{
915 if (rec->flags & FTRACE_FL_FROZEN) {
916 rec->flags &= ~FTRACE_FL_FROZEN;
917 frozen_record_count--;
918 }
919}
920
921static inline int record_frozen(struct dyn_ftrace *rec)
922{
923 return rec->flags & FTRACE_FL_FROZEN;
924}
925#else
926# define freeze_record(rec) ({ 0; })
927# define unfreeze_record(rec) ({ 0; })
928# define record_frozen(rec) ({ 0; })
929#endif /* CONFIG_KPROBES */
930
931static void ftrace_free_rec(struct dyn_ftrace *rec) 923static void ftrace_free_rec(struct dyn_ftrace *rec)
932{ 924{
933 rec->freelist = ftrace_free_records; 925 rec->freelist = ftrace_free_records;
@@ -1025,6 +1017,21 @@ static void ftrace_bug(int failed, unsigned long ip)
1025} 1017}
1026 1018
1027 1019
1020/* Return 1 if the address range is reserved for ftrace */
1021int ftrace_text_reserved(void *start, void *end)
1022{
1023 struct dyn_ftrace *rec;
1024 struct ftrace_page *pg;
1025
1026 do_for_each_ftrace_rec(pg, rec) {
1027 if (rec->ip <= (unsigned long)end &&
1028 rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
1029 return 1;
1030 } while_for_each_ftrace_rec();
1031 return 0;
1032}
1033
1034
1028static int 1035static int
1029__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1036__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1030{ 1037{
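
The new ftrace_text_reserved() helper gives other text-patching code a way to check whether a candidate range overlaps an mcount call site before touching it, which is what lets the kprobes-specific freeze/unfreeze machinery be deleted further down. A hedged sketch of a caller; the wrapper and error code are illustrative rather than copied from kprobes:

static int can_patch_text(void *addr, size_t len)
{
	/* refuse to patch instructions that belong to an ftrace mcount site */
	if (ftrace_text_reserved(addr, addr + len - 1))
		return -EBUSY;

	return 0;
}
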
@@ -1076,14 +1083,6 @@ static void ftrace_replace_code(int enable)
1076 !(rec->flags & FTRACE_FL_CONVERTED)) 1083 !(rec->flags & FTRACE_FL_CONVERTED))
1077 continue; 1084 continue;
1078 1085
1079 /* ignore updates to this record's mcount site */
1080 if (get_kprobe((void *)rec->ip)) {
1081 freeze_record(rec);
1082 continue;
1083 } else {
1084 unfreeze_record(rec);
1085 }
1086
1087 failed = __ftrace_replace_code(rec, enable); 1086 failed = __ftrace_replace_code(rec, enable);
1088 if (failed) { 1087 if (failed) {
1089 rec->flags |= FTRACE_FL_FAILED; 1088 rec->flags |= FTRACE_FL_FAILED;
@@ -2300,6 +2299,8 @@ __setup("ftrace_filter=", set_ftrace_filter);
2300 2299
2301#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2300#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2302static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; 2301static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2302static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
2303
2303static int __init set_graph_function(char *str) 2304static int __init set_graph_function(char *str)
2304{ 2305{
2305 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 2306 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -2426,6 +2427,7 @@ static const struct file_operations ftrace_notrace_fops = {
2426static DEFINE_MUTEX(graph_lock); 2427static DEFINE_MUTEX(graph_lock);
2427 2428
2428int ftrace_graph_count; 2429int ftrace_graph_count;
2430int ftrace_graph_filter_enabled;
2429unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2431unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2430 2432
2431static void * 2433static void *
@@ -2448,7 +2450,7 @@ static void *g_start(struct seq_file *m, loff_t *pos)
2448 mutex_lock(&graph_lock); 2450 mutex_lock(&graph_lock);
2449 2451
2450 /* Nothing, tell g_show to print all functions are enabled */ 2452 /* Nothing, tell g_show to print all functions are enabled */
2451 if (!ftrace_graph_count && !*pos) 2453 if (!ftrace_graph_filter_enabled && !*pos)
2452 return (void *)1; 2454 return (void *)1;
2453 2455
2454 return __g_next(m, pos); 2456 return __g_next(m, pos);
@@ -2494,6 +2496,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2494 mutex_lock(&graph_lock); 2496 mutex_lock(&graph_lock);
2495 if ((file->f_mode & FMODE_WRITE) && 2497 if ((file->f_mode & FMODE_WRITE) &&
2496 (file->f_flags & O_TRUNC)) { 2498 (file->f_flags & O_TRUNC)) {
2499 ftrace_graph_filter_enabled = 0;
2497 ftrace_graph_count = 0; 2500 ftrace_graph_count = 0;
2498 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2501 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2499 } 2502 }
@@ -2519,7 +2522,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2519 struct dyn_ftrace *rec; 2522 struct dyn_ftrace *rec;
2520 struct ftrace_page *pg; 2523 struct ftrace_page *pg;
2521 int search_len; 2524 int search_len;
2522 int found = 0; 2525 int fail = 1;
2523 int type, not; 2526 int type, not;
2524 char *search; 2527 char *search;
2525 bool exists; 2528 bool exists;
@@ -2530,37 +2533,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2530 2533
2531 /* decode regex */ 2534 /* decode regex */
2532 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 2535 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2533 if (not) 2536 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
2534 return -EINVAL; 2537 return -EBUSY;
2535 2538
2536 search_len = strlen(search); 2539 search_len = strlen(search);
2537 2540
2538 mutex_lock(&ftrace_lock); 2541 mutex_lock(&ftrace_lock);
2539 do_for_each_ftrace_rec(pg, rec) { 2542 do_for_each_ftrace_rec(pg, rec) {
2540 2543
2541 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2542 break;
2543
2544 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 2544 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
2545 continue; 2545 continue;
2546 2546
2547 if (ftrace_match_record(rec, search, search_len, type)) { 2547 if (ftrace_match_record(rec, search, search_len, type)) {
2548 /* ensure it is not already in the array */ 2548 /* if it is in the array */
2549 exists = false; 2549 exists = false;
2550 for (i = 0; i < *idx; i++) 2550 for (i = 0; i < *idx; i++) {
2551 if (array[i] == rec->ip) { 2551 if (array[i] == rec->ip) {
2552 exists = true; 2552 exists = true;
2553 break; 2553 break;
2554 } 2554 }
2555 if (!exists) 2555 }
2556 array[(*idx)++] = rec->ip; 2556
2557 found = 1; 2557 if (!not) {
2558 fail = 0;
2559 if (!exists) {
2560 array[(*idx)++] = rec->ip;
2561 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2562 goto out;
2563 }
2564 } else {
2565 if (exists) {
2566 array[i] = array[--(*idx)];
2567 array[*idx] = 0;
2568 fail = 0;
2569 }
2570 }
2558 } 2571 }
2559 } while_for_each_ftrace_rec(); 2572 } while_for_each_ftrace_rec();
2560 2573out:
2561 mutex_unlock(&ftrace_lock); 2574 mutex_unlock(&ftrace_lock);
2562 2575
2563 return found ? 0 : -EINVAL; 2576 if (fail)
2577 return -EINVAL;
2578
2579 ftrace_graph_filter_enabled = 1;
2580 return 0;
2564} 2581}
2565 2582
2566static ssize_t 2583static ssize_t
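
The rewrite above turns set_graph_function from an append-only list into one that also honours negation: a plain pattern adds matching functions, while a pattern written with a leading '!' removes them again, using the swap-with-last-slot trick to keep the array dense. A small sketch of the parsing step it leans on; the buffer contents are illustrative:

static int demo_parse(void)
{
	char buf[] = "!do_IRQ";	/* illustrative pattern */
	char *search;
	int type, not;

	/* a leading '!' is stripped and reported through "not" */
	type = filter_parse_regex(buf, strlen(buf), &search, &not);

	/* here: not == 1, search points at "do_IRQ", type is MATCH_FULL */
	return type;
}
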
@@ -2570,16 +2587,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2570 struct trace_parser parser; 2587 struct trace_parser parser;
2571 ssize_t read, ret; 2588 ssize_t read, ret;
2572 2589
2573 if (!cnt || cnt < 0) 2590 if (!cnt)
2574 return 0; 2591 return 0;
2575 2592
2576 mutex_lock(&graph_lock); 2593 mutex_lock(&graph_lock);
2577 2594
2578 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2579 ret = -EBUSY;
2580 goto out_unlock;
2581 }
2582
2583 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { 2595 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2584 ret = -ENOMEM; 2596 ret = -ENOMEM;
2585 goto out_unlock; 2597 goto out_unlock;
@@ -3222,8 +3234,8 @@ free:
3222} 3234}
3223 3235
3224static void 3236static void
3225ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, 3237ftrace_graph_probe_sched_switch(void *ignore,
3226 struct task_struct *next) 3238 struct task_struct *prev, struct task_struct *next)
3227{ 3239{
3228 unsigned long long timestamp; 3240 unsigned long long timestamp;
3229 int index; 3241 int index;
@@ -3277,7 +3289,7 @@ static int start_graph_tracing(void)
3277 } while (ret == -EAGAIN); 3289 } while (ret == -EAGAIN);
3278 3290
3279 if (!ret) { 3291 if (!ret) {
3280 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); 3292 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3281 if (ret) 3293 if (ret)
3282 pr_info("ftrace_graph: Couldn't activate tracepoint" 3294 pr_info("ftrace_graph: Couldn't activate tracepoint"
3283 " probe to kernel_sched_switch\n"); 3295 " probe to kernel_sched_switch\n");
@@ -3349,11 +3361,11 @@ void unregister_ftrace_graph(void)
3349 goto out; 3361 goto out;
3350 3362
3351 ftrace_graph_active--; 3363 ftrace_graph_active--;
3352 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
3353 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 3364 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3354 ftrace_graph_entry = ftrace_graph_entry_stub; 3365 ftrace_graph_entry = ftrace_graph_entry_stub;
3355 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 3366 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
3356 unregister_pm_notifier(&ftrace_suspend_notifier); 3367 unregister_pm_notifier(&ftrace_suspend_notifier);
3368 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3357 3369
3358 out: 3370 out:
3359 mutex_unlock(&ftrace_lock); 3371 mutex_unlock(&ftrace_lock);
@@ -3364,6 +3376,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3364{ 3376{
3365 /* Make sure we do not use the parent ret_stack */ 3377 /* Make sure we do not use the parent ret_stack */
3366 t->ret_stack = NULL; 3378 t->ret_stack = NULL;
3379 t->curr_ret_stack = -1;
3367 3380
3368 if (ftrace_graph_active) { 3381 if (ftrace_graph_active) {
3369 struct ftrace_ret_stack *ret_stack; 3382 struct ftrace_ret_stack *ret_stack;
@@ -3373,7 +3386,6 @@ void ftrace_graph_init_task(struct task_struct *t)
3373 GFP_KERNEL); 3386 GFP_KERNEL);
3374 if (!ret_stack) 3387 if (!ret_stack)
3375 return; 3388 return;
3376 t->curr_ret_stack = -1;
3377 atomic_set(&t->tracing_graph_pause, 0); 3389 atomic_set(&t->tracing_graph_pause, 0);
3378 atomic_set(&t->trace_overrun, 0); 3390 atomic_set(&t->trace_overrun, 0);
3379 t->ftrace_timestamp = 0; 3391 t->ftrace_timestamp = 0;
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index a91da69f153a..bbfc1bb1660b 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -95,7 +95,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
95 trace_wake_up(); 95 trace_wake_up();
96} 96}
97 97
98static void kmemtrace_kmalloc(unsigned long call_site, 98static void kmemtrace_kmalloc(void *ignore,
99 unsigned long call_site,
99 const void *ptr, 100 const void *ptr,
100 size_t bytes_req, 101 size_t bytes_req,
101 size_t bytes_alloc, 102 size_t bytes_alloc,
@@ -105,7 +106,8 @@ static void kmemtrace_kmalloc(unsigned long call_site,
105 bytes_req, bytes_alloc, gfp_flags, -1); 106 bytes_req, bytes_alloc, gfp_flags, -1);
106} 107}
107 108
108static void kmemtrace_kmem_cache_alloc(unsigned long call_site, 109static void kmemtrace_kmem_cache_alloc(void *ignore,
110 unsigned long call_site,
109 const void *ptr, 111 const void *ptr,
110 size_t bytes_req, 112 size_t bytes_req,
111 size_t bytes_alloc, 113 size_t bytes_alloc,
@@ -115,7 +117,8 @@ static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
115 bytes_req, bytes_alloc, gfp_flags, -1); 117 bytes_req, bytes_alloc, gfp_flags, -1);
116} 118}
117 119
118static void kmemtrace_kmalloc_node(unsigned long call_site, 120static void kmemtrace_kmalloc_node(void *ignore,
121 unsigned long call_site,
119 const void *ptr, 122 const void *ptr,
120 size_t bytes_req, 123 size_t bytes_req,
121 size_t bytes_alloc, 124 size_t bytes_alloc,
@@ -126,7 +129,8 @@ static void kmemtrace_kmalloc_node(unsigned long call_site,
126 bytes_req, bytes_alloc, gfp_flags, node); 129 bytes_req, bytes_alloc, gfp_flags, node);
127} 130}
128 131
129static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, 132static void kmemtrace_kmem_cache_alloc_node(void *ignore,
133 unsigned long call_site,
130 const void *ptr, 134 const void *ptr,
131 size_t bytes_req, 135 size_t bytes_req,
132 size_t bytes_alloc, 136 size_t bytes_alloc,
@@ -137,12 +141,14 @@ static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
137 bytes_req, bytes_alloc, gfp_flags, node); 141 bytes_req, bytes_alloc, gfp_flags, node);
138} 142}
139 143
140static void kmemtrace_kfree(unsigned long call_site, const void *ptr) 144static void
145kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
141{ 146{
142 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); 147 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
143} 148}
144 149
145static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr) 150static void kmemtrace_kmem_cache_free(void *ignore,
151 unsigned long call_site, const void *ptr)
146{ 152{
147 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); 153 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
148} 154}
@@ -151,34 +157,34 @@ static int kmemtrace_start_probes(void)
151{ 157{
152 int err; 158 int err;
153 159
154 err = register_trace_kmalloc(kmemtrace_kmalloc); 160 err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
155 if (err) 161 if (err)
156 return err; 162 return err;
157 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); 163 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
158 if (err) 164 if (err)
159 return err; 165 return err;
160 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node); 166 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
161 if (err) 167 if (err)
162 return err; 168 return err;
163 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); 169 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
164 if (err) 170 if (err)
165 return err; 171 return err;
166 err = register_trace_kfree(kmemtrace_kfree); 172 err = register_trace_kfree(kmemtrace_kfree, NULL);
167 if (err) 173 if (err)
168 return err; 174 return err;
169 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free); 175 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
170 176
171 return err; 177 return err;
172} 178}
173 179
174static void kmemtrace_stop_probes(void) 180static void kmemtrace_stop_probes(void)
175{ 181{
176 unregister_trace_kmalloc(kmemtrace_kmalloc); 182 unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
177 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); 183 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
178 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node); 184 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
179 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); 185 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
180 unregister_trace_kfree(kmemtrace_kfree); 186 unregister_trace_kfree(kmemtrace_kfree, NULL);
181 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free); 187 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
182} 188}
183 189
184static int kmem_trace_init(struct trace_array *tr) 190static int kmem_trace_init(struct trace_array *tr)
@@ -237,7 +243,8 @@ struct kmemtrace_user_event_alloc {
237}; 243};
238 244
239static enum print_line_t 245static enum print_line_t
240kmemtrace_print_alloc(struct trace_iterator *iter, int flags) 246kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
247 struct trace_event *event)
241{ 248{
242 struct trace_seq *s = &iter->seq; 249 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_alloc_entry *entry; 250 struct kmemtrace_alloc_entry *entry;
@@ -257,7 +264,8 @@ kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
257} 264}
258 265
259static enum print_line_t 266static enum print_line_t
260kmemtrace_print_free(struct trace_iterator *iter, int flags) 267kmemtrace_print_free(struct trace_iterator *iter, int flags,
268 struct trace_event *event)
261{ 269{
262 struct trace_seq *s = &iter->seq; 270 struct trace_seq *s = &iter->seq;
263 struct kmemtrace_free_entry *entry; 271 struct kmemtrace_free_entry *entry;
@@ -275,7 +283,8 @@ kmemtrace_print_free(struct trace_iterator *iter, int flags)
275} 283}
276 284
277static enum print_line_t 285static enum print_line_t
278kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) 286kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
287 struct trace_event *event)
279{ 288{
280 struct trace_seq *s = &iter->seq; 289 struct trace_seq *s = &iter->seq;
281 struct kmemtrace_alloc_entry *entry; 290 struct kmemtrace_alloc_entry *entry;
@@ -309,7 +318,8 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
309} 318}
310 319
311static enum print_line_t 320static enum print_line_t
312kmemtrace_print_free_user(struct trace_iterator *iter, int flags) 321kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
322 struct trace_event *event)
313{ 323{
314 struct trace_seq *s = &iter->seq; 324 struct trace_seq *s = &iter->seq;
315 struct kmemtrace_free_entry *entry; 325 struct kmemtrace_free_entry *entry;
@@ -463,18 +473,26 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
463 } 473 }
464} 474}
465 475
466static struct trace_event kmem_trace_alloc = { 476static struct trace_event_functions kmem_trace_alloc_funcs = {
467 .type = TRACE_KMEM_ALLOC,
468 .trace = kmemtrace_print_alloc, 477 .trace = kmemtrace_print_alloc,
469 .binary = kmemtrace_print_alloc_user, 478 .binary = kmemtrace_print_alloc_user,
470}; 479};
471 480
472static struct trace_event kmem_trace_free = { 481static struct trace_event kmem_trace_alloc = {
473 .type = TRACE_KMEM_FREE, 482 .type = TRACE_KMEM_ALLOC,
483 .funcs = &kmem_trace_alloc_funcs,
484};
485
486static struct trace_event_functions kmem_trace_free_funcs = {
474 .trace = kmemtrace_print_free, 487 .trace = kmemtrace_print_free,
475 .binary = kmemtrace_print_free_user, 488 .binary = kmemtrace_print_free_user,
476}; 489};
477 490
491static struct trace_event kmem_trace_free = {
492 .type = TRACE_KMEM_FREE,
493 .funcs = &kmem_trace_free_funcs,
494};
495
478static struct tracer kmem_tracer __read_mostly = { 496static struct tracer kmem_tracer __read_mostly = {
479 .name = "kmemtrace", 497 .name = "kmemtrace",
480 .init = kmem_trace_init, 498 .init = kmem_trace_init,
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 9f4f565b01e6..a22582a06161 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -9,7 +9,6 @@
9#include <linux/workqueue.h> 9#include <linux/workqueue.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/slab.h>
13 12
14#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
15#include <trace/events/power.h> 14#include <trace/events/power.h>
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index edefe3b2801b..1da7b6ea8b85 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -14,12 +14,14 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/percpu.h> 15#include <linux/percpu.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/hash.h> 19#include <linux/hash.h>
19#include <linux/list.h> 20#include <linux/list.h>
20#include <linux/cpu.h> 21#include <linux/cpu.h>
21#include <linux/fs.h> 22#include <linux/fs.h>
22 23
24#include <asm/local.h>
23#include "trace.h" 25#include "trace.h"
24 26
25/* 27/*
@@ -206,6 +208,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
206#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
207#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ 209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
208 210
211#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
212# define RB_FORCE_8BYTE_ALIGNMENT 0
213# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
214#else
215# define RB_FORCE_8BYTE_ALIGNMENT 1
216# define RB_ARCH_ALIGNMENT 8U
217#endif
218
209/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 219/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
210#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 220#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
211 221
@@ -309,6 +319,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
309#define TS_MASK ((1ULL << TS_SHIFT) - 1) 319#define TS_MASK ((1ULL << TS_SHIFT) - 1)
310#define TS_DELTA_TEST (~TS_MASK) 320#define TS_DELTA_TEST (~TS_MASK)
311 321
322/* Flag when events were overwritten */
323#define RB_MISSED_EVENTS (1 << 31)
324/* Missed count stored at end */
325#define RB_MISSED_STORED (1 << 30)
326
312struct buffer_data_page { 327struct buffer_data_page {
313 u64 time_stamp; /* page time stamp */ 328 u64 time_stamp; /* page time stamp */
314 local_t commit; /* write committed index */ 329 local_t commit; /* write committed index */
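
Because these two flags are carved out of the same commit word that records how much data the page holds, anything treating that word as a length has to mask them off first (the benchmark change near the end of this patch does the equivalent with a 0xfffff mask). A one-line sketch, assuming the buffer_data_page layout shown above:

	unsigned long size = local_read(&bpage->commit) &
			     ~(RB_MISSED_EVENTS | RB_MISSED_STORED);
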
@@ -328,6 +343,7 @@ struct buffer_page {
328 local_t write; /* index for next write */ 343 local_t write; /* index for next write */
329 unsigned read; /* index for next read */ 344 unsigned read; /* index for next read */
330 local_t entries; /* entries on this page */ 345 local_t entries; /* entries on this page */
346 unsigned long real_end; /* real end of data */
331 struct buffer_data_page *page; /* Actual data page */ 347 struct buffer_data_page *page; /* Actual data page */
332}; 348};
333 349
@@ -407,6 +423,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
407 (unsigned int)sizeof(field.commit), 423 (unsigned int)sizeof(field.commit),
408 (unsigned int)is_signed_type(long)); 424 (unsigned int)is_signed_type(long));
409 425
426 ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
427 "offset:%u;\tsize:%u;\tsigned:%u;\n",
428 (unsigned int)offsetof(typeof(field), commit),
429 1,
430 (unsigned int)is_signed_type(long));
431
410 ret = trace_seq_printf(s, "\tfield: char data;\t" 432 ret = trace_seq_printf(s, "\tfield: char data;\t"
411 "offset:%u;\tsize:%u;\tsigned:%u;\n", 433 "offset:%u;\tsize:%u;\tsigned:%u;\n",
412 (unsigned int)offsetof(typeof(field), data), 434 (unsigned int)offsetof(typeof(field), data),
@@ -430,6 +452,8 @@ struct ring_buffer_per_cpu {
430 struct buffer_page *tail_page; /* write to tail */ 452 struct buffer_page *tail_page; /* write to tail */
431 struct buffer_page *commit_page; /* committed pages */ 453 struct buffer_page *commit_page; /* committed pages */
432 struct buffer_page *reader_page; 454 struct buffer_page *reader_page;
455 unsigned long lost_events;
456 unsigned long last_overrun;
433 local_t commit_overrun; 457 local_t commit_overrun;
434 local_t overrun; 458 local_t overrun;
435 local_t entries; 459 local_t entries;
@@ -464,6 +488,8 @@ struct ring_buffer_iter {
464 struct ring_buffer_per_cpu *cpu_buffer; 488 struct ring_buffer_per_cpu *cpu_buffer;
465 unsigned long head; 489 unsigned long head;
466 struct buffer_page *head_page; 490 struct buffer_page *head_page;
491 struct buffer_page *cache_reader_page;
492 unsigned long cache_read;
467 u64 read_stamp; 493 u64 read_stamp;
468}; 494};
469 495
@@ -1198,18 +1224,19 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1198 1224
1199 for (i = 0; i < nr_pages; i++) { 1225 for (i = 0; i < nr_pages; i++) {
1200 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1226 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1201 return; 1227 goto out;
1202 p = cpu_buffer->pages->next; 1228 p = cpu_buffer->pages->next;
1203 bpage = list_entry(p, struct buffer_page, list); 1229 bpage = list_entry(p, struct buffer_page, list);
1204 list_del_init(&bpage->list); 1230 list_del_init(&bpage->list);
1205 free_buffer_page(bpage); 1231 free_buffer_page(bpage);
1206 } 1232 }
1207 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1233 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1208 return; 1234 goto out;
1209 1235
1210 rb_reset_cpu(cpu_buffer); 1236 rb_reset_cpu(cpu_buffer);
1211 rb_check_pages(cpu_buffer); 1237 rb_check_pages(cpu_buffer);
1212 1238
1239out:
1213 spin_unlock_irq(&cpu_buffer->reader_lock); 1240 spin_unlock_irq(&cpu_buffer->reader_lock);
1214} 1241}
1215 1242
@@ -1226,7 +1253,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1226 1253
1227 for (i = 0; i < nr_pages; i++) { 1254 for (i = 0; i < nr_pages; i++) {
1228 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1255 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
1229 return; 1256 goto out;
1230 p = pages->next; 1257 p = pages->next;
1231 bpage = list_entry(p, struct buffer_page, list); 1258 bpage = list_entry(p, struct buffer_page, list);
1232 list_del_init(&bpage->list); 1259 list_del_init(&bpage->list);
@@ -1235,6 +1262,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1235 rb_reset_cpu(cpu_buffer); 1262 rb_reset_cpu(cpu_buffer);
1236 rb_check_pages(cpu_buffer); 1263 rb_check_pages(cpu_buffer);
1237 1264
1265out:
1238 spin_unlock_irq(&cpu_buffer->reader_lock); 1266 spin_unlock_irq(&cpu_buffer->reader_lock);
1239} 1267}
1240 1268
@@ -1544,7 +1572,7 @@ rb_update_event(struct ring_buffer_event *event,
1544 1572
1545 case 0: 1573 case 0:
1546 length -= RB_EVNT_HDR_SIZE; 1574 length -= RB_EVNT_HDR_SIZE;
1547 if (length > RB_MAX_SMALL_DATA) 1575 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1548 event->array[0] = length; 1576 event->array[0] = length;
1549 else 1577 else
1550 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1578 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
@@ -1719,11 +1747,11 @@ static unsigned rb_calculate_event_length(unsigned length)
1719 if (!length) 1747 if (!length)
1720 length = 1; 1748 length = 1;
1721 1749
1722 if (length > RB_MAX_SMALL_DATA) 1750 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1723 length += sizeof(event.array[0]); 1751 length += sizeof(event.array[0]);
1724 1752
1725 length += RB_EVNT_HDR_SIZE; 1753 length += RB_EVNT_HDR_SIZE;
1726 length = ALIGN(length, RB_ALIGNMENT); 1754 length = ALIGN(length, RB_ARCH_ALIGNMENT);
1727 1755
1728 return length; 1756 return length;
1729} 1757}
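
To put numbers on the new alignment rule: assuming the usual 4-byte event header, a 12-byte payload on an architecture with efficient unaligned access (RB_FORCE_8BYTE_ALIGNMENT == 0) still uses the compact encoding, so the event costs ALIGN(12 + 4, 4) = 16 bytes with the size packed into type_len. On a 64-bit architecture without that capability the same payload is pushed through the explicit length word in array[0], costing 12 + 4 + 4 = 20 bytes, rounded up to ALIGN(20, 8) = 24. The payload size is only an illustrative example; the patch fixes the rule, not the numbers.
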
@@ -1740,6 +1768,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1740 * must fill the old tail_page with padding. 1768 * must fill the old tail_page with padding.
1741 */ 1769 */
1742 if (tail >= BUF_PAGE_SIZE) { 1770 if (tail >= BUF_PAGE_SIZE) {
1771 /*
1772 * If the page was filled, then we still need
1773 * to update the real_end. Reset it to zero
1774 * and the reader will ignore it.
1775 */
1776 if (tail == BUF_PAGE_SIZE)
1777 tail_page->real_end = 0;
1778
1743 local_sub(length, &tail_page->write); 1779 local_sub(length, &tail_page->write);
1744 return; 1780 return;
1745 } 1781 }
@@ -1748,6 +1784,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1748 kmemcheck_annotate_bitfield(event, bitfield); 1784 kmemcheck_annotate_bitfield(event, bitfield);
1749 1785
1750 /* 1786 /*
1787 * Save the original length to the meta data.
1788 * This will be used by the reader to add lost event
1789 * counter.
1790 */
1791 tail_page->real_end = tail;
1792
1793 /*
1751 * If this event is bigger than the minimum size, then 1794 * If this event is bigger than the minimum size, then
1752 * we need to be careful that we don't subtract the 1795 * we need to be careful that we don't subtract the
1753 * write counter enough to allow another writer to slip 1796 * write counter enough to allow another writer to slip
@@ -1965,17 +2008,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1965 u64 *ts, u64 *delta) 2008 u64 *ts, u64 *delta)
1966{ 2009{
1967 struct ring_buffer_event *event; 2010 struct ring_buffer_event *event;
1968 static int once;
1969 int ret; 2011 int ret;
1970 2012
1971 if (unlikely(*delta > (1ULL << 59) && !once++)) { 2013 WARN_ONCE(*delta > (1ULL << 59),
1972 printk(KERN_WARNING "Delta way too big! %llu" 2014 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
1973 " ts=%llu write stamp = %llu\n", 2015 (unsigned long long)*delta,
1974 (unsigned long long)*delta, 2016 (unsigned long long)*ts,
1975 (unsigned long long)*ts, 2017 (unsigned long long)cpu_buffer->write_stamp);
1976 (unsigned long long)cpu_buffer->write_stamp);
1977 WARN_ON(1);
1978 }
1979 2018
1980 /* 2019 /*
1981 * The delta is too big, we to add a 2020 * The delta is too big, we to add a
@@ -2230,12 +2269,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2230 if (ring_buffer_flags != RB_BUFFERS_ON) 2269 if (ring_buffer_flags != RB_BUFFERS_ON)
2231 return NULL; 2270 return NULL;
2232 2271
2233 if (atomic_read(&buffer->record_disabled))
2234 return NULL;
2235
2236 /* If we are tracing schedule, we don't want to recurse */ 2272 /* If we are tracing schedule, we don't want to recurse */
2237 resched = ftrace_preempt_disable(); 2273 resched = ftrace_preempt_disable();
2238 2274
2275 if (atomic_read(&buffer->record_disabled))
2276 goto out_nocheck;
2277
2239 if (trace_recursive_lock()) 2278 if (trace_recursive_lock())
2240 goto out_nocheck; 2279 goto out_nocheck;
2241 2280
@@ -2467,11 +2506,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
2467 if (ring_buffer_flags != RB_BUFFERS_ON) 2506 if (ring_buffer_flags != RB_BUFFERS_ON)
2468 return -EBUSY; 2507 return -EBUSY;
2469 2508
2470 if (atomic_read(&buffer->record_disabled))
2471 return -EBUSY;
2472
2473 resched = ftrace_preempt_disable(); 2509 resched = ftrace_preempt_disable();
2474 2510
2511 if (atomic_read(&buffer->record_disabled))
2512 goto out;
2513
2475 cpu = raw_smp_processor_id(); 2514 cpu = raw_smp_processor_id();
2476 2515
2477 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2516 if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -2539,7 +2578,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
2539 * @buffer: The ring buffer to enable writes 2578 * @buffer: The ring buffer to enable writes
2540 * 2579 *
2541 * Note, multiple disables will need the same number of enables 2580 * Note, multiple disables will need the same number of enables
2542 * to truely enable the writing (much like preempt_disable). 2581 * to truly enable the writing (much like preempt_disable).
2543 */ 2582 */
2544void ring_buffer_record_enable(struct ring_buffer *buffer) 2583void ring_buffer_record_enable(struct ring_buffer *buffer)
2545{ 2584{
@@ -2575,7 +2614,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
2575 * @cpu: The CPU to enable. 2614 * @cpu: The CPU to enable.
2576 * 2615 *
2577 * Note, multiple disables will need the same number of enables 2616 * Note, multiple disables will need the same number of enables
2578 * to truely enable the writing (much like preempt_disable). 2617 * to truly enable the writing (much like preempt_disable).
2579 */ 2618 */
2580void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) 2619void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2581{ 2620{
@@ -2716,6 +2755,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2716 iter->read_stamp = cpu_buffer->read_stamp; 2755 iter->read_stamp = cpu_buffer->read_stamp;
2717 else 2756 else
2718 iter->read_stamp = iter->head_page->page->time_stamp; 2757 iter->read_stamp = iter->head_page->page->time_stamp;
2758 iter->cache_reader_page = cpu_buffer->reader_page;
2759 iter->cache_read = cpu_buffer->read;
2719} 2760}
2720 2761
2721/** 2762/**
@@ -2822,6 +2863,7 @@ static struct buffer_page *
2822rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 2863rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2823{ 2864{
2824 struct buffer_page *reader = NULL; 2865 struct buffer_page *reader = NULL;
2866 unsigned long overwrite;
2825 unsigned long flags; 2867 unsigned long flags;
2826 int nr_loops = 0; 2868 int nr_loops = 0;
2827 int ret; 2869 int ret;
@@ -2863,6 +2905,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2863 local_set(&cpu_buffer->reader_page->write, 0); 2905 local_set(&cpu_buffer->reader_page->write, 0);
2864 local_set(&cpu_buffer->reader_page->entries, 0); 2906 local_set(&cpu_buffer->reader_page->entries, 0);
2865 local_set(&cpu_buffer->reader_page->page->commit, 0); 2907 local_set(&cpu_buffer->reader_page->page->commit, 0);
2908 cpu_buffer->reader_page->real_end = 0;
2866 2909
2867 spin: 2910 spin:
2868 /* 2911 /*
@@ -2883,6 +2926,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2883 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); 2926 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2884 2927
2885 /* 2928 /*
2929 * We want to make sure we read the overruns after we set up our
2930 * pointers to the next object. The writer side does a
2931 * cmpxchg to cross pages which acts as the mb on the writer
2932 * side. Note, the reader will constantly fail the swap
2933 * while the writer is updating the pointers, so this
2934 * guarantees that the overwrite recorded here is the one we
2935 * want to compare with the last_overrun.
2936 */
2937 smp_mb();
2938 overwrite = local_read(&(cpu_buffer->overrun));
2939
2940 /*
2886 * Here's the tricky part. 2941 * Here's the tricky part.
2887 * 2942 *
2888 * We need to move the pointer past the header page. 2943 * We need to move the pointer past the header page.
@@ -2913,6 +2968,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2913 cpu_buffer->reader_page = reader; 2968 cpu_buffer->reader_page = reader;
2914 rb_reset_reader_page(cpu_buffer); 2969 rb_reset_reader_page(cpu_buffer);
2915 2970
2971 if (overwrite != cpu_buffer->last_overrun) {
2972 cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
2973 cpu_buffer->last_overrun = overwrite;
2974 }
2975
2916 goto again; 2976 goto again;
2917 2977
2918 out: 2978 out:
@@ -2989,8 +3049,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2989 rb_advance_iter(iter); 3049 rb_advance_iter(iter);
2990} 3050}
2991 3051
3052static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
3053{
3054 return cpu_buffer->lost_events;
3055}
3056
2992static struct ring_buffer_event * 3057static struct ring_buffer_event *
2993rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) 3058rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3059 unsigned long *lost_events)
2994{ 3060{
2995 struct ring_buffer_event *event; 3061 struct ring_buffer_event *event;
2996 struct buffer_page *reader; 3062 struct buffer_page *reader;
@@ -3042,6 +3108,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3042 ring_buffer_normalize_time_stamp(cpu_buffer->buffer, 3108 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3043 cpu_buffer->cpu, ts); 3109 cpu_buffer->cpu, ts);
3044 } 3110 }
3111 if (lost_events)
3112 *lost_events = rb_lost_events(cpu_buffer);
3045 return event; 3113 return event;
3046 3114
3047 default: 3115 default:
@@ -3060,13 +3128,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3060 struct ring_buffer_event *event; 3128 struct ring_buffer_event *event;
3061 int nr_loops = 0; 3129 int nr_loops = 0;
3062 3130
3063 if (ring_buffer_iter_empty(iter))
3064 return NULL;
3065
3066 cpu_buffer = iter->cpu_buffer; 3131 cpu_buffer = iter->cpu_buffer;
3067 buffer = cpu_buffer->buffer; 3132 buffer = cpu_buffer->buffer;
3068 3133
3134 /*
3135 * Check if someone performed a consuming read to
3136 * the buffer. A consuming read invalidates the iterator
3137 * and we need to reset the iterator in this case.
3138 */
3139 if (unlikely(iter->cache_read != cpu_buffer->read ||
3140 iter->cache_reader_page != cpu_buffer->reader_page))
3141 rb_iter_reset(iter);
3142
3069 again: 3143 again:
3144 if (ring_buffer_iter_empty(iter))
3145 return NULL;
3146
3070 /* 3147 /*
3071 * We repeat when a timestamp is encountered. 3148 * We repeat when a timestamp is encountered.
3072 * We can get multiple timestamps by nested interrupts or also 3149 * We can get multiple timestamps by nested interrupts or also
@@ -3081,6 +3158,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3081 if (rb_per_cpu_empty(cpu_buffer)) 3158 if (rb_per_cpu_empty(cpu_buffer))
3082 return NULL; 3159 return NULL;
3083 3160
3161 if (iter->head >= local_read(&iter->head_page->page->commit)) {
3162 rb_inc_iter(iter);
3163 goto again;
3164 }
3165
3084 event = rb_iter_head_event(iter); 3166 event = rb_iter_head_event(iter);
3085 3167
3086 switch (event->type_len) { 3168 switch (event->type_len) {
@@ -3138,12 +3220,14 @@ static inline int rb_ok_to_lock(void)
3138 * @buffer: The ring buffer to read 3220 * @buffer: The ring buffer to read
3139 * @cpu: The cpu to peak at 3221 * @cpu: The cpu to peak at
3140 * @ts: The timestamp counter of this event. 3222 * @ts: The timestamp counter of this event.
3223 * @lost_events: a variable to store if events were lost (may be NULL)
3141 * 3224 *
3142 * This will return the event that will be read next, but does 3225 * This will return the event that will be read next, but does
3143 * not consume the data. 3226 * not consume the data.
3144 */ 3227 */
3145struct ring_buffer_event * 3228struct ring_buffer_event *
3146ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 3229ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3230 unsigned long *lost_events)
3147{ 3231{
3148 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 3232 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
3149 struct ring_buffer_event *event; 3233 struct ring_buffer_event *event;
@@ -3158,7 +3242,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3158 local_irq_save(flags); 3242 local_irq_save(flags);
3159 if (dolock) 3243 if (dolock)
3160 spin_lock(&cpu_buffer->reader_lock); 3244 spin_lock(&cpu_buffer->reader_lock);
3161 event = rb_buffer_peek(cpu_buffer, ts); 3245 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3162 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3246 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3163 rb_advance_reader(cpu_buffer); 3247 rb_advance_reader(cpu_buffer);
3164 if (dolock) 3248 if (dolock)
@@ -3200,13 +3284,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3200/** 3284/**
3201 * ring_buffer_consume - return an event and consume it 3285 * ring_buffer_consume - return an event and consume it
3202 * @buffer: The ring buffer to get the next event from 3286 * @buffer: The ring buffer to get the next event from
3287 * @cpu: the cpu to read the buffer from
3288 * @ts: a variable to store the timestamp (may be NULL)
3289 * @lost_events: a variable to store if events were lost (may be NULL)
3203 * 3290 *
3204 * Returns the next event in the ring buffer, and that event is consumed. 3291 * Returns the next event in the ring buffer, and that event is consumed.
3205 * Meaning, that sequential reads will keep returning a different event, 3292 * Meaning, that sequential reads will keep returning a different event,
3206 * and eventually empty the ring buffer if the producer is slower. 3293 * and eventually empty the ring buffer if the producer is slower.
3207 */ 3294 */
3208struct ring_buffer_event * 3295struct ring_buffer_event *
3209ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 3296ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3297 unsigned long *lost_events)
3210{ 3298{
3211 struct ring_buffer_per_cpu *cpu_buffer; 3299 struct ring_buffer_per_cpu *cpu_buffer;
3212 struct ring_buffer_event *event = NULL; 3300 struct ring_buffer_event *event = NULL;
@@ -3227,9 +3315,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3227 if (dolock) 3315 if (dolock)
3228 spin_lock(&cpu_buffer->reader_lock); 3316 spin_lock(&cpu_buffer->reader_lock);
3229 3317
3230 event = rb_buffer_peek(cpu_buffer, ts); 3318 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3231 if (event) 3319 if (event) {
3320 cpu_buffer->lost_events = 0;
3232 rb_advance_reader(cpu_buffer); 3321 rb_advance_reader(cpu_buffer);
3322 }
3233 3323
3234 if (dolock) 3324 if (dolock)
3235 spin_unlock(&cpu_buffer->reader_lock); 3325 spin_unlock(&cpu_buffer->reader_lock);
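
With the extra out-parameter threaded through rb_buffer_peek(), ring_buffer_peek() and ring_buffer_consume(), a consumer can now learn how many events were overwritten since its previous read instead of silently missing them. A hedged sketch of a caller under the new signatures; the function and message are illustrative:

static void demo_consume(struct ring_buffer *buffer, int cpu)
{
	struct ring_buffer_event *event;
	unsigned long lost;
	u64 ts;

	event = ring_buffer_consume(buffer, cpu, &ts, &lost);
	if (event && lost)
		pr_info("dropped %lu events before this one\n", lost);
}
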
@@ -3246,23 +3336,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3246EXPORT_SYMBOL_GPL(ring_buffer_consume); 3336EXPORT_SYMBOL_GPL(ring_buffer_consume);
3247 3337
3248/** 3338/**
3249 * ring_buffer_read_start - start a non consuming read of the buffer 3339 * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
3250 * @buffer: The ring buffer to read from 3340 * @buffer: The ring buffer to read from
3251 * @cpu: The cpu buffer to iterate over 3341 * @cpu: The cpu buffer to iterate over
3252 * 3342 *
3253 * This starts up an iteration through the buffer. It also disables 3343 * This performs the initial preparations necessary to iterate
3254 * the recording to the buffer until the reading is finished. 3344 * through the buffer. Memory is allocated, buffer recording
3255 * This prevents the reading from being corrupted. This is not 3345 * is disabled, and the iterator pointer is returned to the caller.
3256 * a consuming read, so a producer is not expected.
3257 * 3346 *
3258 * Must be paired with ring_buffer_finish. 3347 * Disabling buffer recording prevents the reading from being
3348 * corrupted. This is not a consuming read, so a producer is not
3349 * expected.
3350 *
3351 * After a sequence of ring_buffer_read_prepare calls, the user is
3352 * expected to make at least one call to ring_buffer_read_prepare_sync.
3353 * Afterwards, ring_buffer_read_start is invoked to get things going
3354 * for real.
3355 *
3356 * This overall must be paired with ring_buffer_finish.
3259 */ 3357 */
3260struct ring_buffer_iter * 3358struct ring_buffer_iter *
3261ring_buffer_read_start(struct ring_buffer *buffer, int cpu) 3359ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3262{ 3360{
3263 struct ring_buffer_per_cpu *cpu_buffer; 3361 struct ring_buffer_per_cpu *cpu_buffer;
3264 struct ring_buffer_iter *iter; 3362 struct ring_buffer_iter *iter;
3265 unsigned long flags;
3266 3363
3267 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3364 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3268 return NULL; 3365 return NULL;
@@ -3276,15 +3373,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3276 iter->cpu_buffer = cpu_buffer; 3373 iter->cpu_buffer = cpu_buffer;
3277 3374
3278 atomic_inc(&cpu_buffer->record_disabled); 3375 atomic_inc(&cpu_buffer->record_disabled);
3376
3377 return iter;
3378}
3379EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
3380
3381/**
3382 * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
3383 *
3384 * All previously invoked ring_buffer_read_prepare calls to prepare
3385 * iterators will be synchronized. Afterwards, ring_buffer_read_start
3386 * calls on those iterators are allowed.
3387 */
3388void
3389ring_buffer_read_prepare_sync(void)
3390{
3279 synchronize_sched(); 3391 synchronize_sched();
3392}
3393EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
3394
3395/**
3396 * ring_buffer_read_start - start a non consuming read of the buffer
3397 * @iter: The iterator returned by ring_buffer_read_prepare
3398 *
3399 * This finalizes the startup of an iteration through the buffer.
3400 * The iterator comes from a call to ring_buffer_read_prepare and
3401 * an intervening ring_buffer_read_prepare_sync must have been
3402 * performed.
3403 *
3404 * Must be paired with ring_buffer_finish.
3405 */
3406void
3407ring_buffer_read_start(struct ring_buffer_iter *iter)
3408{
3409 struct ring_buffer_per_cpu *cpu_buffer;
3410 unsigned long flags;
3411
3412 if (!iter)
3413 return;
3414
3415 cpu_buffer = iter->cpu_buffer;
3280 3416
3281 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3417 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3282 arch_spin_lock(&cpu_buffer->lock); 3418 arch_spin_lock(&cpu_buffer->lock);
3283 rb_iter_reset(iter); 3419 rb_iter_reset(iter);
3284 arch_spin_unlock(&cpu_buffer->lock); 3420 arch_spin_unlock(&cpu_buffer->lock);
3285 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3421 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3286
3287 return iter;
3288} 3422}
3289EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3423EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3290 3424
@@ -3378,6 +3512,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3378 cpu_buffer->write_stamp = 0; 3512 cpu_buffer->write_stamp = 0;
3379 cpu_buffer->read_stamp = 0; 3513 cpu_buffer->read_stamp = 0;
3380 3514
3515 cpu_buffer->lost_events = 0;
3516 cpu_buffer->last_overrun = 0;
3517
3381 rb_head_page_activate(cpu_buffer); 3518 rb_head_page_activate(cpu_buffer);
3382} 3519}
3383 3520
@@ -3653,6 +3790,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3653 struct ring_buffer_event *event; 3790 struct ring_buffer_event *event;
3654 struct buffer_data_page *bpage; 3791 struct buffer_data_page *bpage;
3655 struct buffer_page *reader; 3792 struct buffer_page *reader;
3793 unsigned long missed_events;
3656 unsigned long flags; 3794 unsigned long flags;
3657 unsigned int commit; 3795 unsigned int commit;
3658 unsigned int read; 3796 unsigned int read;
@@ -3689,6 +3827,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3689 read = reader->read; 3827 read = reader->read;
3690 commit = rb_page_commit(reader); 3828 commit = rb_page_commit(reader);
3691 3829
3830 /* Check if any events were dropped */
3831 missed_events = cpu_buffer->lost_events;
3832
3692 /* 3833 /*
3693 * If this page has been partially read or 3834 * If this page has been partially read or
3694 * if len is not big enough to read the rest of the page or 3835 * if len is not big enough to read the rest of the page or
@@ -3749,9 +3890,42 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3749 local_set(&reader->entries, 0); 3890 local_set(&reader->entries, 0);
3750 reader->read = 0; 3891 reader->read = 0;
3751 *data_page = bpage; 3892 *data_page = bpage;
3893
3894 /*
 3895 * Use the real_end for the data size.
3896 * This gives us a chance to store the lost events
3897 * on the page.
3898 */
3899 if (reader->real_end)
3900 local_set(&bpage->commit, reader->real_end);
3752 } 3901 }
3753 ret = read; 3902 ret = read;
3754 3903
3904 cpu_buffer->lost_events = 0;
3905
3906 commit = local_read(&bpage->commit);
3907 /*
3908 * Set a flag in the commit field if we lost events
3909 */
3910 if (missed_events) {
3911 /* If there is room at the end of the page to save the
3912 * missed events, then record it there.
3913 */
3914 if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
3915 memcpy(&bpage->data[commit], &missed_events,
3916 sizeof(missed_events));
3917 local_add(RB_MISSED_STORED, &bpage->commit);
3918 commit += sizeof(missed_events);
3919 }
3920 local_add(RB_MISSED_EVENTS, &bpage->commit);
3921 }
3922
3923 /*
3924 * This page may be off to user land. Zero it out here.
3925 */
3926 if (commit < BUF_PAGE_SIZE)
3927 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
3928
3755 out_unlock: 3929 out_unlock:
3756 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3930 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3757 3931
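For consumers of the page returned by ring_buffer_read_page(), the lost-event information now rides in the commit word and, when it fits, at the end of the page data. A hedged sketch of decoding it from inside ring_buffer.c, where struct buffer_data_page is visible (RB_MISSED_EVENTS and RB_MISSED_STORED are defined elsewhere in this file; the 0xfffff size mask is borrowed from the benchmark change below and assumed to cover the data-length bits):

	/* Hypothetical consumer of a page filled by ring_buffer_read_page(). */
	static void report_missed_events(struct buffer_data_page *bpage)
	{
		unsigned long commit = local_read(&bpage->commit);
		unsigned long size = commit & 0xfffff;	/* assumed length mask */
		unsigned long missed = 0;

		if (!(commit & RB_MISSED_EVENTS))
			return;

		/* The count itself is only present when it fit on the page. */
		if (commit & RB_MISSED_STORED)
			memcpy(&missed, &bpage->data[size], sizeof(missed));

		printk(KERN_INFO "lost %lu events (0 means count unknown)\n",
		       missed);
	}
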
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index b2477caf09c2..302f8a614635 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -8,6 +8,7 @@
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <asm/local.h>
11 12
12struct rb_page { 13struct rb_page {
13 u64 ts; 14 u64 ts;
@@ -80,7 +81,7 @@ static enum event_status read_event(int cpu)
80 int *entry; 81 int *entry;
81 u64 ts; 82 u64 ts;
82 83
83 event = ring_buffer_consume(buffer, cpu, &ts); 84 event = ring_buffer_consume(buffer, cpu, &ts, NULL);
84 if (!event) 85 if (!event)
85 return EVENT_DROPPED; 86 return EVENT_DROPPED;
86 87
@@ -112,7 +113,8 @@ static enum event_status read_page(int cpu)
112 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); 113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
113 if (ret >= 0) { 114 if (ret >= 0) {
114 rpage = bpage; 115 rpage = bpage;
115 commit = local_read(&rpage->commit); 116 /* The commit may have missed event flags set, clear them */
117 commit = local_read(&rpage->commit) & 0xfffff;
116 for (i = 0; i < commit && !kill_test; i += inc) { 118 for (i = 0; i < commit && !kill_test; i += inc) {
117 119
118 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { 120 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
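ring_buffer_consume() grows a fourth argument reporting how many events were dropped before the returned one; passing NULL, as the benchmark does above, keeps the old behaviour. A hypothetical consuming reader that does want the count:

	static void drain_cpu(struct ring_buffer *buffer, int cpu)
	{
		struct ring_buffer_event *event;
		unsigned long lost;
		u64 ts;

		while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost))) {
			if (lost)
				printk(KERN_WARNING "cpu %d: %lu events lost before ts %llu\n",
				       cpu, lost, (unsigned long long)ts);
			/* process ring_buffer_event_data(event) here */
		}
	}
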
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0df1b0f2cb9e..086d36316805 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -32,10 +32,11 @@
32#include <linux/splice.h> 32#include <linux/splice.h>
33#include <linux/kdebug.h> 33#include <linux/kdebug.h>
34#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/rwsem.h>
36#include <linux/slab.h>
35#include <linux/ctype.h> 37#include <linux/ctype.h>
36#include <linux/init.h> 38#include <linux/init.h>
37#include <linux/poll.h> 39#include <linux/poll.h>
38#include <linux/gfp.h>
39#include <linux/fs.h> 40#include <linux/fs.h>
40 41
41#include "trace.h" 42#include "trace.h"
@@ -91,20 +92,17 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled);
91static inline void ftrace_disable_cpu(void) 92static inline void ftrace_disable_cpu(void)
92{ 93{
93 preempt_disable(); 94 preempt_disable();
94 __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); 95 __this_cpu_inc(ftrace_cpu_disabled);
95} 96}
96 97
97static inline void ftrace_enable_cpu(void) 98static inline void ftrace_enable_cpu(void)
98{ 99{
99 __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); 100 __this_cpu_dec(ftrace_cpu_disabled);
100 preempt_enable(); 101 preempt_enable();
101} 102}
102 103
103static cpumask_var_t __read_mostly tracing_buffer_mask; 104static cpumask_var_t __read_mostly tracing_buffer_mask;
104 105
105/* Define which cpu buffers are currently read in trace_pipe */
106static cpumask_var_t tracing_reader_cpumask;
107
108#define for_each_tracing_cpu(cpu) \ 106#define for_each_tracing_cpu(cpu) \
109 for_each_cpu(cpu, tracing_buffer_mask) 107 for_each_cpu(cpu, tracing_buffer_mask)
110 108
@@ -119,9 +117,12 @@ static cpumask_var_t tracing_reader_cpumask;
119 * 117 *
120 * It is default off, but you can enable it with either specifying 118 * It is default off, but you can enable it with either specifying
121 * "ftrace_dump_on_oops" in the kernel command line, or setting 119 * "ftrace_dump_on_oops" in the kernel command line, or setting
122 * /proc/sys/kernel/ftrace_dump_on_oops to true. 120 * /proc/sys/kernel/ftrace_dump_on_oops
121 * Set 1 if you want to dump buffers of all CPUs
122 * Set 2 if you want to dump the buffer of the CPU that triggered oops
123 */ 123 */
124int ftrace_dump_on_oops; 124
125enum ftrace_dump_mode ftrace_dump_on_oops;
125 126
126static int tracing_set_tracer(const char *buf); 127static int tracing_set_tracer(const char *buf);
127 128
@@ -141,8 +142,17 @@ __setup("ftrace=", set_cmdline_ftrace);
141 142
142static int __init set_ftrace_dump_on_oops(char *str) 143static int __init set_ftrace_dump_on_oops(char *str)
143{ 144{
144 ftrace_dump_on_oops = 1; 145 if (*str++ != '=' || !*str) {
145 return 1; 146 ftrace_dump_on_oops = DUMP_ALL;
147 return 1;
148 }
149
150 if (!strcmp("orig_cpu", str)) {
151 ftrace_dump_on_oops = DUMP_ORIG;
152 return 1;
153 }
154
155 return 0;
146} 156}
147__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 157__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
148 158
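With the parser above, the boot parameter now takes an optional mode. Illustrative kernel command-line fragments (per the comment rewritten earlier in this file, the numeric values 1 and 2 can likewise be written to /proc/sys/kernel/ftrace_dump_on_oops):

	ftrace_dump_on_oops            /* dump the buffers of all CPUs  (DUMP_ALL)  */
	ftrace_dump_on_oops=orig_cpu   /* dump only the oopsing CPU     (DUMP_ORIG) */
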
@@ -243,12 +253,91 @@ static struct tracer *current_trace __read_mostly;
243 253
244/* 254/*
245 * trace_types_lock is used to protect the trace_types list. 255 * trace_types_lock is used to protect the trace_types list.
246 * This lock is also used to keep user access serialized.
247 * Accesses from userspace will grab this lock while userspace
248 * activities happen inside the kernel.
249 */ 256 */
250static DEFINE_MUTEX(trace_types_lock); 257static DEFINE_MUTEX(trace_types_lock);
251 258
259/*
 260 * Serialize access to the ring buffer.
 261 *
 262 * The ring buffer serializes readers, but that is only low-level protection.
 263 * The validity of events (returned by ring_buffer_peek() etc.)
 264 * is not protected by the ring buffer.
 265 *
 266 * The content of events may become garbage if we allow another process to
 267 * consume these events concurrently:
 268 * A) the page holding the consumed events may become a normal page
 269 * (not a reader page) in the ring buffer, and this page will be rewritten
 270 * by the event producer.
 271 * B) The page holding the consumed events may become a page used for
 272 * splice_read, and this page will be returned to the system.
 273 *
 274 * These primitives allow multiple processes to access different per-cpu
 275 * ring buffers concurrently.
 276 *
 277 * These primitives don't distinguish read-only and read-consume access.
 278 * Multiple read-only accesses are also serialized.
279 */
280
281#ifdef CONFIG_SMP
282static DECLARE_RWSEM(all_cpu_access_lock);
283static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
284
285static inline void trace_access_lock(int cpu)
286{
287 if (cpu == TRACE_PIPE_ALL_CPU) {
288 /* gain it for accessing the whole ring buffer. */
289 down_write(&all_cpu_access_lock);
290 } else {
291 /* gain it for accessing a cpu ring buffer. */
292
293 /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
294 down_read(&all_cpu_access_lock);
295
296 /* Secondly block other access to this @cpu ring buffer. */
297 mutex_lock(&per_cpu(cpu_access_lock, cpu));
298 }
299}
300
301static inline void trace_access_unlock(int cpu)
302{
303 if (cpu == TRACE_PIPE_ALL_CPU) {
304 up_write(&all_cpu_access_lock);
305 } else {
306 mutex_unlock(&per_cpu(cpu_access_lock, cpu));
307 up_read(&all_cpu_access_lock);
308 }
309}
310
311static inline void trace_access_lock_init(void)
312{
313 int cpu;
314
315 for_each_possible_cpu(cpu)
316 mutex_init(&per_cpu(cpu_access_lock, cpu));
317}
318
319#else
320
321static DEFINE_MUTEX(access_lock);
322
323static inline void trace_access_lock(int cpu)
324{
325 (void)cpu;
326 mutex_lock(&access_lock);
327}
328
329static inline void trace_access_unlock(int cpu)
330{
331 (void)cpu;
332 mutex_unlock(&access_lock);
333}
334
335static inline void trace_access_lock_init(void)
336{
337}
338
339#endif
340
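A short sketch of how the primitives above nest (hypothetical callers; the real users are the trace_pipe read and splice paths patched later in this file). Readers of different CPUs proceed in parallel, each holding the rwsem shared plus its own per-cpu mutex, while a TRACE_PIPE_ALL_CPU reader takes the rwsem exclusively and excludes them all:

	static void read_one_cpu_buffer(int cpu)
	{
		trace_access_lock(cpu);		/* down_read() + this cpu's mutex */
		/* ... peek/consume events of this cpu's buffer only ... */
		trace_access_unlock(cpu);
	}

	static void read_all_cpu_buffers(void)
	{
		trace_access_lock(TRACE_PIPE_ALL_CPU);	/* down_write(): exclusive */
		/* ... walk every per-cpu buffer ... */
		trace_access_unlock(TRACE_PIPE_ALL_CPU);
	}
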
252/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 341/* trace_wait is a waitqueue for tasks blocked on trace_poll */
253static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 342static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
254 343
@@ -297,6 +386,21 @@ static int __init set_buf_size(char *str)
297} 386}
298__setup("trace_buf_size=", set_buf_size); 387__setup("trace_buf_size=", set_buf_size);
299 388
389static int __init set_tracing_thresh(char *str)
390{
391 unsigned long threshhold;
392 int ret;
393
394 if (!str)
395 return 0;
396 ret = strict_strtoul(str, 0, &threshhold);
397 if (ret < 0)
398 return 0;
399 tracing_thresh = threshhold * 1000;
400 return 1;
401}
402__setup("tracing_thresh=", set_tracing_thresh);
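The threshold is given in microseconds on the command line but stored in nanoseconds, hence the multiplication by 1000. An illustrative boot-line fragment:

	tracing_thresh=100	/* only record latencies longer than 100 usecs (stored as 100000 ns) */
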
403
300unsigned long nsecs_to_usecs(unsigned long nsecs) 404unsigned long nsecs_to_usecs(unsigned long nsecs)
301{ 405{
302 return nsecs / 1000; 406 return nsecs / 1000;
@@ -502,9 +606,10 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
502static arch_spinlock_t ftrace_max_lock = 606static arch_spinlock_t ftrace_max_lock =
503 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 607 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
504 608
609unsigned long __read_mostly tracing_thresh;
610
505#ifdef CONFIG_TRACER_MAX_TRACE 611#ifdef CONFIG_TRACER_MAX_TRACE
506unsigned long __read_mostly tracing_max_latency; 612unsigned long __read_mostly tracing_max_latency;
507unsigned long __read_mostly tracing_thresh;
508 613
509/* 614/*
510 * Copy the new maximum trace into the separate maximum-trace 615 * Copy the new maximum trace into the separate maximum-trace
@@ -515,7 +620,7 @@ static void
515__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 620__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
516{ 621{
517 struct trace_array_cpu *data = tr->data[cpu]; 622 struct trace_array_cpu *data = tr->data[cpu];
518 struct trace_array_cpu *max_data = tr->data[cpu]; 623 struct trace_array_cpu *max_data;
519 624
520 max_tr.cpu = cpu; 625 max_tr.cpu = cpu;
521 max_tr.time_start = data->preempt_timestamp; 626 max_tr.time_start = data->preempt_timestamp;
@@ -525,7 +630,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
525 max_data->critical_start = data->critical_start; 630 max_data->critical_start = data->critical_start;
526 max_data->critical_end = data->critical_end; 631 max_data->critical_end = data->critical_end;
527 632
528 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 633 memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
529 max_data->pid = tsk->pid; 634 max_data->pid = tsk->pid;
530 max_data->uid = task_uid(tsk); 635 max_data->uid = task_uid(tsk);
531 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; 636 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
@@ -747,10 +852,10 @@ out:
747 mutex_unlock(&trace_types_lock); 852 mutex_unlock(&trace_types_lock);
748} 853}
749 854
750static void __tracing_reset(struct trace_array *tr, int cpu) 855static void __tracing_reset(struct ring_buffer *buffer, int cpu)
751{ 856{
752 ftrace_disable_cpu(); 857 ftrace_disable_cpu();
753 ring_buffer_reset_cpu(tr->buffer, cpu); 858 ring_buffer_reset_cpu(buffer, cpu);
754 ftrace_enable_cpu(); 859 ftrace_enable_cpu();
755} 860}
756 861
@@ -762,7 +867,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
762 867
763 /* Make sure all commits have finished */ 868 /* Make sure all commits have finished */
764 synchronize_sched(); 869 synchronize_sched();
765 __tracing_reset(tr, cpu); 870 __tracing_reset(buffer, cpu);
766 871
767 ring_buffer_record_enable(buffer); 872 ring_buffer_record_enable(buffer);
768} 873}
@@ -780,7 +885,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
780 tr->time_start = ftrace_now(tr->cpu); 885 tr->time_start = ftrace_now(tr->cpu);
781 886
782 for_each_online_cpu(cpu) 887 for_each_online_cpu(cpu)
783 __tracing_reset(tr, cpu); 888 __tracing_reset(buffer, cpu);
784 889
785 ring_buffer_record_enable(buffer); 890 ring_buffer_record_enable(buffer);
786} 891}
@@ -857,6 +962,8 @@ void tracing_start(void)
857 goto out; 962 goto out;
858 } 963 }
859 964
965 /* Prevent the buffers from switching */
966 arch_spin_lock(&ftrace_max_lock);
860 967
861 buffer = global_trace.buffer; 968 buffer = global_trace.buffer;
862 if (buffer) 969 if (buffer)
@@ -866,6 +973,8 @@ void tracing_start(void)
866 if (buffer) 973 if (buffer)
867 ring_buffer_record_enable(buffer); 974 ring_buffer_record_enable(buffer);
868 975
976 arch_spin_unlock(&ftrace_max_lock);
977
869 ftrace_start(); 978 ftrace_start();
870 out: 979 out:
871 spin_unlock_irqrestore(&tracing_start_lock, flags); 980 spin_unlock_irqrestore(&tracing_start_lock, flags);
@@ -887,6 +996,9 @@ void tracing_stop(void)
887 if (trace_stop_count++) 996 if (trace_stop_count++)
888 goto out; 997 goto out;
889 998
999 /* Prevent the buffers from switching */
1000 arch_spin_lock(&ftrace_max_lock);
1001
890 buffer = global_trace.buffer; 1002 buffer = global_trace.buffer;
891 if (buffer) 1003 if (buffer)
892 ring_buffer_record_disable(buffer); 1004 ring_buffer_record_disable(buffer);
@@ -895,6 +1007,8 @@ void tracing_stop(void)
895 if (buffer) 1007 if (buffer)
896 ring_buffer_record_disable(buffer); 1008 ring_buffer_record_disable(buffer);
897 1009
1010 arch_spin_unlock(&ftrace_max_lock);
1011
898 out: 1012 out:
899 spin_unlock_irqrestore(&tracing_start_lock, flags); 1013 spin_unlock_irqrestore(&tracing_start_lock, flags);
900} 1014}
@@ -951,6 +1065,11 @@ void trace_find_cmdline(int pid, char comm[])
951 return; 1065 return;
952 } 1066 }
953 1067
1068 if (WARN_ON_ONCE(pid < 0)) {
1069 strcpy(comm, "<XXX>");
1070 return;
1071 }
1072
954 if (pid > PID_MAX_DEFAULT) { 1073 if (pid > PID_MAX_DEFAULT) {
955 strcpy(comm, "<...>"); 1074 strcpy(comm, "<...>");
956 return; 1075 return;
@@ -1084,7 +1203,7 @@ trace_function(struct trace_array *tr,
1084 struct ftrace_entry *entry; 1203 struct ftrace_entry *entry;
1085 1204
1086 /* If we are reading the ring buffer, don't trace */ 1205 /* If we are reading the ring buffer, don't trace */
1087 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 1206 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
1088 return; 1207 return;
1089 1208
1090 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1209 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1177,6 +1296,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1177 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1296 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1178 return; 1297 return;
1179 1298
1299 /*
 1300 * NMIs cannot handle page faults, even with fixups.
 1301 * Saving the user stack can (and often does) fault.
1302 */
1303 if (unlikely(in_nmi()))
1304 return;
1305
1180 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1306 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1181 sizeof(*entry), flags, pc); 1307 sizeof(*entry), flags, pc);
1182 if (!event) 1308 if (!event)
@@ -1315,8 +1441,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1315 entry->fmt = fmt; 1441 entry->fmt = fmt;
1316 1442
1317 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1443 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1318 if (!filter_check_discard(call, entry, buffer, event)) 1444 if (!filter_check_discard(call, entry, buffer, event)) {
1319 ring_buffer_unlock_commit(buffer, event); 1445 ring_buffer_unlock_commit(buffer, event);
1446 ftrace_trace_stack(buffer, flags, 6, pc);
1447 }
1320 1448
1321out_unlock: 1449out_unlock:
1322 arch_spin_unlock(&trace_buf_lock); 1450 arch_spin_unlock(&trace_buf_lock);
@@ -1389,8 +1517,10 @@ int trace_array_vprintk(struct trace_array *tr,
1389 1517
1390 memcpy(&entry->buf, trace_buf, len); 1518 memcpy(&entry->buf, trace_buf, len);
1391 entry->buf[len] = '\0'; 1519 entry->buf[len] = '\0';
1392 if (!filter_check_discard(call, entry, buffer, event)) 1520 if (!filter_check_discard(call, entry, buffer, event)) {
1393 ring_buffer_unlock_commit(buffer, event); 1521 ring_buffer_unlock_commit(buffer, event);
1522 ftrace_trace_stack(buffer, irq_flags, 6, pc);
1523 }
1394 1524
1395 out_unlock: 1525 out_unlock:
1396 arch_spin_unlock(&trace_buf_lock); 1526 arch_spin_unlock(&trace_buf_lock);
@@ -1427,7 +1557,8 @@ static void trace_iterator_increment(struct trace_iterator *iter)
1427} 1557}
1428 1558
1429static struct trace_entry * 1559static struct trace_entry *
1430peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) 1560peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1561 unsigned long *lost_events)
1431{ 1562{
1432 struct ring_buffer_event *event; 1563 struct ring_buffer_event *event;
1433 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1564 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
@@ -1438,7 +1569,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1438 if (buf_iter) 1569 if (buf_iter)
1439 event = ring_buffer_iter_peek(buf_iter, ts); 1570 event = ring_buffer_iter_peek(buf_iter, ts);
1440 else 1571 else
1441 event = ring_buffer_peek(iter->tr->buffer, cpu, ts); 1572 event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
1573 lost_events);
1442 1574
1443 ftrace_enable_cpu(); 1575 ftrace_enable_cpu();
1444 1576
@@ -1446,10 +1578,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1446} 1578}
1447 1579
1448static struct trace_entry * 1580static struct trace_entry *
1449__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) 1581__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1582 unsigned long *missing_events, u64 *ent_ts)
1450{ 1583{
1451 struct ring_buffer *buffer = iter->tr->buffer; 1584 struct ring_buffer *buffer = iter->tr->buffer;
1452 struct trace_entry *ent, *next = NULL; 1585 struct trace_entry *ent, *next = NULL;
1586 unsigned long lost_events = 0, next_lost = 0;
1453 int cpu_file = iter->cpu_file; 1587 int cpu_file = iter->cpu_file;
1454 u64 next_ts = 0, ts; 1588 u64 next_ts = 0, ts;
1455 int next_cpu = -1; 1589 int next_cpu = -1;
@@ -1462,7 +1596,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1462 if (cpu_file > TRACE_PIPE_ALL_CPU) { 1596 if (cpu_file > TRACE_PIPE_ALL_CPU) {
1463 if (ring_buffer_empty_cpu(buffer, cpu_file)) 1597 if (ring_buffer_empty_cpu(buffer, cpu_file))
1464 return NULL; 1598 return NULL;
1465 ent = peek_next_entry(iter, cpu_file, ent_ts); 1599 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
1466 if (ent_cpu) 1600 if (ent_cpu)
1467 *ent_cpu = cpu_file; 1601 *ent_cpu = cpu_file;
1468 1602
@@ -1474,7 +1608,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1474 if (ring_buffer_empty_cpu(buffer, cpu)) 1608 if (ring_buffer_empty_cpu(buffer, cpu))
1475 continue; 1609 continue;
1476 1610
1477 ent = peek_next_entry(iter, cpu, &ts); 1611 ent = peek_next_entry(iter, cpu, &ts, &lost_events);
1478 1612
1479 /* 1613 /*
1480 * Pick the entry with the smallest timestamp: 1614 * Pick the entry with the smallest timestamp:
@@ -1483,6 +1617,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1483 next = ent; 1617 next = ent;
1484 next_cpu = cpu; 1618 next_cpu = cpu;
1485 next_ts = ts; 1619 next_ts = ts;
1620 next_lost = lost_events;
1486 } 1621 }
1487 } 1622 }
1488 1623
@@ -1492,6 +1627,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1492 if (ent_ts) 1627 if (ent_ts)
1493 *ent_ts = next_ts; 1628 *ent_ts = next_ts;
1494 1629
1630 if (missing_events)
1631 *missing_events = next_lost;
1632
1495 return next; 1633 return next;
1496} 1634}
1497 1635
@@ -1499,13 +1637,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1499struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 1637struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1500 int *ent_cpu, u64 *ent_ts) 1638 int *ent_cpu, u64 *ent_ts)
1501{ 1639{
1502 return __find_next_entry(iter, ent_cpu, ent_ts); 1640 return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
1503} 1641}
1504 1642
1505/* Find the next real entry, and increment the iterator to the next entry */ 1643/* Find the next real entry, and increment the iterator to the next entry */
1506static void *find_next_entry_inc(struct trace_iterator *iter) 1644static void *find_next_entry_inc(struct trace_iterator *iter)
1507{ 1645{
1508 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); 1646 iter->ent = __find_next_entry(iter, &iter->cpu,
1647 &iter->lost_events, &iter->ts);
1509 1648
1510 if (iter->ent) 1649 if (iter->ent)
1511 trace_iterator_increment(iter); 1650 trace_iterator_increment(iter);
@@ -1517,7 +1656,8 @@ static void trace_consume(struct trace_iterator *iter)
1517{ 1656{
1518 /* Don't allow ftrace to trace into the ring buffers */ 1657 /* Don't allow ftrace to trace into the ring buffers */
1519 ftrace_disable_cpu(); 1658 ftrace_disable_cpu();
1520 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); 1659 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
1660 &iter->lost_events);
1521 ftrace_enable_cpu(); 1661 ftrace_enable_cpu();
1522} 1662}
1523 1663
@@ -1580,12 +1720,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1580} 1720}
1581 1721
1582/* 1722/*
1583 * No necessary locking here. The worst thing which can
1584 * happen is loosing events consumed at the same time
1585 * by a trace_pipe reader.
1586 * Other than that, we don't risk to crash the ring buffer
1587 * because it serializes the readers.
1588 *
1589 * The current tracer is copied to avoid a global locking 1723 * The current tracer is copied to avoid a global locking
1590 * all around. 1724 * all around.
1591 */ 1725 */
@@ -1623,6 +1757,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1623 1757
1624 ftrace_enable_cpu(); 1758 ftrace_enable_cpu();
1625 1759
1760 iter->leftover = 0;
1626 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1761 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1627 ; 1762 ;
1628 1763
@@ -1640,12 +1775,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1640 } 1775 }
1641 1776
1642 trace_event_read_lock(); 1777 trace_event_read_lock();
1778 trace_access_lock(cpu_file);
1643 return p; 1779 return p;
1644} 1780}
1645 1781
1646static void s_stop(struct seq_file *m, void *p) 1782static void s_stop(struct seq_file *m, void *p)
1647{ 1783{
1784 struct trace_iterator *iter = m->private;
1785
1648 atomic_dec(&trace_record_cmdline_disabled); 1786 atomic_dec(&trace_record_cmdline_disabled);
1787 trace_access_unlock(iter->cpu_file);
1649 trace_event_read_unlock(); 1788 trace_event_read_unlock();
1650} 1789}
1651 1790
@@ -1669,7 +1808,7 @@ static void print_func_help_header(struct seq_file *m)
1669} 1808}
1670 1809
1671 1810
1672static void 1811void
1673print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1812print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1674{ 1813{
1675 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1814 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1797,7 +1936,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1797 } 1936 }
1798 1937
1799 if (event) 1938 if (event)
1800 return event->trace(iter, sym_flags); 1939 return event->funcs->trace(iter, sym_flags, event);
1801 1940
1802 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) 1941 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
1803 goto partial; 1942 goto partial;
@@ -1823,7 +1962,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1823 1962
1824 event = ftrace_find_event(entry->type); 1963 event = ftrace_find_event(entry->type);
1825 if (event) 1964 if (event)
1826 return event->raw(iter, 0); 1965 return event->funcs->raw(iter, 0, event);
1827 1966
1828 if (!trace_seq_printf(s, "%d ?\n", entry->type)) 1967 if (!trace_seq_printf(s, "%d ?\n", entry->type))
1829 goto partial; 1968 goto partial;
@@ -1850,7 +1989,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1850 1989
1851 event = ftrace_find_event(entry->type); 1990 event = ftrace_find_event(entry->type);
1852 if (event) { 1991 if (event) {
1853 enum print_line_t ret = event->hex(iter, 0); 1992 enum print_line_t ret = event->funcs->hex(iter, 0, event);
1854 if (ret != TRACE_TYPE_HANDLED) 1993 if (ret != TRACE_TYPE_HANDLED)
1855 return ret; 1994 return ret;
1856 } 1995 }
@@ -1875,10 +2014,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1875 } 2014 }
1876 2015
1877 event = ftrace_find_event(entry->type); 2016 event = ftrace_find_event(entry->type);
1878 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED; 2017 return event ? event->funcs->binary(iter, 0, event) :
2018 TRACE_TYPE_HANDLED;
1879} 2019}
1880 2020
1881static int trace_empty(struct trace_iterator *iter) 2021int trace_empty(struct trace_iterator *iter)
1882{ 2022{
1883 int cpu; 2023 int cpu;
1884 2024
@@ -1913,6 +2053,10 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
1913{ 2053{
1914 enum print_line_t ret; 2054 enum print_line_t ret;
1915 2055
2056 if (iter->lost_events)
2057 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2058 iter->cpu, iter->lost_events);
2059
1916 if (iter->trace && iter->trace->print_line) { 2060 if (iter->trace && iter->trace->print_line) {
1917 ret = iter->trace->print_line(iter); 2061 ret = iter->trace->print_line(iter);
1918 if (ret != TRACE_TYPE_UNHANDLED) 2062 if (ret != TRACE_TYPE_UNHANDLED)
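When iter->lost_events is non-zero, the read path now emits a marker built from the format string above before printing the next event. An illustrative line (the count is made up) as it would appear in the trace output:

	CPU:1 [LOST 823 EVENTS]
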
@@ -1941,6 +2085,23 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
1941 return print_trace_fmt(iter); 2085 return print_trace_fmt(iter);
1942} 2086}
1943 2087
2088void trace_default_header(struct seq_file *m)
2089{
2090 struct trace_iterator *iter = m->private;
2091
2092 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2093 /* print nothing if the buffers are empty */
2094 if (trace_empty(iter))
2095 return;
2096 print_trace_header(m, iter);
2097 if (!(trace_flags & TRACE_ITER_VERBOSE))
2098 print_lat_help_header(m);
2099 } else {
2100 if (!(trace_flags & TRACE_ITER_VERBOSE))
2101 print_func_help_header(m);
2102 }
2103}
2104
1944static int s_show(struct seq_file *m, void *v) 2105static int s_show(struct seq_file *m, void *v)
1945{ 2106{
1946 struct trace_iterator *iter = v; 2107 struct trace_iterator *iter = v;
@@ -1953,17 +2114,9 @@ static int s_show(struct seq_file *m, void *v)
1953 } 2114 }
1954 if (iter->trace && iter->trace->print_header) 2115 if (iter->trace && iter->trace->print_header)
1955 iter->trace->print_header(m); 2116 iter->trace->print_header(m);
1956 else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2117 else
1957 /* print nothing if the buffers are empty */ 2118 trace_default_header(m);
1958 if (trace_empty(iter)) 2119
1959 return 0;
1960 print_trace_header(m, iter);
1961 if (!(trace_flags & TRACE_ITER_VERBOSE))
1962 print_lat_help_header(m);
1963 } else {
1964 if (!(trace_flags & TRACE_ITER_VERBOSE))
1965 print_func_help_header(m);
1966 }
1967 } else if (iter->leftover) { 2120 } else if (iter->leftover) {
1968 /* 2121 /*
1969 * If we filled the seq_file buffer earlier, we 2122 * If we filled the seq_file buffer earlier, we
@@ -2049,15 +2202,20 @@ __tracing_open(struct inode *inode, struct file *file)
2049 2202
2050 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2203 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
2051 for_each_tracing_cpu(cpu) { 2204 for_each_tracing_cpu(cpu) {
2052
2053 iter->buffer_iter[cpu] = 2205 iter->buffer_iter[cpu] =
2054 ring_buffer_read_start(iter->tr->buffer, cpu); 2206 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2207 }
2208 ring_buffer_read_prepare_sync();
2209 for_each_tracing_cpu(cpu) {
2210 ring_buffer_read_start(iter->buffer_iter[cpu]);
2055 tracing_iter_reset(iter, cpu); 2211 tracing_iter_reset(iter, cpu);
2056 } 2212 }
2057 } else { 2213 } else {
2058 cpu = iter->cpu_file; 2214 cpu = iter->cpu_file;
2059 iter->buffer_iter[cpu] = 2215 iter->buffer_iter[cpu] =
2060 ring_buffer_read_start(iter->tr->buffer, cpu); 2216 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2217 ring_buffer_read_prepare_sync();
2218 ring_buffer_read_start(iter->buffer_iter[cpu]);
2061 tracing_iter_reset(iter, cpu); 2219 tracing_iter_reset(iter, cpu);
2062 } 2220 }
2063 2221
@@ -2836,22 +2994,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2836 2994
2837 mutex_lock(&trace_types_lock); 2995 mutex_lock(&trace_types_lock);
2838 2996
2839 /* We only allow one reader per cpu */
2840 if (cpu_file == TRACE_PIPE_ALL_CPU) {
2841 if (!cpumask_empty(tracing_reader_cpumask)) {
2842 ret = -EBUSY;
2843 goto out;
2844 }
2845 cpumask_setall(tracing_reader_cpumask);
2846 } else {
2847 if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
2848 cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
2849 else {
2850 ret = -EBUSY;
2851 goto out;
2852 }
2853 }
2854
2855 /* create a buffer to store the information to pass to userspace */ 2997 /* create a buffer to store the information to pass to userspace */
2856 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2998 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2857 if (!iter) { 2999 if (!iter) {
@@ -2907,12 +3049,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2907 3049
2908 mutex_lock(&trace_types_lock); 3050 mutex_lock(&trace_types_lock);
2909 3051
2910 if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
2911 cpumask_clear(tracing_reader_cpumask);
2912 else
2913 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2914
2915
2916 if (iter->trace->pipe_close) 3052 if (iter->trace->pipe_close)
2917 iter->trace->pipe_close(iter); 3053 iter->trace->pipe_close(iter);
2918 3054
@@ -3074,6 +3210,7 @@ waitagain:
3074 iter->pos = -1; 3210 iter->pos = -1;
3075 3211
3076 trace_event_read_lock(); 3212 trace_event_read_lock();
3213 trace_access_lock(iter->cpu_file);
3077 while (find_next_entry_inc(iter) != NULL) { 3214 while (find_next_entry_inc(iter) != NULL) {
3078 enum print_line_t ret; 3215 enum print_line_t ret;
3079 int len = iter->seq.len; 3216 int len = iter->seq.len;
@@ -3090,6 +3227,7 @@ waitagain:
3090 if (iter->seq.len >= cnt) 3227 if (iter->seq.len >= cnt)
3091 break; 3228 break;
3092 } 3229 }
3230 trace_access_unlock(iter->cpu_file);
3093 trace_event_read_unlock(); 3231 trace_event_read_unlock();
3094 3232
3095 /* Now copy what we have to the user */ 3233 /* Now copy what we have to the user */
@@ -3172,12 +3310,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3172 size_t len, 3310 size_t len,
3173 unsigned int flags) 3311 unsigned int flags)
3174{ 3312{
3175 struct page *pages[PIPE_BUFFERS]; 3313 struct page *pages_def[PIPE_DEF_BUFFERS];
3176 struct partial_page partial[PIPE_BUFFERS]; 3314 struct partial_page partial_def[PIPE_DEF_BUFFERS];
3177 struct trace_iterator *iter = filp->private_data; 3315 struct trace_iterator *iter = filp->private_data;
3178 struct splice_pipe_desc spd = { 3316 struct splice_pipe_desc spd = {
3179 .pages = pages, 3317 .pages = pages_def,
3180 .partial = partial, 3318 .partial = partial_def,
3181 .nr_pages = 0, /* This gets updated below. */ 3319 .nr_pages = 0, /* This gets updated below. */
3182 .flags = flags, 3320 .flags = flags,
3183 .ops = &tracing_pipe_buf_ops, 3321 .ops = &tracing_pipe_buf_ops,
@@ -3188,6 +3326,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3188 size_t rem; 3326 size_t rem;
3189 unsigned int i; 3327 unsigned int i;
3190 3328
3329 if (splice_grow_spd(pipe, &spd))
3330 return -ENOMEM;
3331
3191 /* copy the tracer to avoid using a global lock all around */ 3332 /* copy the tracer to avoid using a global lock all around */
3192 mutex_lock(&trace_types_lock); 3333 mutex_lock(&trace_types_lock);
3193 if (unlikely(old_tracer != current_trace && current_trace)) { 3334 if (unlikely(old_tracer != current_trace && current_trace)) {
@@ -3215,40 +3356,44 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3215 } 3356 }
3216 3357
3217 trace_event_read_lock(); 3358 trace_event_read_lock();
3359 trace_access_lock(iter->cpu_file);
3218 3360
3219 /* Fill as many pages as possible. */ 3361 /* Fill as many pages as possible. */
3220 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3362 for (i = 0, rem = len; i < pipe->buffers && rem; i++) {
3221 pages[i] = alloc_page(GFP_KERNEL); 3363 spd.pages[i] = alloc_page(GFP_KERNEL);
3222 if (!pages[i]) 3364 if (!spd.pages[i])
3223 break; 3365 break;
3224 3366
3225 rem = tracing_fill_pipe_page(rem, iter); 3367 rem = tracing_fill_pipe_page(rem, iter);
3226 3368
3227 /* Copy the data into the page, so we can start over. */ 3369 /* Copy the data into the page, so we can start over. */
3228 ret = trace_seq_to_buffer(&iter->seq, 3370 ret = trace_seq_to_buffer(&iter->seq,
3229 page_address(pages[i]), 3371 page_address(spd.pages[i]),
3230 iter->seq.len); 3372 iter->seq.len);
3231 if (ret < 0) { 3373 if (ret < 0) {
3232 __free_page(pages[i]); 3374 __free_page(spd.pages[i]);
3233 break; 3375 break;
3234 } 3376 }
3235 partial[i].offset = 0; 3377 spd.partial[i].offset = 0;
3236 partial[i].len = iter->seq.len; 3378 spd.partial[i].len = iter->seq.len;
3237 3379
3238 trace_seq_init(&iter->seq); 3380 trace_seq_init(&iter->seq);
3239 } 3381 }
3240 3382
3383 trace_access_unlock(iter->cpu_file);
3241 trace_event_read_unlock(); 3384 trace_event_read_unlock();
3242 mutex_unlock(&iter->mutex); 3385 mutex_unlock(&iter->mutex);
3243 3386
3244 spd.nr_pages = i; 3387 spd.nr_pages = i;
3245 3388
3246 return splice_to_pipe(pipe, &spd); 3389 ret = splice_to_pipe(pipe, &spd);
3390out:
3391 splice_shrink_spd(pipe, &spd);
3392 return ret;
3247 3393
3248out_err: 3394out_err:
3249 mutex_unlock(&iter->mutex); 3395 mutex_unlock(&iter->mutex);
3250 3396 goto out;
3251 return ret;
3252} 3397}
3253 3398
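Both splice paths in this patch move from fixed PIPE_BUFFERS-sized on-stack arrays to PIPE_DEF_BUFFERS defaults that splice_grow_spd() enlarges to the pipe's actual buffer count. A condensed, hypothetical skeleton of the pattern (it borrows tracing_pipe_buf_ops from this file; the data copying is elided):

	static ssize_t example_splice_read(struct pipe_inode_info *pipe,
					   size_t len, unsigned int flags)
	{
		struct page *pages_def[PIPE_DEF_BUFFERS];
		struct partial_page partial_def[PIPE_DEF_BUFFERS];
		struct splice_pipe_desc spd = {
			.pages		= pages_def,
			.partial	= partial_def,
			.flags		= flags,
			.ops		= &tracing_pipe_buf_ops,
		};
		unsigned int nr = min_t(size_t, len >> PAGE_SHIFT, pipe->buffers);
		unsigned int i;
		ssize_t ret;

		if (splice_grow_spd(pipe, &spd))	/* may swap in larger arrays */
			return -ENOMEM;

		for (i = 0; i < nr; i++) {
			spd.pages[i] = alloc_page(GFP_KERNEL);
			if (!spd.pages[i])
				break;
			spd.partial[i].offset = 0;
			spd.partial[i].len = PAGE_SIZE;
			/* copy one page of data into spd.pages[i] here */
		}
		spd.nr_pages = i;

		ret = splice_to_pipe(pipe, &spd);
		splice_shrink_spd(pipe, &spd);		/* free any grown arrays */
		return ret;
	}
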
3254static ssize_t 3399static ssize_t
@@ -3521,7 +3666,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3521 size_t count, loff_t *ppos) 3666 size_t count, loff_t *ppos)
3522{ 3667{
3523 struct ftrace_buffer_info *info = filp->private_data; 3668 struct ftrace_buffer_info *info = filp->private_data;
3524 unsigned int pos;
3525 ssize_t ret; 3669 ssize_t ret;
3526 size_t size; 3670 size_t size;
3527 3671
@@ -3539,18 +3683,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3539 3683
3540 info->read = 0; 3684 info->read = 0;
3541 3685
3686 trace_access_lock(info->cpu);
3542 ret = ring_buffer_read_page(info->tr->buffer, 3687 ret = ring_buffer_read_page(info->tr->buffer,
3543 &info->spare, 3688 &info->spare,
3544 count, 3689 count,
3545 info->cpu, 0); 3690 info->cpu, 0);
3691 trace_access_unlock(info->cpu);
3546 if (ret < 0) 3692 if (ret < 0)
3547 return 0; 3693 return 0;
3548 3694
3549 pos = ring_buffer_page_len(info->spare);
3550
3551 if (pos < PAGE_SIZE)
3552 memset(info->spare + pos, 0, PAGE_SIZE - pos);
3553
3554read: 3695read:
3555 size = PAGE_SIZE - info->read; 3696 size = PAGE_SIZE - info->read;
3556 if (size > count) 3697 if (size > count)
@@ -3645,11 +3786,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3645 unsigned int flags) 3786 unsigned int flags)
3646{ 3787{
3647 struct ftrace_buffer_info *info = file->private_data; 3788 struct ftrace_buffer_info *info = file->private_data;
3648 struct partial_page partial[PIPE_BUFFERS]; 3789 struct partial_page partial_def[PIPE_DEF_BUFFERS];
3649 struct page *pages[PIPE_BUFFERS]; 3790 struct page *pages_def[PIPE_DEF_BUFFERS];
3650 struct splice_pipe_desc spd = { 3791 struct splice_pipe_desc spd = {
3651 .pages = pages, 3792 .pages = pages_def,
3652 .partial = partial, 3793 .partial = partial_def,
3653 .flags = flags, 3794 .flags = flags,
3654 .ops = &buffer_pipe_buf_ops, 3795 .ops = &buffer_pipe_buf_ops,
3655 .spd_release = buffer_spd_release, 3796 .spd_release = buffer_spd_release,
@@ -3658,21 +3799,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3658 int entries, size, i; 3799 int entries, size, i;
3659 size_t ret; 3800 size_t ret;
3660 3801
3802 if (splice_grow_spd(pipe, &spd))
3803 return -ENOMEM;
3804
3661 if (*ppos & (PAGE_SIZE - 1)) { 3805 if (*ppos & (PAGE_SIZE - 1)) {
3662 WARN_ONCE(1, "Ftrace: previous read must page-align\n"); 3806 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
3663 return -EINVAL; 3807 ret = -EINVAL;
3808 goto out;
3664 } 3809 }
3665 3810
3666 if (len & (PAGE_SIZE - 1)) { 3811 if (len & (PAGE_SIZE - 1)) {
3667 WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); 3812 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
3668 if (len < PAGE_SIZE) 3813 if (len < PAGE_SIZE) {
3669 return -EINVAL; 3814 ret = -EINVAL;
3815 goto out;
3816 }
3670 len &= PAGE_MASK; 3817 len &= PAGE_MASK;
3671 } 3818 }
3672 3819
3820 trace_access_lock(info->cpu);
3673 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3821 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3674 3822
3675 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { 3823 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
3676 struct page *page; 3824 struct page *page;
3677 int r; 3825 int r;
3678 3826
@@ -3717,6 +3865,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3717 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3865 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3718 } 3866 }
3719 3867
3868 trace_access_unlock(info->cpu);
3720 spd.nr_pages = i; 3869 spd.nr_pages = i;
3721 3870
3722 /* did we read anything? */ 3871 /* did we read anything? */
@@ -3726,11 +3875,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3726 else 3875 else
3727 ret = 0; 3876 ret = 0;
3728 /* TODO: block */ 3877 /* TODO: block */
3729 return ret; 3878 goto out;
3730 } 3879 }
3731 3880
3732 ret = splice_to_pipe(pipe, &spd); 3881 ret = splice_to_pipe(pipe, &spd);
3733 3882 splice_shrink_spd(pipe, &spd);
3883out:
3734 return ret; 3884 return ret;
3735} 3885}
3736 3886
@@ -4153,6 +4303,8 @@ static __init int tracer_init_debugfs(void)
4153 struct dentry *d_tracer; 4303 struct dentry *d_tracer;
4154 int cpu; 4304 int cpu;
4155 4305
4306 trace_access_lock_init();
4307
4156 d_tracer = tracing_init_dentry(); 4308 d_tracer = tracing_init_dentry();
4157 4309
4158 trace_create_file("tracing_enabled", 0644, d_tracer, 4310 trace_create_file("tracing_enabled", 0644, d_tracer,
@@ -4176,10 +4328,10 @@ static __init int tracer_init_debugfs(void)
4176#ifdef CONFIG_TRACER_MAX_TRACE 4328#ifdef CONFIG_TRACER_MAX_TRACE
4177 trace_create_file("tracing_max_latency", 0644, d_tracer, 4329 trace_create_file("tracing_max_latency", 0644, d_tracer,
4178 &tracing_max_latency, &tracing_max_lat_fops); 4330 &tracing_max_latency, &tracing_max_lat_fops);
4331#endif
4179 4332
4180 trace_create_file("tracing_thresh", 0644, d_tracer, 4333 trace_create_file("tracing_thresh", 0644, d_tracer,
4181 &tracing_thresh, &tracing_max_lat_fops); 4334 &tracing_thresh, &tracing_max_lat_fops);
4182#endif
4183 4335
4184 trace_create_file("README", 0444, d_tracer, 4336 trace_create_file("README", 0444, d_tracer,
4185 NULL, &tracing_readme_fops); 4337 NULL, &tracing_readme_fops);
@@ -4219,7 +4371,7 @@ static int trace_panic_handler(struct notifier_block *this,
4219 unsigned long event, void *unused) 4371 unsigned long event, void *unused)
4220{ 4372{
4221 if (ftrace_dump_on_oops) 4373 if (ftrace_dump_on_oops)
4222 ftrace_dump(); 4374 ftrace_dump(ftrace_dump_on_oops);
4223 return NOTIFY_OK; 4375 return NOTIFY_OK;
4224} 4376}
4225 4377
@@ -4236,7 +4388,7 @@ static int trace_die_handler(struct notifier_block *self,
4236 switch (val) { 4388 switch (val) {
4237 case DIE_OOPS: 4389 case DIE_OOPS:
4238 if (ftrace_dump_on_oops) 4390 if (ftrace_dump_on_oops)
4239 ftrace_dump(); 4391 ftrace_dump(ftrace_dump_on_oops);
4240 break; 4392 break;
4241 default: 4393 default:
4242 break; 4394 break;
@@ -4277,7 +4429,8 @@ trace_printk_seq(struct trace_seq *s)
4277 trace_seq_init(s); 4429 trace_seq_init(s);
4278} 4430}
4279 4431
4280static void __ftrace_dump(bool disable_tracing) 4432static void
4433__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4281{ 4434{
4282 static arch_spinlock_t ftrace_dump_lock = 4435 static arch_spinlock_t ftrace_dump_lock =
4283 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 4436 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -4310,12 +4463,25 @@ static void __ftrace_dump(bool disable_tracing)
4310 /* don't look at user memory in panic mode */ 4463 /* don't look at user memory in panic mode */
4311 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 4464 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
4312 4465
4313 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4314
4315 /* Simulate the iterator */ 4466 /* Simulate the iterator */
4316 iter.tr = &global_trace; 4467 iter.tr = &global_trace;
4317 iter.trace = current_trace; 4468 iter.trace = current_trace;
4318 iter.cpu_file = TRACE_PIPE_ALL_CPU; 4469
4470 switch (oops_dump_mode) {
4471 case DUMP_ALL:
4472 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4473 break;
4474 case DUMP_ORIG:
4475 iter.cpu_file = raw_smp_processor_id();
4476 break;
4477 case DUMP_NONE:
4478 goto out_enable;
4479 default:
4480 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
4481 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4482 }
4483
4484 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4319 4485
4320 /* 4486 /*
4321 * We need to stop all tracing on all CPUS to read the 4487 * We need to stop all tracing on all CPUS to read the
@@ -4354,6 +4520,7 @@ static void __ftrace_dump(bool disable_tracing)
4354 else 4520 else
4355 printk(KERN_TRACE "---------------------------------\n"); 4521 printk(KERN_TRACE "---------------------------------\n");
4356 4522
4523 out_enable:
4357 /* Re-enable tracing if requested */ 4524 /* Re-enable tracing if requested */
4358 if (!disable_tracing) { 4525 if (!disable_tracing) {
4359 trace_flags |= old_userobj; 4526 trace_flags |= old_userobj;
@@ -4370,9 +4537,9 @@ static void __ftrace_dump(bool disable_tracing)
4370} 4537}
4371 4538
4372/* By default: disable tracing after the dump */ 4539/* By default: disable tracing after the dump */
4373void ftrace_dump(void) 4540void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4374{ 4541{
4375 __ftrace_dump(true); 4542 __ftrace_dump(true, oops_dump_mode);
4376} 4543}
4377 4544
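Since ftrace_dump() now takes the dump mode explicitly, in-kernel callers choose between dumping every CPU's buffer and only the current one. A hypothetical debugging hook (assumes the usual declarations from linux/kernel.h are in scope):

	static void my_debug_hook(void)
	{
		if (ftrace_dump_on_oops)
			ftrace_dump(ftrace_dump_on_oops); /* honour the boot/sysctl setting */
		else
			ftrace_dump(DUMP_ORIG);           /* dump this CPU's buffer only */
	}
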
4378__init static int tracer_alloc_buffers(void) 4545__init static int tracer_alloc_buffers(void)
@@ -4387,9 +4554,6 @@ __init static int tracer_alloc_buffers(void)
4387 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4554 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4388 goto out_free_buffer_mask; 4555 goto out_free_buffer_mask;
4389 4556
4390 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4391 goto out_free_tracing_cpumask;
4392
4393 /* To save memory, keep the ring buffer size to its minimum */ 4557 /* To save memory, keep the ring buffer size to its minimum */
4394 if (ring_buffer_expanded) 4558 if (ring_buffer_expanded)
4395 ring_buf_size = trace_buf_size; 4559 ring_buf_size = trace_buf_size;
@@ -4447,8 +4611,6 @@ __init static int tracer_alloc_buffers(void)
4447 return 0; 4611 return 0;
4448 4612
4449out_free_cpumask: 4613out_free_cpumask:
4450 free_cpumask_var(tracing_reader_cpumask);
4451out_free_tracing_cpumask:
4452 free_cpumask_var(tracing_cpumask); 4614 free_cpumask_var(tracing_cpumask);
4453out_free_buffer_mask: 4615out_free_buffer_mask:
4454 free_cpumask_var(tracing_buffer_mask); 4616 free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4df6a77eb196..2cd96399463f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -34,7 +34,6 @@ enum trace_type {
34 TRACE_GRAPH_RET, 34 TRACE_GRAPH_RET,
35 TRACE_GRAPH_ENT, 35 TRACE_GRAPH_ENT,
36 TRACE_USER_STACK, 36 TRACE_USER_STACK,
37 TRACE_HW_BRANCHES,
38 TRACE_KMEM_ALLOC, 37 TRACE_KMEM_ALLOC,
39 TRACE_KMEM_FREE, 38 TRACE_KMEM_FREE,
40 TRACE_BLK, 39 TRACE_BLK,
@@ -103,29 +102,17 @@ struct syscall_trace_exit {
103 long ret; 102 long ret;
104}; 103};
105 104
106struct kprobe_trace_entry { 105struct kprobe_trace_entry_head {
107 struct trace_entry ent; 106 struct trace_entry ent;
108 unsigned long ip; 107 unsigned long ip;
109 int nargs;
110 unsigned long args[];
111}; 108};
112 109
113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ 110struct kretprobe_trace_entry_head {
114 (offsetof(struct kprobe_trace_entry, args) + \
115 (sizeof(unsigned long) * (n)))
116
117struct kretprobe_trace_entry {
118 struct trace_entry ent; 111 struct trace_entry ent;
119 unsigned long func; 112 unsigned long func;
120 unsigned long ret_ip; 113 unsigned long ret_ip;
121 int nargs;
122 unsigned long args[];
123}; 114};
124 115
125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
126 (offsetof(struct kretprobe_trace_entry, args) + \
127 (sizeof(unsigned long) * (n)))
128
129/* 116/*
130 * trace_flag_type is an enumeration that holds different 117 * trace_flag_type is an enumeration that holds different
131 * states when a trace occurs. These are: 118 * states when a trace occurs. These are:
@@ -229,7 +216,6 @@ extern void __ftrace_bad_type(void);
229 TRACE_GRAPH_ENT); \ 216 TRACE_GRAPH_ENT); \
230 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 217 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
231 TRACE_GRAPH_RET); \ 218 TRACE_GRAPH_RET); \
232 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
233 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ 219 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
234 TRACE_KMEM_ALLOC); \ 220 TRACE_KMEM_ALLOC); \
235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 221 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
@@ -378,6 +364,9 @@ void trace_function(struct trace_array *tr,
378 unsigned long ip, 364 unsigned long ip,
379 unsigned long parent_ip, 365 unsigned long parent_ip,
380 unsigned long flags, int pc); 366 unsigned long flags, int pc);
367void trace_default_header(struct seq_file *m);
368void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
369int trace_empty(struct trace_iterator *iter);
381 370
382void trace_graph_return(struct ftrace_graph_ret *trace); 371void trace_graph_return(struct ftrace_graph_ret *trace);
383int trace_graph_entry(struct ftrace_graph_ent *trace); 372int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -396,9 +385,10 @@ extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
396 385
397extern unsigned long nsecs_to_usecs(unsigned long nsecs); 386extern unsigned long nsecs_to_usecs(unsigned long nsecs);
398 387
388extern unsigned long tracing_thresh;
389
399#ifdef CONFIG_TRACER_MAX_TRACE 390#ifdef CONFIG_TRACER_MAX_TRACE
400extern unsigned long tracing_max_latency; 391extern unsigned long tracing_max_latency;
401extern unsigned long tracing_thresh;
402 392
403void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 393void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
404void update_max_tr_single(struct trace_array *tr, 394void update_max_tr_single(struct trace_array *tr,
@@ -415,12 +405,12 @@ void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
415void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 405void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
416 int pc); 406 int pc);
417#else 407#else
418static inline void ftrace_trace_stack(struct trace_array *tr, 408static inline void ftrace_trace_stack(struct ring_buffer *buffer,
419 unsigned long flags, int skip, int pc) 409 unsigned long flags, int skip, int pc)
420{ 410{
421} 411}
422 412
423static inline void ftrace_trace_userstack(struct trace_array *tr, 413static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
424 unsigned long flags, int pc) 414 unsigned long flags, int pc)
425{ 415{
426} 416}
@@ -466,8 +456,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
466 struct trace_array *tr); 456 struct trace_array *tr);
467extern int trace_selftest_startup_branch(struct tracer *trace, 457extern int trace_selftest_startup_branch(struct tracer *trace,
468 struct trace_array *tr); 458 struct trace_array *tr);
469extern int trace_selftest_startup_hw_branches(struct tracer *trace,
470 struct trace_array *tr);
471extern int trace_selftest_startup_ksym(struct tracer *trace, 459extern int trace_selftest_startup_ksym(struct tracer *trace,
472 struct trace_array *tr); 460 struct trace_array *tr);
473#endif /* CONFIG_FTRACE_STARTUP_TEST */ 461#endif /* CONFIG_FTRACE_STARTUP_TEST */
@@ -490,13 +478,34 @@ extern int trace_clock_id;
490 478
491/* Standard output formatting function used for function return traces */ 479/* Standard output formatting function used for function return traces */
492#ifdef CONFIG_FUNCTION_GRAPH_TRACER 480#ifdef CONFIG_FUNCTION_GRAPH_TRACER
493extern enum print_line_t print_graph_function(struct trace_iterator *iter); 481
482/* Flag options */
483#define TRACE_GRAPH_PRINT_OVERRUN 0x1
484#define TRACE_GRAPH_PRINT_CPU 0x2
485#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
486#define TRACE_GRAPH_PRINT_PROC 0x8
487#define TRACE_GRAPH_PRINT_DURATION 0x10
488#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
489
490extern enum print_line_t
491print_graph_function_flags(struct trace_iterator *iter, u32 flags);
492extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
494extern enum print_line_t 493extern enum print_line_t
495trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); 494trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
495extern void graph_trace_open(struct trace_iterator *iter);
496extern void graph_trace_close(struct trace_iterator *iter);
497extern int __trace_graph_entry(struct trace_array *tr,
498 struct ftrace_graph_ent *trace,
499 unsigned long flags, int pc);
500extern void __trace_graph_return(struct trace_array *tr,
501 struct ftrace_graph_ret *trace,
502 unsigned long flags, int pc);
503
496 504
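Moving the TRACE_GRAPH_PRINT_* bits into trace.h lets other tracers drive the shared graph output code with their own option mix via print_graph_function_flags(). A hypothetical print_line callback:

	static enum print_line_t my_tracer_print_line(struct trace_iterator *iter)
	{
		u32 flags = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_PROC |
			    TRACE_GRAPH_PRINT_DURATION;

		return print_graph_function_flags(iter, flags);
	}
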
497#ifdef CONFIG_DYNAMIC_FTRACE 505#ifdef CONFIG_DYNAMIC_FTRACE
498/* TODO: make this variable */ 506/* TODO: make this variable */
499#define FTRACE_GRAPH_MAX_FUNCS 32 507#define FTRACE_GRAPH_MAX_FUNCS 32
508extern int ftrace_graph_filter_enabled;
500extern int ftrace_graph_count; 509extern int ftrace_graph_count;
501extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; 510extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
502 511
@@ -504,7 +513,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
504{ 513{
505 int i; 514 int i;
506 515
507 if (!ftrace_graph_count || test_tsk_trace_graph(current)) 516 if (!ftrace_graph_filter_enabled)
508 return 1; 517 return 1;
509 518
510 for (i = 0; i < ftrace_graph_count; i++) { 519 for (i = 0; i < ftrace_graph_count; i++) {
@@ -522,7 +531,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
522#endif /* CONFIG_DYNAMIC_FTRACE */ 531#endif /* CONFIG_DYNAMIC_FTRACE */
523#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 532#else /* CONFIG_FUNCTION_GRAPH_TRACER */
524static inline enum print_line_t 533static inline enum print_line_t
525print_graph_function(struct trace_iterator *iter) 534print_graph_function_flags(struct trace_iterator *iter, u32 flags)
526{ 535{
527 return TRACE_TYPE_UNHANDLED; 536 return TRACE_TYPE_UNHANDLED;
528} 537}
@@ -549,7 +558,7 @@ static inline int ftrace_trace_task(struct task_struct *task)
 549 * struct trace_parser - serves for reading the user input separated by spaces 558
550 * @cont: set if the input is not complete - no final space char was found 559 * @cont: set if the input is not complete - no final space char was found
551 * @buffer: holds the parsed user input 560 * @buffer: holds the parsed user input
552 * @idx: user input lenght 561 * @idx: user input length
553 * @size: buffer size 562 * @size: buffer size
554 */ 563 */
555struct trace_parser { 564struct trace_parser {
@@ -769,12 +778,15 @@ extern void print_subsystem_event_filter(struct event_subsystem *system,
769 struct trace_seq *s); 778 struct trace_seq *s);
770extern int filter_assign_type(const char *type); 779extern int filter_assign_type(const char *type);
771 780
781struct list_head *
782trace_get_fields(struct ftrace_event_call *event_call);
783
772static inline int 784static inline int
773filter_check_discard(struct ftrace_event_call *call, void *rec, 785filter_check_discard(struct ftrace_event_call *call, void *rec,
774 struct ring_buffer *buffer, 786 struct ring_buffer *buffer,
775 struct ring_buffer_event *event) 787 struct ring_buffer_event *event)
776{ 788{
777 if (unlikely(call->filter_active) && 789 if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
778 !filter_match_preds(call->filter, rec)) { 790 !filter_match_preds(call->filter, rec)) {
779 ring_buffer_discard_commit(buffer, event); 791 ring_buffer_discard_commit(buffer, event);
780 return 1; 792 return 1;
@@ -791,7 +803,8 @@ extern const char *__stop___trace_bprintk_fmt[];
791 803
792#undef FTRACE_ENTRY 804#undef FTRACE_ENTRY
793#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ 805#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
794 extern struct ftrace_event_call event_##call; 806 extern struct ftrace_event_call \
807 __attribute__((__aligned__(4))) event_##call;
795#undef FTRACE_ENTRY_DUP 808#undef FTRACE_ENTRY_DUP
796#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ 809#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
797 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 810 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4a194f08f88c..8d3538b4ea5f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -143,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr)
143} 143}
144 144
145static enum print_line_t trace_branch_print(struct trace_iterator *iter, 145static enum print_line_t trace_branch_print(struct trace_iterator *iter,
146 int flags) 146 int flags, struct trace_event *event)
147{ 147{
148 struct trace_branch *field; 148 struct trace_branch *field;
149 149
@@ -167,9 +167,13 @@ static void branch_print_header(struct seq_file *s)
167 " |\n"); 167 " |\n");
168} 168}
169 169
170static struct trace_event_functions trace_branch_funcs = {
171 .trace = trace_branch_print,
172};
173
170static struct trace_event trace_branch_event = { 174static struct trace_event trace_branch_event = {
171 .type = TRACE_BRANCH, 175 .type = TRACE_BRANCH,
172 .trace = trace_branch_print, 176 .funcs = &trace_branch_funcs,
173}; 177};
174 178
175static struct tracer branch_trace __read_mostly = 179static struct tracer branch_trace __read_mostly =
@@ -307,8 +311,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
307 return -1; 311 return -1;
308 if (percent_a > percent_b) 312 if (percent_a > percent_b)
309 return 1; 313 return 1;
310 else 314
311 return 0; 315 if (a->incorrect < b->incorrect)
316 return -1;
317 if (a->incorrect > b->incorrect)
318 return 1;
319
320 /*
321 * Since the above shows worse (incorrect) cases
322 * first, we continue that by showing best (correct)
323 * cases last.
324 */
325 if (a->correct > b->correct)
326 return -1;
327 if (a->correct < b->correct)
328 return 1;
329
330 return 0;
312} 331}
313 332
314static struct tracer_stat annotated_branch_stats = { 333static struct tracer_stat annotated_branch_stats = {
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 84a3a7ba072a..9d589d8dcd1a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -13,6 +13,7 @@
13 * Tracer plugins will chose a default from these clocks. 13 * Tracer plugins will chose a default from these clocks.
14 */ 14 */
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/irqflags.h>
16#include <linux/hardirq.h> 17#include <linux/hardirq.h>
17#include <linux/module.h> 18#include <linux/module.h>
18#include <linux/percpu.h> 19#include <linux/percpu.h>
@@ -83,7 +84,7 @@ u64 notrace trace_clock_global(void)
83 int this_cpu; 84 int this_cpu;
84 u64 now; 85 u64 now;
85 86
86 raw_local_irq_save(flags); 87 local_irq_save(flags);
87 88
88 this_cpu = raw_smp_processor_id(); 89 this_cpu = raw_smp_processor_id();
89 now = cpu_clock(this_cpu); 90 now = cpu_clock(this_cpu);
@@ -109,7 +110,7 @@ u64 notrace trace_clock_global(void)
109 arch_spin_unlock(&trace_clock_struct.lock); 110 arch_spin_unlock(&trace_clock_struct.lock);
110 111
111 out: 112 out:
112 raw_local_irq_restore(flags); 113 local_irq_restore(flags);
113 114
114 return now; 115 return now;
115} 116}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c16a08f399df..dc008c1240da 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -318,18 +318,6 @@ FTRACE_ENTRY(branch, trace_branch,
318 __entry->func, __entry->file, __entry->correct) 318 __entry->func, __entry->file, __entry->correct)
319); 319);
320 320
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
333FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, 321FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
334 322
335 TRACE_KMEM_ALLOC, 323 TRACE_KMEM_ALLOC,
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
new file mode 100644
index 000000000000..8a2b73f7c068
--- /dev/null
+++ b/kernel/trace/trace_event_perf.c
@@ -0,0 +1,195 @@
1/*
2 * trace event based perf event profiling/tracing
3 *
4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
5 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
6 */
7
8#include <linux/module.h>
9#include <linux/kprobes.h>
10#include "trace.h"
11
12EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
13
14static char *perf_trace_buf[4];
15
16/*
17 * Force it to be aligned to unsigned long to avoid misaligned accesses
18 * suprises
19 */
20typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
21 perf_trace_t;
22
23/* Count the events in use (per event id, not per instance) */
24static int total_ref_count;
25
26static int perf_trace_event_init(struct ftrace_event_call *tp_event,
27 struct perf_event *p_event)
28{
29 struct hlist_head *list;
30 int ret = -ENOMEM;
31 int cpu;
32
33 p_event->tp_event = tp_event;
34 if (tp_event->perf_refcount++ > 0)
35 return 0;
36
37 list = alloc_percpu(struct hlist_head);
38 if (!list)
39 goto fail;
40
41 for_each_possible_cpu(cpu)
42 INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
43
44 tp_event->perf_events = list;
45
46 if (!total_ref_count) {
47 char *buf;
48 int i;
49
50 for (i = 0; i < 4; i++) {
51 buf = (char *)alloc_percpu(perf_trace_t);
52 if (!buf)
53 goto fail;
54
55 perf_trace_buf[i] = buf;
56 }
57 }
58
59 if (tp_event->class->reg)
60 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
61 else
62 ret = tracepoint_probe_register(tp_event->name,
63 tp_event->class->perf_probe,
64 tp_event);
65
66 if (ret)
67 goto fail;
68
69 total_ref_count++;
70 return 0;
71
72fail:
73 if (!total_ref_count) {
74 int i;
75
76 for (i = 0; i < 4; i++) {
77 free_percpu(perf_trace_buf[i]);
78 perf_trace_buf[i] = NULL;
79 }
80 }
81
82 if (!--tp_event->perf_refcount) {
83 free_percpu(tp_event->perf_events);
84 tp_event->perf_events = NULL;
85 }
86
87 return ret;
88}
89
90int perf_trace_init(struct perf_event *p_event)
91{
92 struct ftrace_event_call *tp_event;
93 int event_id = p_event->attr.config;
94 int ret = -EINVAL;
95
96 mutex_lock(&event_mutex);
97 list_for_each_entry(tp_event, &ftrace_events, list) {
98 if (tp_event->event.type == event_id &&
99 tp_event->class &&
100 (tp_event->class->perf_probe ||
101 tp_event->class->reg) &&
102 try_module_get(tp_event->mod)) {
103 ret = perf_trace_event_init(tp_event, p_event);
104 break;
105 }
106 }
107 mutex_unlock(&event_mutex);
108
109 return ret;
110}
111
112int perf_trace_enable(struct perf_event *p_event)
113{
114 struct ftrace_event_call *tp_event = p_event->tp_event;
115 struct hlist_head *list;
116
117 list = tp_event->perf_events;
118 if (WARN_ON_ONCE(!list))
119 return -EINVAL;
120
121 list = this_cpu_ptr(list);
122 hlist_add_head_rcu(&p_event->hlist_entry, list);
123
124 return 0;
125}
126
127void perf_trace_disable(struct perf_event *p_event)
128{
129 hlist_del_rcu(&p_event->hlist_entry);
130}
131
132void perf_trace_destroy(struct perf_event *p_event)
133{
134 struct ftrace_event_call *tp_event = p_event->tp_event;
135 int i;
136
137 mutex_lock(&event_mutex);
138 if (--tp_event->perf_refcount > 0)
139 goto out;
140
141 if (tp_event->class->reg)
142 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
143 else
144 tracepoint_probe_unregister(tp_event->name,
145 tp_event->class->perf_probe,
146 tp_event);
147
148 /*
149 * Ensure our callback won't be called anymore. See
150 * tracepoint_probe_unregister() and __DO_TRACE().
151 */
152 synchronize_sched();
153
154 free_percpu(tp_event->perf_events);
155 tp_event->perf_events = NULL;
156
157 if (!--total_ref_count) {
158 for (i = 0; i < 4; i++) {
159 free_percpu(perf_trace_buf[i]);
160 perf_trace_buf[i] = NULL;
161 }
162 }
163out:
164 mutex_unlock(&event_mutex);
165}
166
167__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
168 struct pt_regs *regs, int *rctxp)
169{
170 struct trace_entry *entry;
171 unsigned long flags;
172 char *raw_data;
173 int pc;
174
175 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
176
177 pc = preempt_count();
178
179 *rctxp = perf_swevent_get_recursion_context();
180 if (*rctxp < 0)
181 return NULL;
182
183 raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
184
185 /* zero the dead bytes from align to not leak stack to user */
186 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
187
188 entry = (struct trace_entry *)raw_data;
189 local_save_flags(flags);
190 tracing_generic_entry_update(entry, flags, pc);
191 entry->type = type;
192
193 return raw_data;
194}
195EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
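
The new trace_event_perf.c keeps one global total_ref_count across all events plus a per-event perf_refcount: the first user allocates the four shared per-cpu buffers, later users only take a reference, and the last one frees them again. A simplified userspace analogue of that first-user-allocates / last-user-frees pattern (plain calloc() buffers instead of alloc_percpu(), no locking, and different error handling than the kernel code):

#include <stdio.h>
#include <stdlib.h>

#define NR_BUFS   4	/* stand-in for the four recursion-context buffers */
#define BUF_SIZE  2048	/* stand-in for PERF_MAX_TRACE_SIZE */

static char *trace_buf[NR_BUFS];
static int total_ref_count;	/* one count shared by all users */

/* First user allocates the shared buffers, later users just take a ref. */
static int trace_buf_get(void)
{
	if (total_ref_count++ > 0)
		return 0;

	for (int i = 0; i < NR_BUFS; i++) {
		trace_buf[i] = calloc(1, BUF_SIZE);
		if (!trace_buf[i]) {
			while (i--) {		/* roll back on failure */
				free(trace_buf[i]);
				trace_buf[i] = NULL;
			}
			total_ref_count--;
			return -1;
		}
	}
	return 0;
}

/* Last user frees them again. */
static void trace_buf_put(void)
{
	if (--total_ref_count)
		return;
	for (int i = 0; i < NR_BUFS; i++) {
		free(trace_buf[i]);
		trace_buf[i] = NULL;
	}
}

int main(void)
{
	trace_buf_get();	/* allocates */
	trace_buf_get();	/* only bumps the count */
	trace_buf_put();
	trace_buf_put();	/* frees */
	printf("refs left: %d\n", total_ref_count);
	return 0;
}
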
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
deleted file mode 100644
index 9e25573242cf..000000000000
--- a/kernel/trace/trace_event_profile.c
+++ /dev/null
@@ -1,122 +0,0 @@
1/*
2 * trace event based perf counter profiling
3 *
4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
5 *
6 */
7
8#include <linux/module.h>
9#include "trace.h"
10
11
12char *perf_trace_buf;
13EXPORT_SYMBOL_GPL(perf_trace_buf);
14
15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
17
18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
19
20/* Count the events in use (per event id, not per instance) */
21static int total_profile_count;
22
23static int ftrace_profile_enable_event(struct ftrace_event_call *event)
24{
25 char *buf;
26 int ret = -ENOMEM;
27
28 if (event->profile_count++ > 0)
29 return 0;
30
31 if (!total_profile_count) {
32 buf = (char *)alloc_percpu(perf_trace_t);
33 if (!buf)
34 goto fail_buf;
35
36 rcu_assign_pointer(perf_trace_buf, buf);
37
38 buf = (char *)alloc_percpu(perf_trace_t);
39 if (!buf)
40 goto fail_buf_nmi;
41
42 rcu_assign_pointer(perf_trace_buf_nmi, buf);
43 }
44
45 ret = event->profile_enable(event);
46 if (!ret) {
47 total_profile_count++;
48 return 0;
49 }
50
51fail_buf_nmi:
52 if (!total_profile_count) {
53 free_percpu(perf_trace_buf_nmi);
54 free_percpu(perf_trace_buf);
55 perf_trace_buf_nmi = NULL;
56 perf_trace_buf = NULL;
57 }
58fail_buf:
59 event->profile_count--;
60
61 return ret;
62}
63
64int ftrace_profile_enable(int event_id)
65{
66 struct ftrace_event_call *event;
67 int ret = -EINVAL;
68
69 mutex_lock(&event_mutex);
70 list_for_each_entry(event, &ftrace_events, list) {
71 if (event->id == event_id && event->profile_enable &&
72 try_module_get(event->mod)) {
73 ret = ftrace_profile_enable_event(event);
74 break;
75 }
76 }
77 mutex_unlock(&event_mutex);
78
79 return ret;
80}
81
82static void ftrace_profile_disable_event(struct ftrace_event_call *event)
83{
84 char *buf, *nmi_buf;
85
86 if (--event->profile_count > 0)
87 return;
88
89 event->profile_disable(event);
90
91 if (!--total_profile_count) {
92 buf = perf_trace_buf;
93 rcu_assign_pointer(perf_trace_buf, NULL);
94
95 nmi_buf = perf_trace_buf_nmi;
96 rcu_assign_pointer(perf_trace_buf_nmi, NULL);
97
98 /*
99 * Ensure every events in profiling have finished before
100 * releasing the buffers
101 */
102 synchronize_sched();
103
104 free_percpu(buf);
105 free_percpu(nmi_buf);
106 }
107}
108
109void ftrace_profile_disable(int event_id)
110{
111 struct ftrace_event_call *event;
112
113 mutex_lock(&event_mutex);
114 list_for_each_entry(event, &ftrace_events, list) {
115 if (event->id == event_id) {
116 ftrace_profile_disable_event(event);
117 module_put(event->mod);
118 break;
119 }
120 }
121 mutex_unlock(&event_mutex);
122}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 189b09baf4fb..53cffc0b0801 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,6 +15,7 @@
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/slab.h>
18#include <linux/delay.h> 19#include <linux/delay.h>
19 20
20#include <asm/setup.h> 21#include <asm/setup.h>
@@ -28,11 +29,23 @@ DEFINE_MUTEX(event_mutex);
28 29
29LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
30 31
32struct list_head *
33trace_get_fields(struct ftrace_event_call *event_call)
34{
35 if (!event_call->class->get_fields)
36 return &event_call->class->fields;
37 return event_call->class->get_fields(event_call);
38}
39
31int trace_define_field(struct ftrace_event_call *call, const char *type, 40int trace_define_field(struct ftrace_event_call *call, const char *type,
32 const char *name, int offset, int size, int is_signed, 41 const char *name, int offset, int size, int is_signed,
33 int filter_type) 42 int filter_type)
34{ 43{
35 struct ftrace_event_field *field; 44 struct ftrace_event_field *field;
45 struct list_head *head;
46
47 if (WARN_ON(!call->class))
48 return 0;
36 49
37 field = kzalloc(sizeof(*field), GFP_KERNEL); 50 field = kzalloc(sizeof(*field), GFP_KERNEL);
38 if (!field) 51 if (!field)
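
trace_get_fields(), added in the hunk above, lets an event either carry its own field list in its class or compute one through an optional class->get_fields() callback. A small userspace sketch of that accessor-with-fallback shape, with invented names (event_get_fields, struct event_class) rather than the kernel types:

#include <stdio.h>

struct field_list {
	const char *name;	/* just enough to print something */
};

/* Hypothetical event class: fields may be fixed, or produced on demand. */
struct event_class {
	struct field_list fields;
	struct field_list *(*get_fields)(struct event_class *cls);
};

struct event_call {
	struct event_class *class;
};

/* Accessor with a fallback, mirroring the shape of trace_get_fields(). */
static struct field_list *event_get_fields(struct event_call *call)
{
	if (!call->class->get_fields)
		return &call->class->fields;
	return call->class->get_fields(call->class);
}

static struct field_list dynamic_fields = { "dynamic" };

static struct field_list *get_dynamic(struct event_class *cls)
{
	(void)cls;
	return &dynamic_fields;
}

int main(void)
{
	struct event_class plain = { .fields = { "static" } };
	struct event_class fancy = { .get_fields = get_dynamic };
	struct event_call a = { &plain }, b = { &fancy };

	printf("%s\n", event_get_fields(&a)->name);
	printf("%s\n", event_get_fields(&b)->name);
	return 0;
}
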
@@ -55,15 +68,14 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
55 field->size = size; 68 field->size = size;
56 field->is_signed = is_signed; 69 field->is_signed = is_signed;
57 70
58 list_add(&field->link, &call->fields); 71 head = trace_get_fields(call);
72 list_add(&field->link, head);
59 73
60 return 0; 74 return 0;
61 75
62err: 76err:
63 if (field) { 77 if (field)
64 kfree(field->name); 78 kfree(field->name);
65 kfree(field->type);
66 }
67 kfree(field); 79 kfree(field);
68 80
69 return -ENOMEM; 81 return -ENOMEM;
@@ -95,8 +107,10 @@ static int trace_define_common_fields(struct ftrace_event_call *call)
95void trace_destroy_fields(struct ftrace_event_call *call) 107void trace_destroy_fields(struct ftrace_event_call *call)
96{ 108{
97 struct ftrace_event_field *field, *next; 109 struct ftrace_event_field *field, *next;
110 struct list_head *head;
98 111
99 list_for_each_entry_safe(field, next, &call->fields, link) { 112 head = trace_get_fields(call);
113 list_for_each_entry_safe(field, next, head, link) {
100 list_del(&field->link); 114 list_del(&field->link);
101 kfree(field->type); 115 kfree(field->type);
102 kfree(field->name); 116 kfree(field->name);
@@ -108,11 +122,9 @@ int trace_event_raw_init(struct ftrace_event_call *call)
108{ 122{
109 int id; 123 int id;
110 124
111 id = register_ftrace_event(call->event); 125 id = register_ftrace_event(&call->event);
112 if (!id) 126 if (!id)
113 return -ENODEV; 127 return -ENODEV;
114 call->id = id;
115 INIT_LIST_HEAD(&call->fields);
116 128
117 return 0; 129 return 0;
118} 130}
@@ -125,23 +137,33 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
125 137
126 switch (enable) { 138 switch (enable) {
127 case 0: 139 case 0:
128 if (call->enabled) { 140 if (call->flags & TRACE_EVENT_FL_ENABLED) {
129 call->enabled = 0; 141 call->flags &= ~TRACE_EVENT_FL_ENABLED;
130 tracing_stop_cmdline_record(); 142 tracing_stop_cmdline_record();
131 call->unregfunc(call); 143 if (call->class->reg)
144 call->class->reg(call, TRACE_REG_UNREGISTER);
145 else
146 tracepoint_probe_unregister(call->name,
147 call->class->probe,
148 call);
132 } 149 }
133 break; 150 break;
134 case 1: 151 case 1:
135 if (!call->enabled) { 152 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
136 tracing_start_cmdline_record(); 153 tracing_start_cmdline_record();
137 ret = call->regfunc(call); 154 if (call->class->reg)
155 ret = call->class->reg(call, TRACE_REG_REGISTER);
156 else
157 ret = tracepoint_probe_register(call->name,
158 call->class->probe,
159 call);
138 if (ret) { 160 if (ret) {
139 tracing_stop_cmdline_record(); 161 tracing_stop_cmdline_record();
140 pr_info("event trace: Could not enable event " 162 pr_info("event trace: Could not enable event "
141 "%s\n", call->name); 163 "%s\n", call->name);
142 break; 164 break;
143 } 165 }
144 call->enabled = 1; 166 call->flags |= TRACE_EVENT_FL_ENABLED;
145 } 167 }
146 break; 168 break;
147 } 169 }
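
ftrace_event_enable_disable() now prefers an optional class->reg() hook and falls back to the generic tracepoint_probe_register()/unregister() path when the hook is absent. A compact userspace sketch of that optional-callback-with-fallback idea, using hypothetical names (event_set_enabled, generic_register):

#include <stdio.h>

enum reg_op { REG_REGISTER, REG_UNREGISTER };

struct event {
	const char *name;
	/* Optional per-class hook; NULL means "use the generic path". */
	int (*reg)(struct event *ev, enum reg_op op);
	unsigned int enabled;
};

static int generic_register(struct event *ev, enum reg_op op)
{
	printf("%s: generic %s\n", ev->name,
	       op == REG_REGISTER ? "register" : "unregister");
	return 0;
}

/* Mirrors the shape of the enable/disable switch after this patch. */
static int event_set_enabled(struct event *ev, int enable)
{
	int (*reg)(struct event *, enum reg_op) =
		ev->reg ? ev->reg : generic_register;

	if (enable && !ev->enabled) {
		int ret = reg(ev, REG_REGISTER);
		if (ret)
			return ret;
		ev->enabled = 1;
	} else if (!enable && ev->enabled) {
		ev->enabled = 0;
		reg(ev, REG_UNREGISTER);
	}
	return 0;
}

static int custom_reg(struct event *ev, enum reg_op op)
{
	printf("%s: custom hook, op=%d\n", ev->name, op);
	return 0;
}

int main(void)
{
	struct event plain = { .name = "plain" };
	struct event special = { .name = "special", .reg = custom_reg };

	event_set_enabled(&plain, 1);
	event_set_enabled(&special, 1);
	event_set_enabled(&special, 0);
	return 0;
}
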
@@ -172,15 +194,16 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
172 mutex_lock(&event_mutex); 194 mutex_lock(&event_mutex);
173 list_for_each_entry(call, &ftrace_events, list) { 195 list_for_each_entry(call, &ftrace_events, list) {
174 196
175 if (!call->name || !call->regfunc) 197 if (!call->name || !call->class ||
198 (!call->class->probe && !call->class->reg))
176 continue; 199 continue;
177 200
178 if (match && 201 if (match &&
179 strcmp(match, call->name) != 0 && 202 strcmp(match, call->name) != 0 &&
180 strcmp(match, call->system) != 0) 203 strcmp(match, call->class->system) != 0)
181 continue; 204 continue;
182 205
183 if (sub && strcmp(sub, call->system) != 0) 206 if (sub && strcmp(sub, call->class->system) != 0)
184 continue; 207 continue;
185 208
186 if (event && strcmp(event, call->name) != 0) 209 if (event && strcmp(event, call->name) != 0)
@@ -298,7 +321,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
298 * The ftrace subsystem is for showing formats only. 321 * The ftrace subsystem is for showing formats only.
299 * They can not be enabled or disabled via the event files. 322 * They can not be enabled or disabled via the event files.
300 */ 323 */
301 if (call->regfunc) 324 if (call->class && (call->class->probe || call->class->reg))
302 return call; 325 return call;
303 } 326 }
304 327
@@ -329,7 +352,7 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
329 (*pos)++; 352 (*pos)++;
330 353
331 list_for_each_entry_continue(call, &ftrace_events, list) { 354 list_for_each_entry_continue(call, &ftrace_events, list) {
332 if (call->enabled) 355 if (call->flags & TRACE_EVENT_FL_ENABLED)
333 return call; 356 return call;
334 } 357 }
335 358
@@ -356,8 +379,8 @@ static int t_show(struct seq_file *m, void *v)
356{ 379{
357 struct ftrace_event_call *call = v; 380 struct ftrace_event_call *call = v;
358 381
359 if (strcmp(call->system, TRACE_SYSTEM) != 0) 382 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
360 seq_printf(m, "%s:", call->system); 383 seq_printf(m, "%s:", call->class->system);
361 seq_printf(m, "%s\n", call->name); 384 seq_printf(m, "%s\n", call->name);
362 385
363 return 0; 386 return 0;
@@ -388,7 +411,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
388 struct ftrace_event_call *call = filp->private_data; 411 struct ftrace_event_call *call = filp->private_data;
389 char *buf; 412 char *buf;
390 413
391 if (call->enabled) 414 if (call->flags & TRACE_EVENT_FL_ENABLED)
392 buf = "1\n"; 415 buf = "1\n";
393 else 416 else
394 buf = "0\n"; 417 buf = "0\n";
@@ -451,10 +474,11 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
451 474
452 mutex_lock(&event_mutex); 475 mutex_lock(&event_mutex);
453 list_for_each_entry(call, &ftrace_events, list) { 476 list_for_each_entry(call, &ftrace_events, list) {
454 if (!call->name || !call->regfunc) 477 if (!call->name || !call->class ||
478 (!call->class->probe && !call->class->reg))
455 continue; 479 continue;
456 480
457 if (system && strcmp(call->system, system) != 0) 481 if (system && strcmp(call->class->system, system) != 0)
458 continue; 482 continue;
459 483
460 /* 484 /*
@@ -462,7 +486,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
462 * or if all events or cleared, or if we have 486 * or if all events or cleared, or if we have
463 * a mixture. 487 * a mixture.
464 */ 488 */
465 set |= (1 << !!call->enabled); 489 set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED));
466 490
467 /* 491 /*
468 * If we have a mixture, no need to look further. 492 * If we have a mixture, no need to look further.
@@ -520,41 +544,17 @@ out:
520 return ret; 544 return ret;
521} 545}
522 546
523extern char *__bad_type_size(void);
524
525#undef FIELD
526#define FIELD(type, name) \
527 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
528 #type, "common_" #name, offsetof(typeof(field), name), \
529 sizeof(field.name), is_signed_type(type)
530
531static int trace_write_header(struct trace_seq *s)
532{
533 struct trace_entry field;
534
535 /* struct trace_entry */
536 return trace_seq_printf(s,
537 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
538 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
539 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
540 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
541 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
542 "\n",
543 FIELD(unsigned short, type),
544 FIELD(unsigned char, flags),
545 FIELD(unsigned char, preempt_count),
546 FIELD(int, pid),
547 FIELD(int, lock_depth));
548}
549
550static ssize_t 547static ssize_t
551event_format_read(struct file *filp, char __user *ubuf, size_t cnt, 548event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
552 loff_t *ppos) 549 loff_t *ppos)
553{ 550{
554 struct ftrace_event_call *call = filp->private_data; 551 struct ftrace_event_call *call = filp->private_data;
552 struct ftrace_event_field *field;
553 struct list_head *head;
555 struct trace_seq *s; 554 struct trace_seq *s;
555 int common_field_count = 5;
556 char *buf; 556 char *buf;
557 int r; 557 int r = 0;
558 558
559 if (*ppos) 559 if (*ppos)
560 return 0; 560 return 0;
@@ -565,14 +565,49 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
565 565
566 trace_seq_init(s); 566 trace_seq_init(s);
567 567
568 /* If any of the first writes fail, so will the show_format. */
569
570 trace_seq_printf(s, "name: %s\n", call->name); 568 trace_seq_printf(s, "name: %s\n", call->name);
571 trace_seq_printf(s, "ID: %d\n", call->id); 569 trace_seq_printf(s, "ID: %d\n", call->event.type);
572 trace_seq_printf(s, "format:\n"); 570 trace_seq_printf(s, "format:\n");
573 trace_write_header(s);
574 571
575 r = call->show_format(call, s); 572 head = trace_get_fields(call);
573 list_for_each_entry_reverse(field, head, link) {
574 /*
575 * Smartly shows the array type (except dynamic array).
576 * Normal:
577 * field:TYPE VAR
578 * If TYPE := TYPE[LEN], it is shown:
579 * field:TYPE VAR[LEN]
580 */
581 const char *array_descriptor = strchr(field->type, '[');
582
583 if (!strncmp(field->type, "__data_loc", 10))
584 array_descriptor = NULL;
585
586 if (!array_descriptor) {
587 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
588 "\tsize:%u;\tsigned:%d;\n",
589 field->type, field->name, field->offset,
590 field->size, !!field->is_signed);
591 } else {
592 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
593 "\tsize:%u;\tsigned:%d;\n",
594 (int)(array_descriptor - field->type),
595 field->type, field->name,
596 array_descriptor, field->offset,
597 field->size, !!field->is_signed);
598 }
599
600 if (--common_field_count == 0)
601 r = trace_seq_printf(s, "\n");
602
603 if (!r)
604 break;
605 }
606
607 if (r)
608 r = trace_seq_printf(s, "\nprint fmt: %s\n",
609 call->print_fmt);
610
576 if (!r) { 611 if (!r) {
577 /* 612 /*
578 * ug! The format output is bigger than a PAGE!! 613 * ug! The format output is bigger than a PAGE!!
@@ -605,7 +640,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
605 return -ENOMEM; 640 return -ENOMEM;
606 641
607 trace_seq_init(s); 642 trace_seq_init(s);
608 trace_seq_printf(s, "%d\n", call->id); 643 trace_seq_printf(s, "%d\n", call->event.type);
609 644
610 r = simple_read_from_buffer(ubuf, cnt, ppos, 645 r = simple_read_from_buffer(ubuf, cnt, ppos,
611 s->buffer, s->len); 646 s->buffer, s->len);
@@ -911,14 +946,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
911 const struct file_operations *filter, 946 const struct file_operations *filter,
912 const struct file_operations *format) 947 const struct file_operations *format)
913{ 948{
949 struct list_head *head;
914 int ret; 950 int ret;
915 951
916 /* 952 /*
917 * If the trace point header did not define TRACE_SYSTEM 953 * If the trace point header did not define TRACE_SYSTEM
918 * then the system would be called "TRACE_SYSTEM". 954 * then the system would be called "TRACE_SYSTEM".
919 */ 955 */
920 if (strcmp(call->system, TRACE_SYSTEM) != 0) 956 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
921 d_events = event_subsystem_dir(call->system, d_events); 957 d_events = event_subsystem_dir(call->class->system, d_events);
922 958
923 call->dir = debugfs_create_dir(call->name, d_events); 959 call->dir = debugfs_create_dir(call->name, d_events);
924 if (!call->dir) { 960 if (!call->dir) {
@@ -927,31 +963,36 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
927 return -1; 963 return -1;
928 } 964 }
929 965
930 if (call->regfunc) 966 if (call->class->probe || call->class->reg)
931 trace_create_file("enable", 0644, call->dir, call, 967 trace_create_file("enable", 0644, call->dir, call,
932 enable); 968 enable);
933 969
934 if (call->id && call->profile_enable) 970#ifdef CONFIG_PERF_EVENTS
971 if (call->event.type && (call->class->perf_probe || call->class->reg))
935 trace_create_file("id", 0444, call->dir, call, 972 trace_create_file("id", 0444, call->dir, call,
936 id); 973 id);
974#endif
937 975
938 if (call->define_fields) { 976 if (call->class->define_fields) {
939 ret = trace_define_common_fields(call); 977 /*
940 if (!ret) 978 * Other events may have the same class. Only update
941 ret = call->define_fields(call); 979 * the fields if they are not already defined.
942 if (ret < 0) { 980 */
943 pr_warning("Could not initialize trace point" 981 head = trace_get_fields(call);
944 " events/%s\n", call->name); 982 if (list_empty(head)) {
945 return ret; 983 ret = trace_define_common_fields(call);
984 if (!ret)
985 ret = call->class->define_fields(call);
986 if (ret < 0) {
987 pr_warning("Could not initialize trace point"
988 " events/%s\n", call->name);
989 return ret;
990 }
946 } 991 }
947 trace_create_file("filter", 0644, call->dir, call, 992 trace_create_file("filter", 0644, call->dir, call,
948 filter); 993 filter);
949 } 994 }
950 995
951 /* A trace may not want to export its format */
952 if (!call->show_format)
953 return 0;
954
955 trace_create_file("format", 0444, call->dir, call, 996 trace_create_file("format", 0444, call->dir, call,
956 format); 997 format);
957 998
@@ -966,8 +1007,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
966 if (!call->name) 1007 if (!call->name)
967 return -EINVAL; 1008 return -EINVAL;
968 1009
969 if (call->raw_init) { 1010 if (call->class->raw_init) {
970 ret = call->raw_init(call); 1011 ret = call->class->raw_init(call);
971 if (ret < 0) { 1012 if (ret < 0) {
972 if (ret != -ENOSYS) 1013 if (ret != -ENOSYS)
973 pr_warning("Could not initialize trace " 1014 pr_warning("Could not initialize trace "
@@ -1031,13 +1072,13 @@ static void remove_subsystem_dir(const char *name)
1031static void __trace_remove_event_call(struct ftrace_event_call *call) 1072static void __trace_remove_event_call(struct ftrace_event_call *call)
1032{ 1073{
1033 ftrace_event_enable_disable(call, 0); 1074 ftrace_event_enable_disable(call, 0);
1034 if (call->event) 1075 if (call->event.funcs)
1035 __unregister_ftrace_event(call->event); 1076 __unregister_ftrace_event(&call->event);
1036 debugfs_remove_recursive(call->dir); 1077 debugfs_remove_recursive(call->dir);
1037 list_del(&call->list); 1078 list_del(&call->list);
1038 trace_destroy_fields(call); 1079 trace_destroy_fields(call);
1039 destroy_preds(call); 1080 destroy_preds(call);
1040 remove_subsystem_dir(call->system); 1081 remove_subsystem_dir(call->class->system);
1041} 1082}
1042 1083
1043/* Remove an event_call */ 1084/* Remove an event_call */
@@ -1128,8 +1169,8 @@ static void trace_module_add_events(struct module *mod)
1128 /* The linker may leave blanks */ 1169 /* The linker may leave blanks */
1129 if (!call->name) 1170 if (!call->name)
1130 continue; 1171 continue;
1131 if (call->raw_init) { 1172 if (call->class->raw_init) {
1132 ret = call->raw_init(call); 1173 ret = call->class->raw_init(call);
1133 if (ret < 0) { 1174 if (ret < 0) {
1134 if (ret != -ENOSYS) 1175 if (ret != -ENOSYS)
1135 pr_warning("Could not initialize trace " 1176 pr_warning("Could not initialize trace "
@@ -1282,8 +1323,8 @@ static __init int event_trace_init(void)
1282 /* The linker may leave blanks */ 1323 /* The linker may leave blanks */
1283 if (!call->name) 1324 if (!call->name)
1284 continue; 1325 continue;
1285 if (call->raw_init) { 1326 if (call->class->raw_init) {
1286 ret = call->raw_init(call); 1327 ret = call->class->raw_init(call);
1287 if (ret < 0) { 1328 if (ret < 0) {
1288 if (ret != -ENOSYS) 1329 if (ret != -ENOSYS)
1289 pr_warning("Could not initialize trace " 1330 pr_warning("Could not initialize trace "
@@ -1384,8 +1425,8 @@ static __init void event_trace_self_tests(void)
1384 1425
1385 list_for_each_entry(call, &ftrace_events, list) { 1426 list_for_each_entry(call, &ftrace_events, list) {
1386 1427
1387 /* Only test those that have a regfunc */ 1428 /* Only test those that have a probe */
1388 if (!call->regfunc) 1429 if (!call->class || !call->class->probe)
1389 continue; 1430 continue;
1390 1431
1391/* 1432/*
@@ -1395,8 +1436,8 @@ static __init void event_trace_self_tests(void)
1395 * syscalls as we test. 1436 * syscalls as we test.
1396 */ 1437 */
1397#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS 1438#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1398 if (call->system && 1439 if (call->class->system &&
1399 strcmp(call->system, "syscalls") == 0) 1440 strcmp(call->class->system, "syscalls") == 0)
1400 continue; 1441 continue;
1401#endif 1442#endif
1402 1443
@@ -1406,7 +1447,7 @@ static __init void event_trace_self_tests(void)
1406 * If an event is already enabled, someone is using 1447 * If an event is already enabled, someone is using
1407 * it and the self test should not be on. 1448 * it and the self test should not be on.
1408 */ 1449 */
1409 if (call->enabled) { 1450 if (call->flags & TRACE_EVENT_FL_ENABLED) {
1410 pr_warning("Enabled event during self test!\n"); 1451 pr_warning("Enabled event during self test!\n");
1411 WARN_ON_ONCE(1); 1452 WARN_ON_ONCE(1);
1412 continue; 1453 continue;
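
With the per-event show_format() and trace_write_header() removed, event_format_read() in this file now derives the events/<name>/format text generically by walking the list returned by trace_get_fields(). The sketch below shows the same metadata-driven formatting in plain C, describing a made-up struct sample_entry through an explicit field table built with offsetof() and sizeof():

#include <stdio.h>
#include <stddef.h>

/* A toy record whose layout we want to describe, like a trace entry. */
struct sample_entry {
	unsigned short	type;
	int		pid;
	char		comm[16];
};

struct field_desc {
	const char	*type;
	const char	*name;
	unsigned int	offset;
	unsigned int	size;
	int		is_signed;
};

/* Field table, ordered the way the kernel walks its field list. */
static const struct field_desc fields[] = {
	{ "unsigned short", "type", offsetof(struct sample_entry, type),
	  sizeof(unsigned short), 0 },
	{ "int", "pid", offsetof(struct sample_entry, pid),
	  sizeof(int), 1 },
	{ "char[16]", "comm", offsetof(struct sample_entry, comm),
	  sizeof(char[16]), 1 },
};

int main(void)
{
	/* Same line shape as the events/<name>/format file. */
	for (size_t i = 0; i < sizeof(fields) / sizeof(fields[0]); i++)
		printf("\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
		       fields[i].type, fields[i].name,
		       fields[i].offset, fields[i].size, fields[i].is_signed);
	return 0;
}

Driving the output from one field list is what lets events with a shared class reuse the same description instead of each carrying its own show_format() callback.
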
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e42af9aad69f..57bb1bb32999 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -22,6 +22,7 @@
22#include <linux/ctype.h> 22#include <linux/ctype.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/perf_event.h> 24#include <linux/perf_event.h>
25#include <linux/slab.h>
25 26
26#include "trace.h" 27#include "trace.h"
27#include "trace_output.h" 28#include "trace_output.h"
@@ -499,8 +500,10 @@ static struct ftrace_event_field *
499find_event_field(struct ftrace_event_call *call, char *name) 500find_event_field(struct ftrace_event_call *call, char *name)
500{ 501{
501 struct ftrace_event_field *field; 502 struct ftrace_event_field *field;
503 struct list_head *head;
502 504
503 list_for_each_entry(field, &call->fields, link) { 505 head = trace_get_fields(call);
506 list_for_each_entry(field, head, link) {
504 if (!strcmp(field->name, name)) 507 if (!strcmp(field->name, name))
505 return field; 508 return field;
506 } 509 }
@@ -544,7 +547,7 @@ static void filter_disable_preds(struct ftrace_event_call *call)
544 struct event_filter *filter = call->filter; 547 struct event_filter *filter = call->filter;
545 int i; 548 int i;
546 549
547 call->filter_active = 0; 550 call->flags &= ~TRACE_EVENT_FL_FILTERED;
548 filter->n_preds = 0; 551 filter->n_preds = 0;
549 552
550 for (i = 0; i < MAX_FILTER_PRED; i++) 553 for (i = 0; i < MAX_FILTER_PRED; i++)
@@ -571,7 +574,7 @@ void destroy_preds(struct ftrace_event_call *call)
571{ 574{
572 __free_preds(call->filter); 575 __free_preds(call->filter);
573 call->filter = NULL; 576 call->filter = NULL;
574 call->filter_active = 0; 577 call->flags &= ~TRACE_EVENT_FL_FILTERED;
575} 578}
576 579
577static struct event_filter *__alloc_preds(void) 580static struct event_filter *__alloc_preds(void)
@@ -610,7 +613,7 @@ static int init_preds(struct ftrace_event_call *call)
610 if (call->filter) 613 if (call->filter)
611 return 0; 614 return 0;
612 615
613 call->filter_active = 0; 616 call->flags &= ~TRACE_EVENT_FL_FILTERED;
614 call->filter = __alloc_preds(); 617 call->filter = __alloc_preds();
615 if (IS_ERR(call->filter)) 618 if (IS_ERR(call->filter))
616 return PTR_ERR(call->filter); 619 return PTR_ERR(call->filter);
@@ -624,10 +627,10 @@ static int init_subsystem_preds(struct event_subsystem *system)
624 int err; 627 int err;
625 628
626 list_for_each_entry(call, &ftrace_events, list) { 629 list_for_each_entry(call, &ftrace_events, list) {
627 if (!call->define_fields) 630 if (!call->class || !call->class->define_fields)
628 continue; 631 continue;
629 632
630 if (strcmp(call->system, system->name) != 0) 633 if (strcmp(call->class->system, system->name) != 0)
631 continue; 634 continue;
632 635
633 err = init_preds(call); 636 err = init_preds(call);
@@ -643,10 +646,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
643 struct ftrace_event_call *call; 646 struct ftrace_event_call *call;
644 647
645 list_for_each_entry(call, &ftrace_events, list) { 648 list_for_each_entry(call, &ftrace_events, list) {
646 if (!call->define_fields) 649 if (!call->class || !call->class->define_fields)
647 continue; 650 continue;
648 651
649 if (strcmp(call->system, system->name) != 0) 652 if (strcmp(call->class->system, system->name) != 0)
650 continue; 653 continue;
651 654
652 filter_disable_preds(call); 655 filter_disable_preds(call);
@@ -1248,10 +1251,10 @@ static int replace_system_preds(struct event_subsystem *system,
1248 list_for_each_entry(call, &ftrace_events, list) { 1251 list_for_each_entry(call, &ftrace_events, list) {
1249 struct event_filter *filter = call->filter; 1252 struct event_filter *filter = call->filter;
1250 1253
1251 if (!call->define_fields) 1254 if (!call->class || !call->class->define_fields)
1252 continue; 1255 continue;
1253 1256
1254 if (strcmp(call->system, system->name) != 0) 1257 if (strcmp(call->class->system, system->name) != 0)
1255 continue; 1258 continue;
1256 1259
1257 /* try to see if the filter can be applied */ 1260 /* try to see if the filter can be applied */
@@ -1265,7 +1268,7 @@ static int replace_system_preds(struct event_subsystem *system,
1265 if (err) 1268 if (err)
1266 filter_disable_preds(call); 1269 filter_disable_preds(call);
1267 else { 1270 else {
1268 call->filter_active = 1; 1271 call->flags |= TRACE_EVENT_FL_FILTERED;
1269 replace_filter_string(filter, filter_string); 1272 replace_filter_string(filter, filter_string);
1270 } 1273 }
1271 fail = false; 1274 fail = false;
@@ -1314,7 +1317,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1314 if (err) 1317 if (err)
1315 append_filter_err(ps, call->filter); 1318 append_filter_err(ps, call->filter);
1316 else 1319 else
1317 call->filter_active = 1; 1320 call->flags |= TRACE_EVENT_FL_FILTERED;
1318out: 1321out:
1319 filter_opstack_clear(ps); 1322 filter_opstack_clear(ps);
1320 postfix_clear(ps); 1323 postfix_clear(ps);
@@ -1371,7 +1374,7 @@ out_unlock:
1371 return err; 1374 return err;
1372} 1375}
1373 1376
1374#ifdef CONFIG_EVENT_PROFILE 1377#ifdef CONFIG_PERF_EVENTS
1375 1378
1376void ftrace_profile_free_filter(struct perf_event *event) 1379void ftrace_profile_free_filter(struct perf_event *event)
1377{ 1380{
@@ -1392,12 +1395,12 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1392 mutex_lock(&event_mutex); 1395 mutex_lock(&event_mutex);
1393 1396
1394 list_for_each_entry(call, &ftrace_events, list) { 1397 list_for_each_entry(call, &ftrace_events, list) {
1395 if (call->id == event_id) 1398 if (call->event.type == event_id)
1396 break; 1399 break;
1397 } 1400 }
1398 1401
1399 err = -EINVAL; 1402 err = -EINVAL;
1400 if (!call) 1403 if (&call->list == &ftrace_events)
1401 goto out_unlock; 1404 goto out_unlock;
1402 1405
1403 err = -EEXIST; 1406 err = -EEXIST;
@@ -1439,5 +1442,5 @@ out_unlock:
1439 return err; 1442 return err;
1440} 1443}
1441 1444
1442#endif /* CONFIG_EVENT_PROFILE */ 1445#endif /* CONFIG_PERF_EVENTS */
1443 1446
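
The ftrace_profile_set_filter() hunk above replaces "if (!call)" with a comparison of &call->list against &ftrace_events, because list_for_each_entry() never leaves its cursor NULL: after a full pass it points at the container computed from the list head itself. A self-contained sketch of that "not found" test, with a stripped-down list implementation (the kernel macro infers the element type with typeof(); here it is passed explicitly to stay in standard C):

#include <stdio.h>
#include <stddef.h>

/* Minimal circular doubly linked list, in the style of <linux/list.h>. */
struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_for_each_entry(pos, head, type, member)			\
	for (pos = container_of((head)->next, type, member);		\
	     &pos->member != (head);					\
	     pos = container_of(pos->member.next, type, member))

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

struct event_call {
	int id;
	struct list_head list;
};

static struct list_head events = LIST_HEAD_INIT(events);

int main(void)
{
	struct event_call a = { .id = 1 }, b = { .id = 2 }, *call;
	int wanted = 3;

	list_add_tail(&a.list, &events);
	list_add_tail(&b.list, &events);

	list_for_each_entry(call, &events, struct event_call, list) {
		if (call->id == wanted)
			break;
	}

	/*
	 * After a full pass the cursor points at the head's container, not
	 * at NULL, so "not found" is detected by comparing list pointers,
	 * just as the patched lookup does.
	 */
	if (&call->list == &events)
		printf("id %d not found\n", wanted);
	else
		printf("found id %d\n", call->id);
	return 0;
}
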
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d4fa5dc1ee4e..8536e2a65969 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \
62 62
63#include "trace_entries.h" 63#include "trace_entries.h"
64 64
65
66#undef __field
67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
70 offsetof(typeof(field), item), \
71 sizeof(field.item), is_signed_type(type)); \
72 if (!ret) \
73 return 0;
74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item), \
81 is_signed_type(type)); \
82 if (!ret) \
83 return 0;
84
85#undef __array
86#define __array(type, item, len) \
87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
89 offsetof(typeof(field), item), \
90 sizeof(field.item), is_signed_type(type)); \
91 if (!ret) \
92 return 0;
93
94#undef __array_desc
95#define __array_desc(type, container, item, len) \
96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
98 offsetof(typeof(field), container.item), \
99 sizeof(field.container.item), \
100 is_signed_type(type)); \
101 if (!ret) \
102 return 0;
103
104#undef __dynamic_array
105#define __dynamic_array(type, item) \
106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
110 if (!ret) \
111 return 0;
112
113#undef F_printk
114#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
115
116#undef __entry
117#define __entry REC
118
119#undef FTRACE_ENTRY
120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
121static int \
122ftrace_format_##name(struct ftrace_event_call *unused, \
123 struct trace_seq *s) \
124{ \
125 struct struct_name field __attribute__((unused)); \
126 int ret = 0; \
127 \
128 tstruct; \
129 \
130 trace_seq_printf(s, "\nprint fmt: " print); \
131 \
132 return ret; \
133}
134
135#include "trace_entries.h"
136
137#undef __field 65#undef __field
138#define __field(type, item) \ 66#define __field(type, item) \
139 ret = trace_define_field(event_call, #type, #item, \ 67 ret = trace_define_field(event_call, #type, #item, \
@@ -175,7 +103,12 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
175 return ret; 103 return ret;
176 104
177#undef __dynamic_array 105#undef __dynamic_array
178#define __dynamic_array(type, item) 106#define __dynamic_array(type, item) \
107 ret = trace_define_field(event_call, #type, #item, \
108 offsetof(typeof(field), item), \
109 0, is_signed_type(type), FILTER_OTHER);\
110 if (ret) \
111 return ret;
179 112
180#undef FTRACE_ENTRY 113#undef FTRACE_ENTRY
181#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 114#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
@@ -194,10 +127,13 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
194 127
195static int ftrace_raw_init_event(struct ftrace_event_call *call) 128static int ftrace_raw_init_event(struct ftrace_event_call *call)
196{ 129{
197 INIT_LIST_HEAD(&call->fields); 130 INIT_LIST_HEAD(&call->class->fields);
198 return 0; 131 return 0;
199} 132}
200 133
134#undef __entry
135#define __entry REC
136
201#undef __field 137#undef __field
202#define __field(type, item) 138#define __field(type, item)
203 139
@@ -213,18 +149,25 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
213#undef __dynamic_array 149#undef __dynamic_array
214#define __dynamic_array(type, item) 150#define __dynamic_array(type, item)
215 151
152#undef F_printk
153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154
216#undef FTRACE_ENTRY 155#undef FTRACE_ENTRY
217#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 156#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
157 \
158struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \
161 .raw_init = ftrace_raw_init_event, \
162}; \
218 \ 163 \
219struct ftrace_event_call __used \ 164struct ftrace_event_call __used \
220__attribute__((__aligned__(4))) \ 165__attribute__((__aligned__(4))) \
221__attribute__((section("_ftrace_events"))) event_##call = { \ 166__attribute__((section("_ftrace_events"))) event_##call = { \
222 .name = #call, \ 167 .name = #call, \
223 .id = type, \ 168 .event.type = etype, \
224 .system = __stringify(TRACE_SYSTEM), \ 169 .class = &event_class_ftrace_##call, \
225 .raw_init = ftrace_raw_init_event, \ 170 .print_fmt = print, \
226 .show_format = ftrace_format_##call, \
227 .define_fields = ftrace_define_fields_##call, \
228}; \ 171}; \
229 172
230#include "trace_entries.h" 173#include "trace_entries.h"
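
trace_export.c relies on re-including trace_entries.h with FTRACE_ENTRY redefined each time, so the same entry list is expanded once into field definitions and once into event and class definitions. The sketch below shows the underlying X-macro technique in isolation; the EVENT_LIST macro and the event names are invented, and the kernel keeps the list in a separate header instead of a macro:

#include <stdio.h>
#include <stddef.h>

/* The "list" that the kernel keeps in trace_entries.h and includes twice. */
#define EVENT_LIST(X)	\
	X(function)	\
	X(wakeup)	\
	X(branch)

struct event_desc {
	const char *name;
};

/* First expansion: emit one descriptor per entry. */
#define DEFINE_EVENT(name) \
	static struct event_desc event_##name = { #name };
EVENT_LIST(DEFINE_EVENT)
#undef DEFINE_EVENT

/* Second expansion over the same list: collect them into a table. */
#define EVENT_PTR(name) &event_##name,
static struct event_desc *events[] = {
	EVENT_LIST(EVENT_PTR)
};
#undef EVENT_PTR

int main(void)
{
	for (size_t i = 0; i < sizeof(events) / sizeof(events[0]); i++)
		printf("event: %s\n", events[i]->name);
	return 0;
}
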
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b1342c5d37cf..79f4bac99a94 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -9,6 +9,7 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/slab.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13 14
14#include "trace.h" 15#include "trace.h"
@@ -18,6 +19,7 @@ struct fgraph_cpu_data {
18 pid_t last_pid; 19 pid_t last_pid;
19 int depth; 20 int depth;
20 int ignore; 21 int ignore;
22 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
21}; 23};
22 24
23struct fgraph_data { 25struct fgraph_data {
@@ -38,7 +40,7 @@ struct fgraph_data {
38#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 40#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
39#define TRACE_GRAPH_PRINT_PROC 0x8 41#define TRACE_GRAPH_PRINT_PROC 0x8
40#define TRACE_GRAPH_PRINT_DURATION 0x10 42#define TRACE_GRAPH_PRINT_DURATION 0x10
41#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
42 44
43static struct tracer_opt trace_opts[] = { 45static struct tracer_opt trace_opts[] = {
44 /* Display overruns? (for self-debug purpose) */ 46 /* Display overruns? (for self-debug purpose) */
@@ -177,7 +179,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
177 return ret; 179 return ret;
178} 180}
179 181
180static int __trace_graph_entry(struct trace_array *tr, 182int __trace_graph_entry(struct trace_array *tr,
181 struct ftrace_graph_ent *trace, 183 struct ftrace_graph_ent *trace,
182 unsigned long flags, 184 unsigned long flags,
183 int pc) 185 int pc)
@@ -187,7 +189,7 @@ static int __trace_graph_entry(struct trace_array *tr,
187 struct ring_buffer *buffer = tr->buffer; 189 struct ring_buffer *buffer = tr->buffer;
188 struct ftrace_graph_ent_entry *entry; 190 struct ftrace_graph_ent_entry *entry;
189 191
190 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 192 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
191 return 0; 193 return 0;
192 194
193 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 195 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -212,13 +214,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
212 int cpu; 214 int cpu;
213 int pc; 215 int pc;
214 216
215 if (unlikely(!tr))
216 return 0;
217
218 if (!ftrace_trace_task(current)) 217 if (!ftrace_trace_task(current))
219 return 0; 218 return 0;
220 219
221 if (!ftrace_graph_addr(trace->func)) 220 /* trace it when it is-nested-in or is a function enabled. */
221 if (!(trace->depth || ftrace_graph_addr(trace->func)))
222 return 0; 222 return 0;
223 223
224 local_irq_save(flags); 224 local_irq_save(flags);
@@ -231,9 +231,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
231 } else { 231 } else {
232 ret = 0; 232 ret = 0;
233 } 233 }
234 /* Only do the atomic if it is not already set */
235 if (!test_tsk_trace_graph(current))
236 set_tsk_trace_graph(current);
237 234
238 atomic_dec(&data->disabled); 235 atomic_dec(&data->disabled);
239 local_irq_restore(flags); 236 local_irq_restore(flags);
@@ -241,7 +238,15 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
241 return ret; 238 return ret;
242} 239}
243 240
244static void __trace_graph_return(struct trace_array *tr, 241int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
242{
243 if (tracing_thresh)
244 return 1;
245 else
246 return trace_graph_entry(trace);
247}
248
249void __trace_graph_return(struct trace_array *tr,
245 struct ftrace_graph_ret *trace, 250 struct ftrace_graph_ret *trace,
246 unsigned long flags, 251 unsigned long flags,
247 int pc) 252 int pc)
@@ -251,7 +256,7 @@ static void __trace_graph_return(struct trace_array *tr,
251 struct ring_buffer *buffer = tr->buffer; 256 struct ring_buffer *buffer = tr->buffer;
252 struct ftrace_graph_ret_entry *entry; 257 struct ftrace_graph_ret_entry *entry;
253 258
254 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 259 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
255 return; 260 return;
256 261
257 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 262 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -281,19 +286,39 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
281 pc = preempt_count(); 286 pc = preempt_count();
282 __trace_graph_return(tr, trace, flags, pc); 287 __trace_graph_return(tr, trace, flags, pc);
283 } 288 }
284 if (!trace->depth)
285 clear_tsk_trace_graph(current);
286 atomic_dec(&data->disabled); 289 atomic_dec(&data->disabled);
287 local_irq_restore(flags); 290 local_irq_restore(flags);
288} 291}
289 292
293void set_graph_array(struct trace_array *tr)
294{
295 graph_array = tr;
296
297 /* Make graph_array visible before we start tracing */
298
299 smp_mb();
300}
301
302void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
303{
304 if (tracing_thresh &&
305 (trace->rettime - trace->calltime < tracing_thresh))
306 return;
307 else
308 trace_graph_return(trace);
309}
310
290static int graph_trace_init(struct trace_array *tr) 311static int graph_trace_init(struct trace_array *tr)
291{ 312{
292 int ret; 313 int ret;
293 314
294 graph_array = tr; 315 set_graph_array(tr);
295 ret = register_ftrace_graph(&trace_graph_return, 316 if (tracing_thresh)
296 &trace_graph_entry); 317 ret = register_ftrace_graph(&trace_graph_thresh_return,
318 &trace_graph_thresh_entry);
319 else
320 ret = register_ftrace_graph(&trace_graph_return,
321 &trace_graph_entry);
297 if (ret) 322 if (ret)
298 return ret; 323 return ret;
299 tracing_start_cmdline_record(); 324 tracing_start_cmdline_record();
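
graph_trace_init() now picks its callbacks once, at registration time: when tracing_thresh is set it installs trace_graph_thresh_entry()/trace_graph_thresh_return(), which drop returns whose duration is below the threshold. A rough userspace sketch of that select-the-handler-up-front idea, with simplified types and no actual tracing:

#include <stdio.h>

static unsigned long long tracing_thresh;	/* 0 means "trace everything" */

struct graph_ret {
	unsigned long long calltime;
	unsigned long long rettime;
};

static void graph_return(struct graph_ret *trace)
{
	printf("duration %llu\n", trace->rettime - trace->calltime);
}

/* Only forward returns whose duration is at or above the threshold. */
static void graph_thresh_return(struct graph_ret *trace)
{
	if (tracing_thresh &&
	    (trace->rettime - trace->calltime < tracing_thresh))
		return;
	graph_return(trace);
}

int main(void)
{
	/* Pick the handler once at init time, like graph_trace_init(). */
	void (*handler)(struct graph_ret *);
	struct graph_ret fast = { 0, 10 }, slow = { 0, 500 };

	tracing_thresh = 100;
	handler = tracing_thresh ? graph_thresh_return : graph_return;

	handler(&fast);		/* filtered out */
	handler(&slow);		/* printed */
	return 0;
}
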
@@ -301,11 +326,6 @@ static int graph_trace_init(struct trace_array *tr)
301 return 0; 326 return 0;
302} 327}
303 328
304void set_graph_array(struct trace_array *tr)
305{
306 graph_array = tr;
307}
308
309static void graph_trace_reset(struct trace_array *tr) 329static void graph_trace_reset(struct trace_array *tr)
310{ 330{
311 tracing_stop_cmdline_record(); 331 tracing_stop_cmdline_record();
@@ -470,9 +490,10 @@ get_return_for_leaf(struct trace_iterator *iter,
470 * We need to consume the current entry to see 490 * We need to consume the current entry to see
471 * the next one. 491 * the next one.
472 */ 492 */
473 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 493 ring_buffer_consume(iter->tr->buffer, iter->cpu,
494 NULL, NULL);
474 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 495 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
475 NULL); 496 NULL, NULL);
476 } 497 }
477 498
478 if (!event) 499 if (!event)
@@ -506,17 +527,18 @@ get_return_for_leaf(struct trace_iterator *iter,
506 527
507/* Signal a overhead of time execution to the output */ 528/* Signal a overhead of time execution to the output */
508static int 529static int
509print_graph_overhead(unsigned long long duration, struct trace_seq *s) 530print_graph_overhead(unsigned long long duration, struct trace_seq *s,
531 u32 flags)
510{ 532{
511 /* If duration disappear, we don't need anything */ 533 /* If duration disappear, we don't need anything */
512 if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) 534 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
513 return 1; 535 return 1;
514 536
515 /* Non nested entry or return */ 537 /* Non nested entry or return */
516 if (duration == -1) 538 if (duration == -1)
517 return trace_seq_printf(s, " "); 539 return trace_seq_printf(s, " ");
518 540
519 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 541 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
520 /* Duration exceeded 100 msecs */ 542 /* Duration exceeded 100 msecs */
521 if (duration > 100000ULL) 543 if (duration > 100000ULL)
522 return trace_seq_printf(s, "! "); 544 return trace_seq_printf(s, "! ");
@@ -542,7 +564,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s)
542 564
543static enum print_line_t 565static enum print_line_t
544print_graph_irq(struct trace_iterator *iter, unsigned long addr, 566print_graph_irq(struct trace_iterator *iter, unsigned long addr,
545 enum trace_type type, int cpu, pid_t pid) 567 enum trace_type type, int cpu, pid_t pid, u32 flags)
546{ 568{
547 int ret; 569 int ret;
548 struct trace_seq *s = &iter->seq; 570 struct trace_seq *s = &iter->seq;
@@ -552,21 +574,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
552 return TRACE_TYPE_UNHANDLED; 574 return TRACE_TYPE_UNHANDLED;
553 575
554 /* Absolute time */ 576 /* Absolute time */
555 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 577 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
556 ret = print_graph_abs_time(iter->ts, s); 578 ret = print_graph_abs_time(iter->ts, s);
557 if (!ret) 579 if (!ret)
558 return TRACE_TYPE_PARTIAL_LINE; 580 return TRACE_TYPE_PARTIAL_LINE;
559 } 581 }
560 582
561 /* Cpu */ 583 /* Cpu */
562 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 584 if (flags & TRACE_GRAPH_PRINT_CPU) {
563 ret = print_graph_cpu(s, cpu); 585 ret = print_graph_cpu(s, cpu);
564 if (ret == TRACE_TYPE_PARTIAL_LINE) 586 if (ret == TRACE_TYPE_PARTIAL_LINE)
565 return TRACE_TYPE_PARTIAL_LINE; 587 return TRACE_TYPE_PARTIAL_LINE;
566 } 588 }
567 589
568 /* Proc */ 590 /* Proc */
569 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 591 if (flags & TRACE_GRAPH_PRINT_PROC) {
570 ret = print_graph_proc(s, pid); 592 ret = print_graph_proc(s, pid);
571 if (ret == TRACE_TYPE_PARTIAL_LINE) 593 if (ret == TRACE_TYPE_PARTIAL_LINE)
572 return TRACE_TYPE_PARTIAL_LINE; 594 return TRACE_TYPE_PARTIAL_LINE;
@@ -576,7 +598,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
576 } 598 }
577 599
578 /* No overhead */ 600 /* No overhead */
579 ret = print_graph_overhead(-1, s); 601 ret = print_graph_overhead(-1, s, flags);
580 if (!ret) 602 if (!ret)
581 return TRACE_TYPE_PARTIAL_LINE; 603 return TRACE_TYPE_PARTIAL_LINE;
582 604
@@ -589,7 +611,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
589 return TRACE_TYPE_PARTIAL_LINE; 611 return TRACE_TYPE_PARTIAL_LINE;
590 612
591 /* Don't close the duration column if haven't one */ 613 /* Don't close the duration column if haven't one */
592 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 614 if (flags & TRACE_GRAPH_PRINT_DURATION)
593 trace_seq_printf(s, " |"); 615 trace_seq_printf(s, " |");
594 ret = trace_seq_printf(s, "\n"); 616 ret = trace_seq_printf(s, "\n");
595 617
@@ -659,7 +681,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
659static enum print_line_t 681static enum print_line_t
660print_graph_entry_leaf(struct trace_iterator *iter, 682print_graph_entry_leaf(struct trace_iterator *iter,
661 struct ftrace_graph_ent_entry *entry, 683 struct ftrace_graph_ent_entry *entry,
662 struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) 684 struct ftrace_graph_ret_entry *ret_entry,
685 struct trace_seq *s, u32 flags)
663{ 686{
664 struct fgraph_data *data = iter->private; 687 struct fgraph_data *data = iter->private;
665 struct ftrace_graph_ret *graph_ret; 688 struct ftrace_graph_ret *graph_ret;
@@ -673,24 +696,30 @@ print_graph_entry_leaf(struct trace_iterator *iter,
673 duration = graph_ret->rettime - graph_ret->calltime; 696 duration = graph_ret->rettime - graph_ret->calltime;
674 697
675 if (data) { 698 if (data) {
699 struct fgraph_cpu_data *cpu_data;
676 int cpu = iter->cpu; 700 int cpu = iter->cpu;
677 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 701
702 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
678 703
679 /* 704 /*
680 * Comments display at + 1 to depth. Since 705 * Comments display at + 1 to depth. Since
681 * this is a leaf function, keep the comments 706 * this is a leaf function, keep the comments
682 * equal to this depth. 707 * equal to this depth.
683 */ 708 */
684 *depth = call->depth - 1; 709 cpu_data->depth = call->depth - 1;
710
711 /* No need to keep this function around for this depth */
712 if (call->depth < FTRACE_RETFUNC_DEPTH)
713 cpu_data->enter_funcs[call->depth] = 0;
685 } 714 }
686 715
687 /* Overhead */ 716 /* Overhead */
688 ret = print_graph_overhead(duration, s); 717 ret = print_graph_overhead(duration, s, flags);
689 if (!ret) 718 if (!ret)
690 return TRACE_TYPE_PARTIAL_LINE; 719 return TRACE_TYPE_PARTIAL_LINE;
691 720
692 /* Duration */ 721 /* Duration */
693 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 722 if (flags & TRACE_GRAPH_PRINT_DURATION) {
694 ret = print_graph_duration(duration, s); 723 ret = print_graph_duration(duration, s);
695 if (ret == TRACE_TYPE_PARTIAL_LINE) 724 if (ret == TRACE_TYPE_PARTIAL_LINE)
696 return TRACE_TYPE_PARTIAL_LINE; 725 return TRACE_TYPE_PARTIAL_LINE;
@@ -713,7 +742,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
713static enum print_line_t 742static enum print_line_t
714print_graph_entry_nested(struct trace_iterator *iter, 743print_graph_entry_nested(struct trace_iterator *iter,
715 struct ftrace_graph_ent_entry *entry, 744 struct ftrace_graph_ent_entry *entry,
716 struct trace_seq *s, int cpu) 745 struct trace_seq *s, int cpu, u32 flags)
717{ 746{
718 struct ftrace_graph_ent *call = &entry->graph_ent; 747 struct ftrace_graph_ent *call = &entry->graph_ent;
719 struct fgraph_data *data = iter->private; 748 struct fgraph_data *data = iter->private;
@@ -721,19 +750,24 @@ print_graph_entry_nested(struct trace_iterator *iter,
721 int i; 750 int i;
722 751
723 if (data) { 752 if (data) {
753 struct fgraph_cpu_data *cpu_data;
724 int cpu = iter->cpu; 754 int cpu = iter->cpu;
725 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
726 755
727 *depth = call->depth; 756 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
757 cpu_data->depth = call->depth;
758
759 /* Save this function pointer to see if the exit matches */
760 if (call->depth < FTRACE_RETFUNC_DEPTH)
761 cpu_data->enter_funcs[call->depth] = call->func;
728 } 762 }
729 763
730 /* No overhead */ 764 /* No overhead */
731 ret = print_graph_overhead(-1, s); 765 ret = print_graph_overhead(-1, s, flags);
732 if (!ret) 766 if (!ret)
733 return TRACE_TYPE_PARTIAL_LINE; 767 return TRACE_TYPE_PARTIAL_LINE;
734 768
735 /* No time */ 769 /* No time */
736 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 770 if (flags & TRACE_GRAPH_PRINT_DURATION) {
737 ret = trace_seq_printf(s, " | "); 771 ret = trace_seq_printf(s, " | ");
738 if (!ret) 772 if (!ret)
739 return TRACE_TYPE_PARTIAL_LINE; 773 return TRACE_TYPE_PARTIAL_LINE;
@@ -759,7 +793,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
759 793
760static enum print_line_t 794static enum print_line_t
761print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, 795print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
762 int type, unsigned long addr) 796 int type, unsigned long addr, u32 flags)
763{ 797{
764 struct fgraph_data *data = iter->private; 798 struct fgraph_data *data = iter->private;
765 struct trace_entry *ent = iter->ent; 799 struct trace_entry *ent = iter->ent;
@@ -772,27 +806,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
772 806
773 if (type) { 807 if (type) {
774 /* Interrupt */ 808 /* Interrupt */
775 ret = print_graph_irq(iter, addr, type, cpu, ent->pid); 809 ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
776 if (ret == TRACE_TYPE_PARTIAL_LINE) 810 if (ret == TRACE_TYPE_PARTIAL_LINE)
777 return TRACE_TYPE_PARTIAL_LINE; 811 return TRACE_TYPE_PARTIAL_LINE;
778 } 812 }
779 813
780 /* Absolute time */ 814 /* Absolute time */
781 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 815 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
782 ret = print_graph_abs_time(iter->ts, s); 816 ret = print_graph_abs_time(iter->ts, s);
783 if (!ret) 817 if (!ret)
784 return TRACE_TYPE_PARTIAL_LINE; 818 return TRACE_TYPE_PARTIAL_LINE;
785 } 819 }
786 820
787 /* Cpu */ 821 /* Cpu */
788 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 822 if (flags & TRACE_GRAPH_PRINT_CPU) {
789 ret = print_graph_cpu(s, cpu); 823 ret = print_graph_cpu(s, cpu);
790 if (ret == TRACE_TYPE_PARTIAL_LINE) 824 if (ret == TRACE_TYPE_PARTIAL_LINE)
791 return TRACE_TYPE_PARTIAL_LINE; 825 return TRACE_TYPE_PARTIAL_LINE;
792 } 826 }
793 827
794 /* Proc */ 828 /* Proc */
795 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 829 if (flags & TRACE_GRAPH_PRINT_PROC) {
796 ret = print_graph_proc(s, ent->pid); 830 ret = print_graph_proc(s, ent->pid);
797 if (ret == TRACE_TYPE_PARTIAL_LINE) 831 if (ret == TRACE_TYPE_PARTIAL_LINE)
798 return TRACE_TYPE_PARTIAL_LINE; 832 return TRACE_TYPE_PARTIAL_LINE;
@@ -814,7 +848,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
814 848
815static enum print_line_t 849static enum print_line_t
816print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 850print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
817 struct trace_iterator *iter) 851 struct trace_iterator *iter, u32 flags)
818{ 852{
819 struct fgraph_data *data = iter->private; 853 struct fgraph_data *data = iter->private;
820 struct ftrace_graph_ent *call = &field->graph_ent; 854 struct ftrace_graph_ent *call = &field->graph_ent;
@@ -822,14 +856,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
822 static enum print_line_t ret; 856 static enum print_line_t ret;
823 int cpu = iter->cpu; 857 int cpu = iter->cpu;
824 858
825 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 859 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
826 return TRACE_TYPE_PARTIAL_LINE; 860 return TRACE_TYPE_PARTIAL_LINE;
827 861
828 leaf_ret = get_return_for_leaf(iter, field); 862 leaf_ret = get_return_for_leaf(iter, field);
829 if (leaf_ret) 863 if (leaf_ret)
830 ret = print_graph_entry_leaf(iter, field, leaf_ret, s); 864 ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
831 else 865 else
832 ret = print_graph_entry_nested(iter, field, s, cpu); 866 ret = print_graph_entry_nested(iter, field, s, cpu, flags);
833 867
834 if (data) { 868 if (data) {
835 /* 869 /*
@@ -848,37 +882,47 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
848 882
849static enum print_line_t 883static enum print_line_t
850print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, 884print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
851 struct trace_entry *ent, struct trace_iterator *iter) 885 struct trace_entry *ent, struct trace_iterator *iter,
886 u32 flags)
852{ 887{
853 unsigned long long duration = trace->rettime - trace->calltime; 888 unsigned long long duration = trace->rettime - trace->calltime;
854 struct fgraph_data *data = iter->private; 889 struct fgraph_data *data = iter->private;
855 pid_t pid = ent->pid; 890 pid_t pid = ent->pid;
856 int cpu = iter->cpu; 891 int cpu = iter->cpu;
892 int func_match = 1;
857 int ret; 893 int ret;
858 int i; 894 int i;
859 895
860 if (data) { 896 if (data) {
897 struct fgraph_cpu_data *cpu_data;
861 int cpu = iter->cpu; 898 int cpu = iter->cpu;
862 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 899
900 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
863 901
864 /* 902 /*
865 * Comments display at + 1 to depth. This is the 903 * Comments display at + 1 to depth. This is the
866 * return from a function, we now want the comments 904 * return from a function, we now want the comments
867 * to display at the same level of the bracket. 905 * to display at the same level of the bracket.
868 */ 906 */
869 *depth = trace->depth - 1; 907 cpu_data->depth = trace->depth - 1;
908
909 if (trace->depth < FTRACE_RETFUNC_DEPTH) {
910 if (cpu_data->enter_funcs[trace->depth] != trace->func)
911 func_match = 0;
912 cpu_data->enter_funcs[trace->depth] = 0;
913 }
870 } 914 }
871 915
872 if (print_graph_prologue(iter, s, 0, 0)) 916 if (print_graph_prologue(iter, s, 0, 0, flags))
873 return TRACE_TYPE_PARTIAL_LINE; 917 return TRACE_TYPE_PARTIAL_LINE;
874 918
875 /* Overhead */ 919 /* Overhead */
876 ret = print_graph_overhead(duration, s); 920 ret = print_graph_overhead(duration, s, flags);
877 if (!ret) 921 if (!ret)
878 return TRACE_TYPE_PARTIAL_LINE; 922 return TRACE_TYPE_PARTIAL_LINE;
879 923
880 /* Duration */ 924 /* Duration */
881 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 925 if (flags & TRACE_GRAPH_PRINT_DURATION) {
882 ret = print_graph_duration(duration, s); 926 ret = print_graph_duration(duration, s);
883 if (ret == TRACE_TYPE_PARTIAL_LINE) 927 if (ret == TRACE_TYPE_PARTIAL_LINE)
884 return TRACE_TYPE_PARTIAL_LINE; 928 return TRACE_TYPE_PARTIAL_LINE;
@@ -891,19 +935,32 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
891 return TRACE_TYPE_PARTIAL_LINE; 935 return TRACE_TYPE_PARTIAL_LINE;
892 } 936 }
893 937
894 ret = trace_seq_printf(s, "}\n"); 938 /*
895 if (!ret) 939 * If the return function does not have a matching entry,
896 return TRACE_TYPE_PARTIAL_LINE; 940 * then the entry was lost. Instead of just printing
941 * the '}' and letting the user guess what function this
942 * belongs to, write out the function name.
943 */
944 if (func_match) {
945 ret = trace_seq_printf(s, "}\n");
946 if (!ret)
947 return TRACE_TYPE_PARTIAL_LINE;
948 } else {
949 ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
950 if (!ret)
951 return TRACE_TYPE_PARTIAL_LINE;
952 }
897 953
898 /* Overrun */ 954 /* Overrun */
899 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 955 if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
900 ret = trace_seq_printf(s, " (Overruns: %lu)\n", 956 ret = trace_seq_printf(s, " (Overruns: %lu)\n",
901 trace->overrun); 957 trace->overrun);
902 if (!ret) 958 if (!ret)
903 return TRACE_TYPE_PARTIAL_LINE; 959 return TRACE_TYPE_PARTIAL_LINE;
904 } 960 }
905 961
906 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); 962 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
963 cpu, pid, flags);
907 if (ret == TRACE_TYPE_PARTIAL_LINE) 964 if (ret == TRACE_TYPE_PARTIAL_LINE)
908 return TRACE_TYPE_PARTIAL_LINE; 965 return TRACE_TYPE_PARTIAL_LINE;
909 966
@@ -911,8 +968,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
911} 968}
912 969
913static enum print_line_t 970static enum print_line_t
914print_graph_comment(struct trace_seq *s, struct trace_entry *ent, 971print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
915 struct trace_iterator *iter) 972 struct trace_iterator *iter, u32 flags)
916{ 973{
917 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 974 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
918 struct fgraph_data *data = iter->private; 975 struct fgraph_data *data = iter->private;
@@ -924,16 +981,16 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
924 if (data) 981 if (data)
925 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; 982 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
926 983
927 if (print_graph_prologue(iter, s, 0, 0)) 984 if (print_graph_prologue(iter, s, 0, 0, flags))
928 return TRACE_TYPE_PARTIAL_LINE; 985 return TRACE_TYPE_PARTIAL_LINE;
929 986
930 /* No overhead */ 987 /* No overhead */
931 ret = print_graph_overhead(-1, s); 988 ret = print_graph_overhead(-1, s, flags);
932 if (!ret) 989 if (!ret)
933 return TRACE_TYPE_PARTIAL_LINE; 990 return TRACE_TYPE_PARTIAL_LINE;
934 991
935 /* No time */ 992 /* No time */
936 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 993 if (flags & TRACE_GRAPH_PRINT_DURATION) {
937 ret = trace_seq_printf(s, " | "); 994 ret = trace_seq_printf(s, " | ");
938 if (!ret) 995 if (!ret)
939 return TRACE_TYPE_PARTIAL_LINE; 996 return TRACE_TYPE_PARTIAL_LINE;
@@ -968,7 +1025,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
968 if (!event) 1025 if (!event)
969 return TRACE_TYPE_UNHANDLED; 1026 return TRACE_TYPE_UNHANDLED;
970 1027
971 ret = event->trace(iter, sym_flags); 1028 ret = event->funcs->trace(iter, sym_flags, event);
972 if (ret != TRACE_TYPE_HANDLED) 1029 if (ret != TRACE_TYPE_HANDLED)
973 return ret; 1030 return ret;
974 } 1031 }
@@ -988,7 +1045,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
988 1045
989 1046
990enum print_line_t 1047enum print_line_t
991print_graph_function(struct trace_iterator *iter) 1048print_graph_function_flags(struct trace_iterator *iter, u32 flags)
992{ 1049{
993 struct ftrace_graph_ent_entry *field; 1050 struct ftrace_graph_ent_entry *field;
994 struct fgraph_data *data = iter->private; 1051 struct fgraph_data *data = iter->private;
@@ -1009,7 +1066,7 @@ print_graph_function(struct trace_iterator *iter)
1009 if (data && data->failed) { 1066 if (data && data->failed) {
1010 field = &data->ent; 1067 field = &data->ent;
1011 iter->cpu = data->cpu; 1068 iter->cpu = data->cpu;
1012 ret = print_graph_entry(field, s, iter); 1069 ret = print_graph_entry(field, s, iter, flags);
1013 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { 1070 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1014 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; 1071 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1015 ret = TRACE_TYPE_NO_CONSUME; 1072 ret = TRACE_TYPE_NO_CONSUME;
@@ -1029,32 +1086,50 @@ print_graph_function(struct trace_iterator *iter)
1029 struct ftrace_graph_ent_entry saved; 1086 struct ftrace_graph_ent_entry saved;
1030 trace_assign_type(field, entry); 1087 trace_assign_type(field, entry);
1031 saved = *field; 1088 saved = *field;
1032 return print_graph_entry(&saved, s, iter); 1089 return print_graph_entry(&saved, s, iter, flags);
1033 } 1090 }
1034 case TRACE_GRAPH_RET: { 1091 case TRACE_GRAPH_RET: {
1035 struct ftrace_graph_ret_entry *field; 1092 struct ftrace_graph_ret_entry *field;
1036 trace_assign_type(field, entry); 1093 trace_assign_type(field, entry);
1037 return print_graph_return(&field->ret, s, entry, iter); 1094 return print_graph_return(&field->ret, s, entry, iter, flags);
1038 } 1095 }
1096 case TRACE_STACK:
1097 case TRACE_FN:
 1098	/* don't trace stack and functions as comments */
1099 return TRACE_TYPE_UNHANDLED;
1100
1039 default: 1101 default:
1040 return print_graph_comment(s, entry, iter); 1102 return print_graph_comment(s, entry, iter, flags);
1041 } 1103 }
1042 1104
1043 return TRACE_TYPE_HANDLED; 1105 return TRACE_TYPE_HANDLED;
1044} 1106}
1045 1107
1046static void print_lat_header(struct seq_file *s) 1108static enum print_line_t
1109print_graph_function(struct trace_iterator *iter)
1110{
1111 return print_graph_function_flags(iter, tracer_flags.val);
1112}
1113
1114static enum print_line_t
1115print_graph_function_event(struct trace_iterator *iter, int flags,
1116 struct trace_event *event)
1117{
1118 return print_graph_function(iter);
1119}
1120
1121static void print_lat_header(struct seq_file *s, u32 flags)
1047{ 1122{
1048 static const char spaces[] = " " /* 16 spaces */ 1123 static const char spaces[] = " " /* 16 spaces */
1049 " " /* 4 spaces */ 1124 " " /* 4 spaces */
1050 " "; /* 17 spaces */ 1125 " "; /* 17 spaces */
1051 int size = 0; 1126 int size = 0;
1052 1127
1053 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1128 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1054 size += 16; 1129 size += 16;
1055 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1130 if (flags & TRACE_GRAPH_PRINT_CPU)
1056 size += 4; 1131 size += 4;
1057 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1132 if (flags & TRACE_GRAPH_PRINT_PROC)
1058 size += 17; 1133 size += 17;
1059 1134
1060 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); 1135 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
@@ -1065,43 +1140,48 @@ static void print_lat_header(struct seq_file *s)
1065 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1140 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1066} 1141}
1067 1142
1068static void print_graph_headers(struct seq_file *s) 1143void print_graph_headers_flags(struct seq_file *s, u32 flags)
1069{ 1144{
1070 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1145 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1071 1146
1072 if (lat) 1147 if (lat)
1073 print_lat_header(s); 1148 print_lat_header(s, flags);
1074 1149
1075 /* 1st line */ 1150 /* 1st line */
1076 seq_printf(s, "#"); 1151 seq_printf(s, "#");
1077 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1152 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1078 seq_printf(s, " TIME "); 1153 seq_printf(s, " TIME ");
1079 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1154 if (flags & TRACE_GRAPH_PRINT_CPU)
1080 seq_printf(s, " CPU"); 1155 seq_printf(s, " CPU");
1081 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1156 if (flags & TRACE_GRAPH_PRINT_PROC)
1082 seq_printf(s, " TASK/PID "); 1157 seq_printf(s, " TASK/PID ");
1083 if (lat) 1158 if (lat)
1084 seq_printf(s, "|||||"); 1159 seq_printf(s, "|||||");
1085 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1160 if (flags & TRACE_GRAPH_PRINT_DURATION)
1086 seq_printf(s, " DURATION "); 1161 seq_printf(s, " DURATION ");
1087 seq_printf(s, " FUNCTION CALLS\n"); 1162 seq_printf(s, " FUNCTION CALLS\n");
1088 1163
1089 /* 2nd line */ 1164 /* 2nd line */
1090 seq_printf(s, "#"); 1165 seq_printf(s, "#");
1091 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1166 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1092 seq_printf(s, " | "); 1167 seq_printf(s, " | ");
1093 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1168 if (flags & TRACE_GRAPH_PRINT_CPU)
1094 seq_printf(s, " | "); 1169 seq_printf(s, " | ");
1095 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1170 if (flags & TRACE_GRAPH_PRINT_PROC)
1096 seq_printf(s, " | | "); 1171 seq_printf(s, " | | ");
1097 if (lat) 1172 if (lat)
1098 seq_printf(s, "|||||"); 1173 seq_printf(s, "|||||");
1099 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1174 if (flags & TRACE_GRAPH_PRINT_DURATION)
1100 seq_printf(s, " | | "); 1175 seq_printf(s, " | | ");
1101 seq_printf(s, " | | | |\n"); 1176 seq_printf(s, " | | | |\n");
1102} 1177}
1103 1178
1104static void graph_trace_open(struct trace_iterator *iter) 1179void print_graph_headers(struct seq_file *s)
1180{
1181 print_graph_headers_flags(s, tracer_flags.val);
1182}
1183
1184void graph_trace_open(struct trace_iterator *iter)
1105{ 1185{
1106 /* pid and depth on the last trace processed */ 1186 /* pid and depth on the last trace processed */
1107 struct fgraph_data *data; 1187 struct fgraph_data *data;
@@ -1136,7 +1216,7 @@ static void graph_trace_open(struct trace_iterator *iter)
1136 pr_warning("function graph tracer: not enough memory\n"); 1216 pr_warning("function graph tracer: not enough memory\n");
1137} 1217}
1138 1218
1139static void graph_trace_close(struct trace_iterator *iter) 1219void graph_trace_close(struct trace_iterator *iter)
1140{ 1220{
1141 struct fgraph_data *data = iter->private; 1221 struct fgraph_data *data = iter->private;
1142 1222
@@ -1146,6 +1226,20 @@ static void graph_trace_close(struct trace_iterator *iter)
1146 } 1226 }
1147} 1227}
1148 1228
1229static struct trace_event_functions graph_functions = {
1230 .trace = print_graph_function_event,
1231};
1232
1233static struct trace_event graph_trace_entry_event = {
1234 .type = TRACE_GRAPH_ENT,
1235 .funcs = &graph_functions,
1236};
1237
1238static struct trace_event graph_trace_ret_event = {
1239 .type = TRACE_GRAPH_RET,
1240 .funcs = &graph_functions
1241};
1242
1149static struct tracer graph_trace __read_mostly = { 1243static struct tracer graph_trace __read_mostly = {
1150 .name = "function_graph", 1244 .name = "function_graph",
1151 .open = graph_trace_open, 1245 .open = graph_trace_open,
@@ -1167,6 +1261,16 @@ static __init int init_graph_trace(void)
1167{ 1261{
1168 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); 1262 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1169 1263
1264 if (!register_ftrace_event(&graph_trace_entry_event)) {
1265 pr_warning("Warning: could not register graph trace events\n");
1266 return 1;
1267 }
1268
1269 if (!register_ftrace_event(&graph_trace_ret_event)) {
1270 pr_warning("Warning: could not register graph trace events\n");
1271 return 1;
1272 }
1273
1170 return register_tracer(&graph_trace); 1274 return register_tracer(&graph_trace);
1171} 1275}
1172 1276
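
The hunks above turn the function-graph output and header routines into flag-parameterised helpers (print_graph_function_flags(), print_graph_headers_flags(), plus the now-exported graph_trace_open()/graph_trace_close()), so other tracers can reuse the graph formatting with their own option mask instead of the graph tracer's tracer_flags.val. A minimal sketch of a hypothetical consumer, mirroring what the trace_irqsoff.c changes further down actually do (the MY_GRAPH_FLAGS name and my_tracer_* callbacks are illustrative only):

#define MY_GRAPH_FLAGS	(TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_PROC | \
			 TRACE_GRAPH_PRINT_DURATION)

static enum print_line_t my_tracer_print_line(struct trace_iterator *iter)
{
	/* Render graph entries with this tracer's options, not tracer_flags.val */
	return print_graph_function_flags(iter, MY_GRAPH_FLAGS);
}

static void my_tracer_print_header(struct seq_file *s)
{
	/* Header columns must match the flags used for the trace body */
	print_graph_headers_flags(s, MY_GRAPH_FLAGS);
}
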
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
deleted file mode 100644
index 7b97000745f5..000000000000
--- a/kernel/trace/trace_hw_branches.c
+++ /dev/null
@@ -1,312 +0,0 @@
1/*
2 * h/w branch tracer for x86 based on BTS
3 *
4 * Copyright (C) 2008-2009 Intel Corporation.
5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
6 */
7#include <linux/kallsyms.h>
8#include <linux/debugfs.h>
9#include <linux/ftrace.h>
10#include <linux/module.h>
11#include <linux/cpu.h>
12#include <linux/smp.h>
13#include <linux/fs.h>
14
15#include <asm/ds.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
20
21#define BTS_BUFFER_SIZE (1 << 13)
22
23static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
25
26#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
27
28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly;
30static struct trace_array *hw_branch_trace __read_mostly;
31
32
33static void bts_trace_init_cpu(int cpu)
34{
35 per_cpu(hwb_tracer, cpu) =
36 ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
37 BTS_BUFFER_SIZE, NULL, (size_t)-1,
38 BTS_KERNEL);
39
40 if (IS_ERR(per_cpu(hwb_tracer, cpu)))
41 per_cpu(hwb_tracer, cpu) = NULL;
42}
43
44static int bts_trace_init(struct trace_array *tr)
45{
46 int cpu;
47
48 hw_branch_trace = tr;
49 trace_hw_branches_enabled = 0;
50
51 get_online_cpus();
52 for_each_online_cpu(cpu) {
53 bts_trace_init_cpu(cpu);
54
55 if (likely(per_cpu(hwb_tracer, cpu)))
56 trace_hw_branches_enabled = 1;
57 }
58 trace_hw_branches_suspended = 0;
59 put_online_cpus();
60
61 /* If we could not enable tracing on a single cpu, we fail. */
62 return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
63}
64
65static void bts_trace_reset(struct trace_array *tr)
66{
67 int cpu;
68
69 get_online_cpus();
70 for_each_online_cpu(cpu) {
71 if (likely(per_cpu(hwb_tracer, cpu))) {
72 ds_release_bts(per_cpu(hwb_tracer, cpu));
73 per_cpu(hwb_tracer, cpu) = NULL;
74 }
75 }
76 trace_hw_branches_enabled = 0;
77 trace_hw_branches_suspended = 0;
78 put_online_cpus();
79}
80
81static void bts_trace_start(struct trace_array *tr)
82{
83 int cpu;
84
85 get_online_cpus();
86 for_each_online_cpu(cpu)
87 if (likely(per_cpu(hwb_tracer, cpu)))
88 ds_resume_bts(per_cpu(hwb_tracer, cpu));
89 trace_hw_branches_suspended = 0;
90 put_online_cpus();
91}
92
93static void bts_trace_stop(struct trace_array *tr)
94{
95 int cpu;
96
97 get_online_cpus();
98 for_each_online_cpu(cpu)
99 if (likely(per_cpu(hwb_tracer, cpu)))
100 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
101 trace_hw_branches_suspended = 1;
102 put_online_cpus();
103}
104
105static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
106 unsigned long action, void *hcpu)
107{
108 int cpu = (long)hcpu;
109
110 switch (action) {
111 case CPU_ONLINE:
112 case CPU_DOWN_FAILED:
113 /* The notification is sent with interrupts enabled. */
114 if (trace_hw_branches_enabled) {
115 bts_trace_init_cpu(cpu);
116
117 if (trace_hw_branches_suspended &&
118 likely(per_cpu(hwb_tracer, cpu)))
119 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
120 }
121 break;
122
123 case CPU_DOWN_PREPARE:
124 /* The notification is sent with interrupts enabled. */
125 if (likely(per_cpu(hwb_tracer, cpu))) {
126 ds_release_bts(per_cpu(hwb_tracer, cpu));
127 per_cpu(hwb_tracer, cpu) = NULL;
128 }
129 }
130
131 return NOTIFY_DONE;
132}
133
134static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
135 .notifier_call = bts_hotcpu_handler
136};
137
138static void bts_trace_print_header(struct seq_file *m)
139{
140 seq_puts(m, "# CPU# TO <- FROM\n");
141}
142
143static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
144{
145 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
146 struct trace_entry *entry = iter->ent;
147 struct trace_seq *seq = &iter->seq;
148 struct hw_branch_entry *it;
149
150 trace_assign_type(it, entry);
151
152 if (entry->type == TRACE_HW_BRANCHES) {
153 if (trace_seq_printf(seq, "%4d ", iter->cpu) &&
154 seq_print_ip_sym(seq, it->to, symflags) &&
155 trace_seq_printf(seq, "\t <- ") &&
156 seq_print_ip_sym(seq, it->from, symflags) &&
157 trace_seq_printf(seq, "\n"))
158 return TRACE_TYPE_HANDLED;
159 return TRACE_TYPE_PARTIAL_LINE;
160 }
161 return TRACE_TYPE_UNHANDLED;
162}
163
164void trace_hw_branch(u64 from, u64 to)
165{
166 struct ftrace_event_call *call = &event_hw_branch;
167 struct trace_array *tr = hw_branch_trace;
168 struct ring_buffer_event *event;
169 struct ring_buffer *buf;
170 struct hw_branch_entry *entry;
171 unsigned long irq1;
172 int cpu;
173
174 if (unlikely(!tr))
175 return;
176
177 if (unlikely(!trace_hw_branches_enabled))
178 return;
179
180 local_irq_save(irq1);
181 cpu = raw_smp_processor_id();
182 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
183 goto out;
184
185 buf = tr->buffer;
186 event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
187 sizeof(*entry), 0, 0);
188 if (!event)
189 goto out;
190 entry = ring_buffer_event_data(event);
191 tracing_generic_entry_update(&entry->ent, 0, from);
192 entry->ent.type = TRACE_HW_BRANCHES;
193 entry->from = from;
194 entry->to = to;
195 if (!filter_check_discard(call, entry, buf, event))
196 trace_buffer_unlock_commit(buf, event, 0, 0);
197
198 out:
199 atomic_dec(&tr->data[cpu]->disabled);
200 local_irq_restore(irq1);
201}
202
203static void trace_bts_at(const struct bts_trace *trace, void *at)
204{
205 struct bts_struct bts;
206 int err = 0;
207
208 WARN_ON_ONCE(!trace->read);
209 if (!trace->read)
210 return;
211
212 err = trace->read(this_tracer, at, &bts);
213 if (err < 0)
214 return;
215
216 switch (bts.qualifier) {
217 case BTS_BRANCH:
218 trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
219 break;
220 }
221}
222
223/*
224 * Collect the trace on the current cpu and write it into the ftrace buffer.
225 *
226 * pre: tracing must be suspended on the current cpu
227 */
228static void trace_bts_cpu(void *arg)
229{
230 struct trace_array *tr = (struct trace_array *)arg;
231 const struct bts_trace *trace;
232 unsigned char *at;
233
234 if (unlikely(!tr))
235 return;
236
237 if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
238 return;
239
240 if (unlikely(!this_tracer))
241 return;
242
243 trace = ds_read_bts(this_tracer);
244 if (!trace)
245 return;
246
247 for (at = trace->ds.top; (void *)at < trace->ds.end;
248 at += trace->ds.size)
249 trace_bts_at(trace, at);
250
251 for (at = trace->ds.begin; (void *)at < trace->ds.top;
252 at += trace->ds.size)
253 trace_bts_at(trace, at);
254}
255
256static void trace_bts_prepare(struct trace_iterator *iter)
257{
258 int cpu;
259
260 get_online_cpus();
261 for_each_online_cpu(cpu)
262 if (likely(per_cpu(hwb_tracer, cpu)))
263 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
264 /*
265 * We need to collect the trace on the respective cpu since ftrace
266 * implicitly adds the record for the current cpu.
267 * Once that is more flexible, we could collect the data from any cpu.
268 */
269 on_each_cpu(trace_bts_cpu, iter->tr, 1);
270
271 for_each_online_cpu(cpu)
272 if (likely(per_cpu(hwb_tracer, cpu)))
273 ds_resume_bts(per_cpu(hwb_tracer, cpu));
274 put_online_cpus();
275}
276
277static void trace_bts_close(struct trace_iterator *iter)
278{
279 tracing_reset_online_cpus(iter->tr);
280}
281
282void trace_hw_branch_oops(void)
283{
284 if (this_tracer) {
285 ds_suspend_bts_noirq(this_tracer);
286 trace_bts_cpu(hw_branch_trace);
287 ds_resume_bts_noirq(this_tracer);
288 }
289}
290
291struct tracer bts_tracer __read_mostly =
292{
293 .name = "hw-branch-tracer",
294 .init = bts_trace_init,
295 .reset = bts_trace_reset,
296 .print_header = bts_trace_print_header,
297 .print_line = bts_trace_print_line,
298 .start = bts_trace_start,
299 .stop = bts_trace_stop,
300 .open = trace_bts_prepare,
301 .close = trace_bts_close,
302#ifdef CONFIG_FTRACE_SELFTEST
303 .selftest = trace_selftest_startup_hw_branches,
304#endif /* CONFIG_FTRACE_SELFTEST */
305};
306
307__init static int init_bts_trace(void)
308{
309 register_hotcpu_notifier(&bts_hotcpu_notifier);
310 return register_tracer(&bts_tracer);
311}
312device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2974bc7538c7..6fd486e0cef4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -34,6 +34,9 @@ static int trace_type __read_mostly;
34 34
35static int save_lat_flag; 35static int save_lat_flag;
36 36
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph);
39
37#ifdef CONFIG_PREEMPT_TRACER 40#ifdef CONFIG_PREEMPT_TRACER
38static inline int 41static inline int
39preempt_trace(void) 42preempt_trace(void)
@@ -55,6 +58,23 @@ irq_trace(void)
55# define irq_trace() (0) 58# define irq_trace() (0)
56#endif 59#endif
57 60
61#define TRACE_DISPLAY_GRAPH 1
62
63static struct tracer_opt trace_opts[] = {
64#ifdef CONFIG_FUNCTION_GRAPH_TRACER
65 /* display latency trace as call graph */
66 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
67#endif
68 { } /* Empty entry */
69};
70
71static struct tracer_flags tracer_flags = {
72 .val = 0,
73 .opts = trace_opts,
74};
75
76#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
77
58/* 78/*
59 * Sequence count - we record it when starting a measurement and 79 * Sequence count - we record it when starting a measurement and
60 * skip the latency if the sequence has changed - some other section 80 * skip the latency if the sequence has changed - some other section
@@ -108,6 +128,202 @@ static struct ftrace_ops trace_ops __read_mostly =
108}; 128};
109#endif /* CONFIG_FUNCTION_TRACER */ 129#endif /* CONFIG_FUNCTION_TRACER */
110 130
131#ifdef CONFIG_FUNCTION_GRAPH_TRACER
132static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
133{
134 int cpu;
135
136 if (!(bit & TRACE_DISPLAY_GRAPH))
137 return -EINVAL;
138
139 if (!(is_graph() ^ set))
140 return 0;
141
142 stop_irqsoff_tracer(irqsoff_trace, !set);
143
144 for_each_possible_cpu(cpu)
145 per_cpu(tracing_cpu, cpu) = 0;
146
147 tracing_max_latency = 0;
148 tracing_reset_online_cpus(irqsoff_trace);
149
150 return start_irqsoff_tracer(irqsoff_trace, set);
151}
152
153static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
154{
155 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data;
157 unsigned long flags;
158 long disabled;
159 int ret;
160 int cpu;
161 int pc;
162
163 cpu = raw_smp_processor_id();
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0;
166
167 local_save_flags(flags);
168 /* slight chance to get a false positive on tracing_cpu */
169 if (!irqs_disabled_flags(flags))
170 return 0;
171
172 data = tr->data[cpu];
173 disabled = atomic_inc_return(&data->disabled);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled);
182 return ret;
183}
184
185static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
186{
187 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data;
189 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc;
193
194 cpu = raw_smp_processor_id();
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return;
197
198 local_save_flags(flags);
199 /* slight chance to get a false positive on tracing_cpu */
200 if (!irqs_disabled_flags(flags))
201 return;
202
203 data = tr->data[cpu];
204 disabled = atomic_inc_return(&data->disabled);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled);
212}
213
214static void irqsoff_trace_open(struct trace_iterator *iter)
215{
216 if (is_graph())
217 graph_trace_open(iter);
218
219}
220
221static void irqsoff_trace_close(struct trace_iterator *iter)
222{
223 if (iter->private)
224 graph_trace_close(iter);
225}
226
227#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
228 TRACE_GRAPH_PRINT_PROC)
229
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /*
240 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler
242 */
243 if (is_graph())
244 return print_graph_function_flags(iter, flags);
245
246 return TRACE_TYPE_UNHANDLED;
247}
248
249static void irqsoff_print_header(struct seq_file *s)
250{
251 if (is_graph()) {
252 struct trace_iterator *iter = s->private;
253 u32 flags = GRAPH_TRACER_FLAGS;
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s);
268}
269
270static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc)
294{
295 if (!is_graph())
296 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301}
302
303#else
304#define __trace_function trace_function
305
306static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
307{
308 return -EINVAL;
309}
310
311static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
312{
313 return -1;
314}
315
316static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
317{
318 return TRACE_TYPE_UNHANDLED;
319}
320
321static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
322static void irqsoff_print_header(struct seq_file *s) { }
323static void irqsoff_trace_open(struct trace_iterator *iter) { }
324static void irqsoff_trace_close(struct trace_iterator *iter) { }
325#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
326
111/* 327/*
112 * Should this new latency be reported/recorded? 328 * Should this new latency be reported/recorded?
113 */ 329 */
@@ -150,7 +366,7 @@ check_critical_timing(struct trace_array *tr,
150 if (!report_latency(delta)) 366 if (!report_latency(delta))
151 goto out_unlock; 367 goto out_unlock;
152 368
153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 369 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
154 /* Skip 5 functions to get to the irq/preempt enable function */ 370 /* Skip 5 functions to get to the irq/preempt enable function */
155 __trace_stack(tr, flags, 5, pc); 371 __trace_stack(tr, flags, 5, pc);
156 372
@@ -172,7 +388,7 @@ out_unlock:
172out: 388out:
173 data->critical_sequence = max_sequence; 389 data->critical_sequence = max_sequence;
174 data->preempt_timestamp = ftrace_now(cpu); 390 data->preempt_timestamp = ftrace_now(cpu);
175 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 391 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
176} 392}
177 393
178static inline void 394static inline void
@@ -204,7 +420,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
204 420
205 local_save_flags(flags); 421 local_save_flags(flags);
206 422
207 trace_function(tr, ip, parent_ip, flags, preempt_count()); 423 __trace_function(tr, ip, parent_ip, flags, preempt_count());
208 424
209 per_cpu(tracing_cpu, cpu) = 1; 425 per_cpu(tracing_cpu, cpu) = 1;
210 426
@@ -238,7 +454,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
238 atomic_inc(&data->disabled); 454 atomic_inc(&data->disabled);
239 455
240 local_save_flags(flags); 456 local_save_flags(flags);
241 trace_function(tr, ip, parent_ip, flags, preempt_count()); 457 __trace_function(tr, ip, parent_ip, flags, preempt_count());
242 check_critical_timing(tr, data, parent_ip ? : ip, cpu); 458 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
243 data->critical_start = 0; 459 data->critical_start = 0;
244 atomic_dec(&data->disabled); 460 atomic_dec(&data->disabled);
@@ -347,19 +563,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
347} 563}
348#endif /* CONFIG_PREEMPT_TRACER */ 564#endif /* CONFIG_PREEMPT_TRACER */
349 565
350static void start_irqsoff_tracer(struct trace_array *tr) 566static int start_irqsoff_tracer(struct trace_array *tr, int graph)
351{ 567{
352 register_ftrace_function(&trace_ops); 568 int ret = 0;
353 if (tracing_is_enabled()) 569
570 if (!graph)
571 ret = register_ftrace_function(&trace_ops);
572 else
573 ret = register_ftrace_graph(&irqsoff_graph_return,
574 &irqsoff_graph_entry);
575
576 if (!ret && tracing_is_enabled())
354 tracer_enabled = 1; 577 tracer_enabled = 1;
355 else 578 else
356 tracer_enabled = 0; 579 tracer_enabled = 0;
580
581 return ret;
357} 582}
358 583
359static void stop_irqsoff_tracer(struct trace_array *tr) 584static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
360{ 585{
361 tracer_enabled = 0; 586 tracer_enabled = 0;
362 unregister_ftrace_function(&trace_ops); 587
588 if (!graph)
589 unregister_ftrace_function(&trace_ops);
590 else
591 unregister_ftrace_graph();
363} 592}
364 593
365static void __irqsoff_tracer_init(struct trace_array *tr) 594static void __irqsoff_tracer_init(struct trace_array *tr)
@@ -372,12 +601,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
372 /* make sure that the tracer is visible */ 601 /* make sure that the tracer is visible */
373 smp_wmb(); 602 smp_wmb();
374 tracing_reset_online_cpus(tr); 603 tracing_reset_online_cpus(tr);
375 start_irqsoff_tracer(tr); 604
605 if (start_irqsoff_tracer(tr, is_graph()))
606 printk(KERN_ERR "failed to start irqsoff tracer\n");
376} 607}
377 608
378static void irqsoff_tracer_reset(struct trace_array *tr) 609static void irqsoff_tracer_reset(struct trace_array *tr)
379{ 610{
380 stop_irqsoff_tracer(tr); 611 stop_irqsoff_tracer(tr, is_graph());
381 612
382 if (!save_lat_flag) 613 if (!save_lat_flag)
383 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 614 trace_flags &= ~TRACE_ITER_LATENCY_FMT;
@@ -409,9 +640,15 @@ static struct tracer irqsoff_tracer __read_mostly =
409 .start = irqsoff_tracer_start, 640 .start = irqsoff_tracer_start,
410 .stop = irqsoff_tracer_stop, 641 .stop = irqsoff_tracer_stop,
411 .print_max = 1, 642 .print_max = 1,
643 .print_header = irqsoff_print_header,
644 .print_line = irqsoff_print_line,
645 .flags = &tracer_flags,
646 .set_flag = irqsoff_set_flag,
412#ifdef CONFIG_FTRACE_SELFTEST 647#ifdef CONFIG_FTRACE_SELFTEST
413 .selftest = trace_selftest_startup_irqsoff, 648 .selftest = trace_selftest_startup_irqsoff,
414#endif 649#endif
650 .open = irqsoff_trace_open,
651 .close = irqsoff_trace_close,
415}; 652};
416# define register_irqsoff(trace) register_tracer(&trace) 653# define register_irqsoff(trace) register_tracer(&trace)
417#else 654#else
@@ -435,9 +672,15 @@ static struct tracer preemptoff_tracer __read_mostly =
435 .start = irqsoff_tracer_start, 672 .start = irqsoff_tracer_start,
436 .stop = irqsoff_tracer_stop, 673 .stop = irqsoff_tracer_stop,
437 .print_max = 1, 674 .print_max = 1,
675 .print_header = irqsoff_print_header,
676 .print_line = irqsoff_print_line,
677 .flags = &tracer_flags,
678 .set_flag = irqsoff_set_flag,
438#ifdef CONFIG_FTRACE_SELFTEST 679#ifdef CONFIG_FTRACE_SELFTEST
439 .selftest = trace_selftest_startup_preemptoff, 680 .selftest = trace_selftest_startup_preemptoff,
440#endif 681#endif
682 .open = irqsoff_trace_open,
683 .close = irqsoff_trace_close,
441}; 684};
442# define register_preemptoff(trace) register_tracer(&trace) 685# define register_preemptoff(trace) register_tracer(&trace)
443#else 686#else
@@ -463,9 +706,15 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
463 .start = irqsoff_tracer_start, 706 .start = irqsoff_tracer_start,
464 .stop = irqsoff_tracer_stop, 707 .stop = irqsoff_tracer_stop,
465 .print_max = 1, 708 .print_max = 1,
709 .print_header = irqsoff_print_header,
710 .print_line = irqsoff_print_line,
711 .flags = &tracer_flags,
712 .set_flag = irqsoff_set_flag,
466#ifdef CONFIG_FTRACE_SELFTEST 713#ifdef CONFIG_FTRACE_SELFTEST
467 .selftest = trace_selftest_startup_preemptirqsoff, 714 .selftest = trace_selftest_startup_preemptirqsoff,
468#endif 715#endif
716 .open = irqsoff_trace_open,
717 .close = irqsoff_trace_close,
469}; 718};
470 719
471# define register_preemptirqsoff(trace) register_tracer(&trace) 720# define register_preemptirqsoff(trace) register_tracer(&trace)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6ea90c0e2c96..f52b5f50299d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -29,6 +29,8 @@
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/ptrace.h> 30#include <linux/ptrace.h>
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h>
33#include <asm/bitsperlong.h>
32 34
33#include "trace.h" 35#include "trace.h"
34#include "trace_output.h" 36#include "trace_output.h"
@@ -40,7 +42,6 @@
40 42
41/* Reserved field names */ 43/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip" 44#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip" 45#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func" 46#define FIELD_STRING_FUNC "__probe_func"
46 47
@@ -52,61 +53,102 @@ const char *reserved_field_names[] = {
52 "common_tgid", 53 "common_tgid",
53 "common_lock_depth", 54 "common_lock_depth",
54 FIELD_STRING_IP, 55 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP, 56 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC, 57 FIELD_STRING_FUNC,
58}; 58};
59 59
60struct fetch_func { 60/* Printing function type */
61 unsigned long (*func)(struct pt_regs *, void *); 61typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *);
62#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
63#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
64
65/* Printing in basic type function template */
66#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
67static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
68 const char *name, void *data)\
69{ \
70 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
71} \
72static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
73
74DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
75DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
76DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
77DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
78DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
79DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
80DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
82
83/* Data fetch function type */
84typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
85
86struct fetch_param {
87 fetch_func_t fn;
62 void *data; 88 void *data;
63}; 89};
64 90
65static __kprobes unsigned long call_fetch(struct fetch_func *f, 91static __kprobes void call_fetch(struct fetch_param *fprm,
66 struct pt_regs *regs) 92 struct pt_regs *regs, void *dest)
67{
68 return f->func(regs, f->data);
69}
70
71/* fetch handlers */
72static __kprobes unsigned long fetch_register(struct pt_regs *regs,
73 void *offset)
74{
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset));
76}
77
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
79 void *num)
80{ 93{
81 return regs_get_kernel_stack_nth(regs, 94 return fprm->fn(regs, fprm->data, dest);
82 (unsigned int)((unsigned long)num));
83} 95}
84 96
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) 97#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type
86{ 98/*
87 unsigned long retval; 99 * Define macro for basic types - we don't need to define s* types, because
88 100 * we have to care only about bitwidth at recording time.
89 if (probe_kernel_address(addr, retval)) 101 */
90 return 0; 102#define DEFINE_BASIC_FETCH_FUNCS(kind) \
91 return retval; 103DEFINE_FETCH_##kind(u8) \
104DEFINE_FETCH_##kind(u16) \
105DEFINE_FETCH_##kind(u32) \
106DEFINE_FETCH_##kind(u64)
107
108#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \
109 ((FETCH_FUNC_NAME(kind, u8) == fn) || \
110 (FETCH_FUNC_NAME(kind, u16) == fn) || \
111 (FETCH_FUNC_NAME(kind, u32) == fn) || \
112 (FETCH_FUNC_NAME(kind, u64) == fn))
113
114/* Data fetch function templates */
115#define DEFINE_FETCH_reg(type) \
116static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
117 void *offset, void *dest) \
118{ \
119 *(type *)dest = (type)regs_get_register(regs, \
120 (unsigned int)((unsigned long)offset)); \
92} 121}
93 122DEFINE_BASIC_FETCH_FUNCS(reg)
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num) 123
95{ 124#define DEFINE_FETCH_stack(type) \
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num)); 125static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
126 void *offset, void *dest) \
127{ \
128 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
129 (unsigned int)((unsigned long)offset)); \
97} 130}
131DEFINE_BASIC_FETCH_FUNCS(stack)
98 132
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, 133#define DEFINE_FETCH_retval(type) \
100 void *dummy) 134static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
101{ 135 void *dummy, void *dest) \
102 return regs_return_value(regs); 136{ \
137 *(type *)dest = (type)regs_return_value(regs); \
103} 138}
104 139DEFINE_BASIC_FETCH_FUNCS(retval)
105static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, 140
106 void *dummy) 141#define DEFINE_FETCH_memory(type) \
107{ 142static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
108 return kernel_stack_pointer(regs); 143 void *addr, void *dest) \
144{ \
145 type retval; \
146 if (probe_kernel_address(addr, retval)) \
147 *(type *)dest = 0; \
148 else \
149 *(type *)dest = retval; \
109} 150}
151DEFINE_BASIC_FETCH_FUNCS(memory)
110 152
111/* Memory fetching by symbol */ 153/* Memory fetching by symbol */
112struct symbol_cache { 154struct symbol_cache {
@@ -150,51 +192,126 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
150 return sc; 192 return sc;
151} 193}
152 194
153static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) 195#define DEFINE_FETCH_symbol(type) \
154{ 196static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
155 struct symbol_cache *sc = data; 197 void *data, void *dest) \
156 198{ \
157 if (sc->addr) 199 struct symbol_cache *sc = data; \
158 return fetch_memory(regs, (void *)sc->addr); 200 if (sc->addr) \
159 else 201 fetch_memory_##type(regs, (void *)sc->addr, dest); \
160 return 0; 202 else \
203 *(type *)dest = 0; \
161} 204}
205DEFINE_BASIC_FETCH_FUNCS(symbol)
162 206
163/* Special indirect memory access interface */ 207/* Dereference memory access function */
164struct indirect_fetch_data { 208struct deref_fetch_param {
165 struct fetch_func orig; 209 struct fetch_param orig;
166 long offset; 210 long offset;
167}; 211};
168 212
169static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) 213#define DEFINE_FETCH_deref(type) \
170{ 214static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
171 struct indirect_fetch_data *ind = data; 215 void *data, void *dest) \
172 unsigned long addr; 216{ \
173 217 struct deref_fetch_param *dprm = data; \
174 addr = call_fetch(&ind->orig, regs); 218 unsigned long addr; \
175 if (addr) { 219 call_fetch(&dprm->orig, regs, &addr); \
176 addr += ind->offset; 220 if (addr) { \
177 return fetch_memory(regs, (void *)addr); 221 addr += dprm->offset; \
178 } else 222 fetch_memory_##type(regs, (void *)addr, dest); \
179 return 0; 223 } else \
224 *(type *)dest = 0; \
180} 225}
226DEFINE_BASIC_FETCH_FUNCS(deref)
181 227
182static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) 228static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
183{ 229{
184 if (data->orig.func == fetch_indirect) 230 if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn))
185 free_indirect_fetch_data(data->orig.data); 231 free_deref_fetch_param(data->orig.data);
186 else if (data->orig.func == fetch_symbol) 232 else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn))
187 free_symbol_cache(data->orig.data); 233 free_symbol_cache(data->orig.data);
188 kfree(data); 234 kfree(data);
189} 235}
190 236
237/* Default (unsigned long) fetch type */
238#define __DEFAULT_FETCH_TYPE(t) u##t
239#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
240#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
241#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
242
243#define ASSIGN_FETCH_FUNC(kind, type) \
244 .kind = FETCH_FUNC_NAME(kind, type)
245
246#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
247 {.name = #ptype, \
248 .size = sizeof(ftype), \
249 .is_signed = sign, \
250 .print = PRINT_TYPE_FUNC_NAME(ptype), \
251 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
252ASSIGN_FETCH_FUNC(reg, ftype), \
253ASSIGN_FETCH_FUNC(stack, ftype), \
254ASSIGN_FETCH_FUNC(retval, ftype), \
255ASSIGN_FETCH_FUNC(memory, ftype), \
256ASSIGN_FETCH_FUNC(symbol, ftype), \
257ASSIGN_FETCH_FUNC(deref, ftype), \
258 }
259
260/* Fetch type information table */
261static const struct fetch_type {
262 const char *name; /* Name of type */
263 size_t size; /* Byte size of type */
264 int is_signed; /* Signed flag */
265 print_type_func_t print; /* Print functions */
 266	const char	*fmt;		/* Format string */
267 /* Fetch functions */
268 fetch_func_t reg;
269 fetch_func_t stack;
270 fetch_func_t retval;
271 fetch_func_t memory;
272 fetch_func_t symbol;
273 fetch_func_t deref;
274} fetch_type_table[] = {
275 ASSIGN_FETCH_TYPE(u8, u8, 0),
276 ASSIGN_FETCH_TYPE(u16, u16, 0),
277 ASSIGN_FETCH_TYPE(u32, u32, 0),
278 ASSIGN_FETCH_TYPE(u64, u64, 0),
279 ASSIGN_FETCH_TYPE(s8, u8, 1),
280 ASSIGN_FETCH_TYPE(s16, u16, 1),
281 ASSIGN_FETCH_TYPE(s32, u32, 1),
282 ASSIGN_FETCH_TYPE(s64, u64, 1),
283};
284
285static const struct fetch_type *find_fetch_type(const char *type)
286{
287 int i;
288
289 if (!type)
290 type = DEFAULT_FETCH_TYPE_STR;
291
292 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
293 if (strcmp(type, fetch_type_table[i].name) == 0)
294 return &fetch_type_table[i];
295 return NULL;
296}
297
298/* Special function : only accept unsigned long */
299static __kprobes void fetch_stack_address(struct pt_regs *regs,
300 void *dummy, void *dest)
301{
302 *(unsigned long *)dest = kernel_stack_pointer(regs);
303}
304
191/** 305/**
192 * Kprobe event core functions 306 * Kprobe event core functions
193 */ 307 */
194 308
195struct probe_arg { 309struct probe_arg {
196 struct fetch_func fetch; 310 struct fetch_param fetch;
197 const char *name; 311 unsigned int offset; /* Offset from argument entry */
312 const char *name; /* Name of this argument */
313 const char *comm; /* Command of this argument */
314 const struct fetch_type *type; /* Type of this argument */
198}; 315};
199 316
200/* Flags for trace_probe */ 317/* Flags for trace_probe */
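
With struct probe_arg now carrying a typed fetch function, a per-entry offset and a fetch_type, recording an argument amounts to letting each fetch function write its type->size bytes at its own offset inside the trace entry. A sketch of that call site (the helper name is hypothetical; the real logic sits in the probe handlers later in this file):

static void store_trace_arg(struct probe_arg *parg, struct pt_regs *regs,
			    void *entry_data)
{
	/* Writes parg->type->size bytes at the argument's precomputed offset */
	call_fetch(&parg->fetch, regs, (u8 *)entry_data + parg->offset);
}
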
@@ -207,8 +324,9 @@ struct trace_probe {
207 unsigned long nhit; 324 unsigned long nhit;
208 unsigned int flags; /* For TP_FLAG_* */ 325 unsigned int flags; /* For TP_FLAG_* */
209 const char *symbol; /* symbol name */ 326 const char *symbol; /* symbol name */
327 struct ftrace_event_class class;
210 struct ftrace_event_call call; 328 struct ftrace_event_call call;
211 struct trace_event event; 329 ssize_t size; /* trace entry size */
212 unsigned int nr_args; 330 unsigned int nr_args;
213 struct probe_arg args[]; 331 struct probe_arg args[];
214}; 332};
@@ -217,6 +335,7 @@ struct trace_probe {
217 (offsetof(struct trace_probe, args) + \ 335 (offsetof(struct trace_probe, args) + \
218 (sizeof(struct probe_arg) * (n))) 336 (sizeof(struct probe_arg) * (n)))
219 337
338
220static __kprobes int probe_is_return(struct trace_probe *tp) 339static __kprobes int probe_is_return(struct trace_probe *tp)
221{ 340{
222 return tp->rp.handler != NULL; 341 return tp->rp.handler != NULL;
@@ -227,51 +346,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp)
227 return tp->symbol ? tp->symbol : "unknown"; 346 return tp->symbol ? tp->symbol : "unknown";
228} 347}
229 348
230static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{
232 int ret = -EINVAL;
233
234 if (ff->func == fetch_argument)
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name);
240 } else if (ff->func == fetch_stack)
241 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
242 else if (ff->func == fetch_memory)
243 ret = snprintf(buf, n, "@0x%p", ff->data);
244 else if (ff->func == fetch_symbol) {
245 struct symbol_cache *sc = ff->data;
246 if (sc->offset)
247 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
248 sc->offset);
249 else
250 ret = snprintf(buf, n, "@%s", sc->symbol);
251 } else if (ff->func == fetch_retvalue)
252 ret = snprintf(buf, n, "$retval");
253 else if (ff->func == fetch_stack_address)
254 ret = snprintf(buf, n, "$stack");
255 else if (ff->func == fetch_indirect) {
256 struct indirect_fetch_data *id = ff->data;
257 size_t l = 0;
258 ret = snprintf(buf, n, "%+ld(", id->offset);
259 if (ret >= n)
260 goto end;
261 l += ret;
262 ret = probe_arg_string(buf + l, n - l, &id->orig);
263 if (ret < 0)
264 goto end;
265 l += ret;
266 ret = snprintf(buf + l, n - l, ")");
267 ret += l;
268 }
269end:
270 if (ret >= n)
271 return -ENOSPC;
272 return ret;
273}
274
275static int register_probe_event(struct trace_probe *tp); 349static int register_probe_event(struct trace_probe *tp);
276static void unregister_probe_event(struct trace_probe *tp); 350static void unregister_probe_event(struct trace_probe *tp);
277 351
@@ -330,6 +404,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
330 goto error; 404 goto error;
331 } 405 }
332 406
407 tp->call.class = &tp->class;
333 tp->call.name = kstrdup(event, GFP_KERNEL); 408 tp->call.name = kstrdup(event, GFP_KERNEL);
334 if (!tp->call.name) 409 if (!tp->call.name)
335 goto error; 410 goto error;
@@ -339,8 +414,8 @@ static struct trace_probe *alloc_trace_probe(const char *group,
339 goto error; 414 goto error;
340 } 415 }
341 416
342 tp->call.system = kstrdup(group, GFP_KERNEL); 417 tp->class.system = kstrdup(group, GFP_KERNEL);
343 if (!tp->call.system) 418 if (!tp->class.system)
344 goto error; 419 goto error;
345 420
346 INIT_LIST_HEAD(&tp->list); 421 INIT_LIST_HEAD(&tp->list);
@@ -354,11 +429,12 @@ error:
354 429
355static void free_probe_arg(struct probe_arg *arg) 430static void free_probe_arg(struct probe_arg *arg)
356{ 431{
357 if (arg->fetch.func == fetch_symbol) 432 if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn))
433 free_deref_fetch_param(arg->fetch.data);
434 else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn))
358 free_symbol_cache(arg->fetch.data); 435 free_symbol_cache(arg->fetch.data);
359 else if (arg->fetch.func == fetch_indirect)
360 free_indirect_fetch_data(arg->fetch.data);
361 kfree(arg->name); 436 kfree(arg->name);
437 kfree(arg->comm);
362} 438}
363 439
364static void free_trace_probe(struct trace_probe *tp) 440static void free_trace_probe(struct trace_probe *tp)
@@ -368,7 +444,7 @@ static void free_trace_probe(struct trace_probe *tp)
368 for (i = 0; i < tp->nr_args; i++) 444 for (i = 0; i < tp->nr_args; i++)
369 free_probe_arg(&tp->args[i]); 445 free_probe_arg(&tp->args[i]);
370 446
371 kfree(tp->call.system); 447 kfree(tp->call.class->system);
372 kfree(tp->call.name); 448 kfree(tp->call.name);
373 kfree(tp->symbol); 449 kfree(tp->symbol);
374 kfree(tp); 450 kfree(tp);
@@ -381,7 +457,7 @@ static struct trace_probe *find_probe_event(const char *event,
381 457
382 list_for_each_entry(tp, &probe_list, list) 458 list_for_each_entry(tp, &probe_list, list)
383 if (strcmp(tp->call.name, event) == 0 && 459 if (strcmp(tp->call.name, event) == 0 &&
384 strcmp(tp->call.system, group) == 0) 460 strcmp(tp->call.class->system, group) == 0)
385 return tp; 461 return tp;
386 return NULL; 462 return NULL;
387} 463}
@@ -406,7 +482,7 @@ static int register_trace_probe(struct trace_probe *tp)
406 mutex_lock(&probe_lock); 482 mutex_lock(&probe_lock);
407 483
408 /* register as an event */ 484 /* register as an event */
409 old_tp = find_probe_event(tp->call.name, tp->call.system); 485 old_tp = find_probe_event(tp->call.name, tp->call.class->system);
410 if (old_tp) { 486 if (old_tp) {
411 /* delete old event */ 487 /* delete old event */
412 unregister_trace_probe(old_tp); 488 unregister_trace_probe(old_tp);
@@ -464,46 +540,41 @@ static int split_symbol_offset(char *symbol, unsigned long *offset)
464#define PARAM_MAX_ARGS 16 540#define PARAM_MAX_ARGS 16
465#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) 541#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
466 542
467static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) 543static int parse_probe_vars(char *arg, const struct fetch_type *t,
544 struct fetch_param *f, int is_return)
468{ 545{
469 int ret = 0; 546 int ret = 0;
470 unsigned long param; 547 unsigned long param;
471 548
472 if (strcmp(arg, "retval") == 0) { 549 if (strcmp(arg, "retval") == 0) {
473 if (is_return) { 550 if (is_return)
474 ff->func = fetch_retvalue; 551 f->fn = t->retval;
475 ff->data = NULL; 552 else
476 } else
477 ret = -EINVAL; 553 ret = -EINVAL;
478 } else if (strncmp(arg, "stack", 5) == 0) { 554 } else if (strncmp(arg, "stack", 5) == 0) {
479 if (arg[5] == '\0') { 555 if (arg[5] == '\0') {
480 ff->func = fetch_stack_address; 556 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
481 ff->data = NULL; 557 f->fn = fetch_stack_address;
558 else
559 ret = -EINVAL;
482 } else if (isdigit(arg[5])) { 560 } else if (isdigit(arg[5])) {
483 ret = strict_strtoul(arg + 5, 10, &param); 561 ret = strict_strtoul(arg + 5, 10, &param);
484 if (ret || param > PARAM_MAX_STACK) 562 if (ret || param > PARAM_MAX_STACK)
485 ret = -EINVAL; 563 ret = -EINVAL;
486 else { 564 else {
487 ff->func = fetch_stack; 565 f->fn = t->stack;
488 ff->data = (void *)param; 566 f->data = (void *)param;
489 } 567 }
490 } else 568 } else
491 ret = -EINVAL; 569 ret = -EINVAL;
492 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
493 ret = strict_strtoul(arg + 3, 10, &param);
494 if (ret || param > PARAM_MAX_ARGS)
495 ret = -EINVAL;
496 else {
497 ff->func = fetch_argument;
498 ff->data = (void *)param;
499 }
500 } else 570 } else
501 ret = -EINVAL; 571 ret = -EINVAL;
502 return ret; 572 return ret;
503} 573}
504 574
505/* Recursive argument parser */ 575/* Recursive argument parser */
506static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 576static int __parse_probe_arg(char *arg, const struct fetch_type *t,
577 struct fetch_param *f, int is_return)
507{ 578{
508 int ret = 0; 579 int ret = 0;
509 unsigned long param; 580 unsigned long param;
@@ -512,13 +583,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
512 583
513 switch (arg[0]) { 584 switch (arg[0]) {
514 case '$': 585 case '$':
515 ret = parse_probe_vars(arg + 1, ff, is_return); 586 ret = parse_probe_vars(arg + 1, t, f, is_return);
516 break; 587 break;
517 case '%': /* named register */ 588 case '%': /* named register */
518 ret = regs_query_register_offset(arg + 1); 589 ret = regs_query_register_offset(arg + 1);
519 if (ret >= 0) { 590 if (ret >= 0) {
520 ff->func = fetch_register; 591 f->fn = t->reg;
521 ff->data = (void *)(unsigned long)ret; 592 f->data = (void *)(unsigned long)ret;
522 ret = 0; 593 ret = 0;
523 } 594 }
524 break; 595 break;
@@ -527,26 +598,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
527 ret = strict_strtoul(arg + 1, 0, &param); 598 ret = strict_strtoul(arg + 1, 0, &param);
528 if (ret) 599 if (ret)
529 break; 600 break;
530 ff->func = fetch_memory; 601 f->fn = t->memory;
531 ff->data = (void *)param; 602 f->data = (void *)param;
532 } else { 603 } else {
533 ret = split_symbol_offset(arg + 1, &offset); 604 ret = split_symbol_offset(arg + 1, &offset);
534 if (ret) 605 if (ret)
535 break; 606 break;
536 ff->data = alloc_symbol_cache(arg + 1, offset); 607 f->data = alloc_symbol_cache(arg + 1, offset);
537 if (ff->data) 608 if (f->data)
538 ff->func = fetch_symbol; 609 f->fn = t->symbol;
539 else
540 ret = -EINVAL;
541 } 610 }
542 break; 611 break;
543 case '+': /* indirect memory */ 612 case '+': /* deref memory */
544 case '-': 613 case '-':
545 tmp = strchr(arg, '('); 614 tmp = strchr(arg, '(');
546 if (!tmp) { 615 if (!tmp)
547 ret = -EINVAL;
548 break; 616 break;
549 }
550 *tmp = '\0'; 617 *tmp = '\0';
551 ret = strict_strtol(arg + 1, 0, &offset); 618 ret = strict_strtol(arg + 1, 0, &offset);
552 if (ret) 619 if (ret)
@@ -556,38 +623,58 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
556 arg = tmp + 1; 623 arg = tmp + 1;
557 tmp = strrchr(arg, ')'); 624 tmp = strrchr(arg, ')');
558 if (tmp) { 625 if (tmp) {
559 struct indirect_fetch_data *id; 626 struct deref_fetch_param *dprm;
627 const struct fetch_type *t2 = find_fetch_type(NULL);
560 *tmp = '\0'; 628 *tmp = '\0';
561 id = kzalloc(sizeof(struct indirect_fetch_data), 629 dprm = kzalloc(sizeof(struct deref_fetch_param),
562 GFP_KERNEL); 630 GFP_KERNEL);
563 if (!id) 631 if (!dprm)
564 return -ENOMEM; 632 return -ENOMEM;
565 id->offset = offset; 633 dprm->offset = offset;
566 ret = __parse_probe_arg(arg, &id->orig, is_return); 634 ret = __parse_probe_arg(arg, t2, &dprm->orig,
635 is_return);
567 if (ret) 636 if (ret)
568 kfree(id); 637 kfree(dprm);
569 else { 638 else {
570 ff->func = fetch_indirect; 639 f->fn = t->deref;
571 ff->data = (void *)id; 640 f->data = (void *)dprm;
572 } 641 }
573 } else 642 }
574 ret = -EINVAL;
575 break; 643 break;
576 default:
577 /* TODO: support custom handler */
578 ret = -EINVAL;
579 } 644 }
645 if (!ret && !f->fn)
646 ret = -EINVAL;
580 return ret; 647 return ret;
581} 648}
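
The deref case above composes one fetch inside another: a deref_fetch_param records an offset plus a nested fetch_param, so a +|-offs(ARG) argument resolves ARG first, adds the offset, and reads memory at the result. What follows is a minimal userspace sketch of that composition pattern, not part of the patch; the fake register file, the indexing in fetch_register() and main() are invented for the illustration.

#include <stdio.h>

/* A fetch is a function pointer plus opaque data, as in struct fetch_param. */
struct fetch_param {
	unsigned long (*fn)(void *regs, void *data);
	void *data;
};

/* Pretend register file: data selects an index into an array. */
static unsigned long fetch_register(void *regs, void *data)
{
	unsigned long *r = regs;

	return r[(unsigned long)data];
}

/* Dereference: run the nested fetch, add the offset, read memory there. */
struct deref_fetch_param {
	struct fetch_param orig;
	long offset;
};

static unsigned long fetch_deref(void *regs, void *data)
{
	struct deref_fetch_param *dprm = data;
	unsigned long addr = dprm->orig.fn(regs, dprm->orig.data);

	return *(unsigned long *)(addr + dprm->offset);
}

int main(void)
{
	unsigned long mem[4] = { 111, 222, 333, 444 };
	unsigned long regs[1] = { (unsigned long)mem };
	struct deref_fetch_param dprm = {
		.orig = { .fn = fetch_register, .data = (void *)0UL },
		.offset = 2 * sizeof(unsigned long),	/* like +16(reg 0) */
	};
	struct fetch_param f = { .fn = fetch_deref, .data = &dprm };

	/* Prints 333: register 0 holds &mem[0], the deref adds two words. */
	printf("%lu\n", f.fn(regs, f.data));
	return 0;
}
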
582 649
583/* String length checking wrapper */ 650/* String length checking wrapper */
584static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 651static int parse_probe_arg(char *arg, struct trace_probe *tp,
652 struct probe_arg *parg, int is_return)
585{ 653{
654 const char *t;
655
586 if (strlen(arg) > MAX_ARGSTR_LEN) { 656 if (strlen(arg) > MAX_ARGSTR_LEN) {
587 pr_info("Argument is too long.: %s\n", arg); 657 pr_info("Argument is too long.: %s\n", arg);
588 return -ENOSPC; 658 return -ENOSPC;
589 } 659 }
590 return __parse_probe_arg(arg, ff, is_return); 660 parg->comm = kstrdup(arg, GFP_KERNEL);
661 if (!parg->comm) {
662 pr_info("Failed to allocate memory for command '%s'.\n", arg);
663 return -ENOMEM;
664 }
665 t = strchr(parg->comm, ':');
666 if (t) {
667 arg[t - parg->comm] = '\0';
668 t++;
669 }
670 parg->type = find_fetch_type(t);
671 if (!parg->type) {
672 pr_info("Unsupported type: %s\n", t);
673 return -EINVAL;
674 }
675 parg->offset = tp->size;
676 tp->size += parg->type->size;
677 return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
591} 678}
592 679
593/* Return 1 if name is reserved or already used by another argument */ 680/* Return 1 if name is reserved or already used by another argument */
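
parse_probe_arg() now duplicates the raw argument into parg->comm, splits an optional ":TYPE" suffix at the colon, and resolves the type through find_fetch_type(), which supplies the size, signedness and print handler used everywhere below. Here is a standalone sketch of that split-and-lookup step, assuming a small table of integer types; the table contents and the default entry are illustrative rather than the kernel's exact list.

#include <stdio.h>
#include <string.h>

/* Per-type descriptor, loosely modelled on struct fetch_type. */
struct fetch_type {
	const char *name;
	size_t size;
	int is_signed;
};

static const struct fetch_type fetch_types[] = {
	{ "u8",  1, 0 }, { "u16", 2, 0 }, { "u32", 4, 0 }, { "u64", 8, 0 },
	{ "s8",  1, 1 }, { "s16", 2, 1 }, { "s32", 4, 1 }, { "s64", 8, 1 },
};

/* NULL (no ":TYPE" given) falls back to a default entry. */
static const struct fetch_type *find_fetch_type(const char *type)
{
	size_t i;

	if (!type)
		return &fetch_types[3];
	for (i = 0; i < sizeof(fetch_types) / sizeof(fetch_types[0]); i++)
		if (!strcmp(type, fetch_types[i].name))
			return &fetch_types[i];
	return NULL;
}

int main(void)
{
	char arg[] = "%ax:s32";
	char *type = strchr(arg, ':');
	const struct fetch_type *t;

	if (type)
		*type++ = '\0';		/* arg = "%ax", type = "s32" */

	t = find_fetch_type(type);
	if (!t)
		return 1;
	printf("arg=%s type=%s size=%zu signed=%d\n",
	       arg, t->name, t->size, t->is_signed);
	return 0;
}
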
@@ -611,22 +698,24 @@ static int create_trace_probe(int argc, char **argv)
611 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] 698 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
612 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] 699 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
613 * Fetch args: 700 * Fetch args:
614 * $argN : fetch Nth of function argument. (N:0-)
615 * $retval : fetch return value 701 * $retval : fetch return value
616 * $stack : fetch stack address 702 * $stack : fetch stack address
617 * $stackN : fetch Nth of stack (N:0-) 703 * $stackN : fetch Nth of stack (N:0-)
618 * @ADDR : fetch memory at ADDR (ADDR should be in kernel) 704 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
619 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) 705 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
620 * %REG : fetch register REG 706 * %REG : fetch register REG
621 * Indirect memory fetch: 707 * Dereferencing memory fetch:
622 * +|-offs(ARG) : fetch memory at ARG +|- offs address. 708 * +|-offs(ARG) : fetch memory at ARG +|- offs address.
623 * Alias name of args: 709 * Alias name of args:
624 * NAME=FETCHARG : set NAME as alias of FETCHARG. 710 * NAME=FETCHARG : set NAME as alias of FETCHARG.
711 * Type of args:
712 * FETCHARG:TYPE : use TYPE instead of unsigned long.
625 */ 713 */
626 struct trace_probe *tp; 714 struct trace_probe *tp;
627 int i, ret = 0; 715 int i, ret = 0;
628 int is_return = 0, is_delete = 0; 716 int is_return = 0, is_delete = 0;
629 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; 717 char *symbol = NULL, *event = NULL, *group = NULL;
718 char *arg, *tmp;
630 unsigned long offset = 0; 719 unsigned long offset = 0;
631 void *addr = NULL; 720 void *addr = NULL;
632 char buf[MAX_EVENT_NAME_LEN]; 721 char buf[MAX_EVENT_NAME_LEN];
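
With the type suffix wired in, a probe definition such as p:myprobe do_sys_open dfd=%ax:s32 can be appended to the kprobe_events control file. Below is a small userspace sketch of doing that from C; the debugfs mount point, the symbol and the register choice are examples only and depend on the running kernel and architecture.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Assumes debugfs is mounted at /sys/kernel/debug. */
	const char *ctrl = "/sys/kernel/debug/tracing/kprobe_events";
	/* p:<event> <symbol> <name>=<fetcharg>:<type>, all illustrative. */
	const char *spec = "p:myprobe do_sys_open dfd=%ax:s32\n";
	int fd = open(ctrl, O_WRONLY | O_APPEND);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, spec, strlen(spec)) < 0)
		perror("write");
	close(fd);
	return 0;
}

Removing the probe works through the same file by writing "-:myprobe", the form the reworked selftest below uses for its test probes.
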
@@ -651,12 +740,12 @@ static int create_trace_probe(int argc, char **argv)
651 event = strchr(group, '/') + 1; 740 event = strchr(group, '/') + 1;
652 event[-1] = '\0'; 741 event[-1] = '\0';
653 if (strlen(group) == 0) { 742 if (strlen(group) == 0) {
654 pr_info("Group name is not specifiled\n"); 743 pr_info("Group name is not specified\n");
655 return -EINVAL; 744 return -EINVAL;
656 } 745 }
657 } 746 }
658 if (strlen(event) == 0) { 747 if (strlen(event) == 0) {
659 pr_info("Event name is not specifiled\n"); 748 pr_info("Event name is not specified\n");
660 return -EINVAL; 749 return -EINVAL;
661 } 750 }
662 } 751 }
@@ -689,7 +778,7 @@ static int create_trace_probe(int argc, char **argv)
689 return -EINVAL; 778 return -EINVAL;
690 } 779 }
691 /* an address specified */ 780 /* an address specified */
692 ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); 781 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
693 if (ret) { 782 if (ret) {
694 pr_info("Failed to parse address.\n"); 783 pr_info("Failed to parse address.\n");
695 return ret; 784 return ret;
@@ -739,13 +828,6 @@ static int create_trace_probe(int argc, char **argv)
739 else 828 else
740 arg = argv[i]; 829 arg = argv[i];
741 830
742 if (conflict_field_name(argv[i], tp->args, i)) {
743 pr_info("Argument%d name '%s' conflicts with "
744 "another field.\n", i, argv[i]);
745 ret = -EINVAL;
746 goto error;
747 }
748
749 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); 831 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
750 if (!tp->args[i].name) { 832 if (!tp->args[i].name) {
751 pr_info("Failed to allocate argument%d name '%s'.\n", 833 pr_info("Failed to allocate argument%d name '%s'.\n",
@@ -753,9 +835,19 @@ static int create_trace_probe(int argc, char **argv)
753 ret = -ENOMEM; 835 ret = -ENOMEM;
754 goto error; 836 goto error;
755 } 837 }
838 tmp = strchr(tp->args[i].name, ':');
839 if (tmp)
840 *tmp = '_'; /* convert : to _ */
841
842 if (conflict_field_name(tp->args[i].name, tp->args, i)) {
843 pr_info("Argument%d name '%s' conflicts with "
844 "another field.\n", i, argv[i]);
845 ret = -EINVAL;
846 goto error;
847 }
756 848
757 /* Parse fetch argument */ 849 /* Parse fetch argument */
758 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); 850 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
759 if (ret) { 851 if (ret) {
760 pr_info("Parse error at argument%d. (%d)\n", i, ret); 852 pr_info("Parse error at argument%d. (%d)\n", i, ret);
761 kfree(tp->args[i].name); 853 kfree(tp->args[i].name);
@@ -810,11 +902,10 @@ static void probes_seq_stop(struct seq_file *m, void *v)
810static int probes_seq_show(struct seq_file *m, void *v) 902static int probes_seq_show(struct seq_file *m, void *v)
811{ 903{
812 struct trace_probe *tp = v; 904 struct trace_probe *tp = v;
813 int i, ret; 905 int i;
814 char buf[MAX_ARGSTR_LEN + 1];
815 906
816 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 907 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
817 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); 908 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
818 909
819 if (!tp->symbol) 910 if (!tp->symbol)
820 seq_printf(m, " 0x%p", tp->rp.kp.addr); 911 seq_printf(m, " 0x%p", tp->rp.kp.addr);
@@ -823,15 +914,10 @@ static int probes_seq_show(struct seq_file *m, void *v)
823 else 914 else
824 seq_printf(m, " %s", probe_symbol(tp)); 915 seq_printf(m, " %s", probe_symbol(tp));
825 916
826 for (i = 0; i < tp->nr_args; i++) { 917 for (i = 0; i < tp->nr_args; i++)
827 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); 918 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
828 if (ret < 0) {
829 pr_warning("Argument%d decoding error(%d).\n", i, ret);
830 return ret;
831 }
832 seq_printf(m, " %s=%s", tp->args[i].name, buf);
833 }
834 seq_printf(m, "\n"); 919 seq_printf(m, "\n");
920
835 return 0; 921 return 0;
836} 922}
837 923
@@ -958,12 +1044,13 @@ static const struct file_operations kprobe_profile_ops = {
958}; 1044};
959 1045
960/* Kprobe handler */ 1046/* Kprobe handler */
961static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 1047static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
962{ 1048{
963 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1049 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
964 struct kprobe_trace_entry *entry; 1050 struct kprobe_trace_entry_head *entry;
965 struct ring_buffer_event *event; 1051 struct ring_buffer_event *event;
966 struct ring_buffer *buffer; 1052 struct ring_buffer *buffer;
1053 u8 *data;
967 int size, i, pc; 1054 int size, i, pc;
968 unsigned long irq_flags; 1055 unsigned long irq_flags;
969 struct ftrace_event_call *call = &tp->call; 1056 struct ftrace_event_call *call = &tp->call;
@@ -973,32 +1060,32 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
973 local_save_flags(irq_flags); 1060 local_save_flags(irq_flags);
974 pc = preempt_count(); 1061 pc = preempt_count();
975 1062
976 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1063 size = sizeof(*entry) + tp->size;
977 1064
978 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1065 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
979 irq_flags, pc); 1066 size, irq_flags, pc);
980 if (!event) 1067 if (!event)
981 return 0; 1068 return;
982 1069
983 entry = ring_buffer_event_data(event); 1070 entry = ring_buffer_event_data(event);
984 entry->nargs = tp->nr_args;
985 entry->ip = (unsigned long)kp->addr; 1071 entry->ip = (unsigned long)kp->addr;
1072 data = (u8 *)&entry[1];
986 for (i = 0; i < tp->nr_args; i++) 1073 for (i = 0; i < tp->nr_args; i++)
987 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1074 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
988 1075
989 if (!filter_current_check_discard(buffer, call, entry, event)) 1076 if (!filter_current_check_discard(buffer, call, entry, event))
990 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1077 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
991 return 0;
992} 1078}
993 1079
994/* Kretprobe handler */ 1080/* Kretprobe handler */
995static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, 1081static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
996 struct pt_regs *regs) 1082 struct pt_regs *regs)
997{ 1083{
998 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1084 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
999 struct kretprobe_trace_entry *entry; 1085 struct kretprobe_trace_entry_head *entry;
1000 struct ring_buffer_event *event; 1086 struct ring_buffer_event *event;
1001 struct ring_buffer *buffer; 1087 struct ring_buffer *buffer;
1088 u8 *data;
1002 int size, i, pc; 1089 int size, i, pc;
1003 unsigned long irq_flags; 1090 unsigned long irq_flags;
1004 struct ftrace_event_call *call = &tp->call; 1091 struct ftrace_event_call *call = &tp->call;
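
kprobe_trace_func() above now reserves sizeof(*entry) + tp->size bytes and lets each fetch callback store its own width at the offset assigned during parsing, instead of filling a fixed array of unsigned long. The same header-plus-payload layout in plain C; struct entry_head and the two arguments here are invented for the sketch.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Fixed header followed by a variable payload, as in *_trace_entry_head. */
struct entry_head {
	unsigned long ip;
};

struct arg_desc {
	const char *name;
	size_t size;	/* parg->type->size */
	size_t offset;	/* parg->offset, assigned cumulatively at parse time */
};

int main(void)
{
	struct arg_desc args[] = {
		{ "dfd",   sizeof(int32_t),  0 },
		{ "flags", sizeof(uint64_t), sizeof(int32_t) },
	};
	size_t payload = sizeof(int32_t) + sizeof(uint64_t);	/* tp->size */
	struct entry_head *entry = malloc(sizeof(*entry) + payload);
	int32_t dfd = -100, dfd_out;
	uint64_t flags = 0x8000, flags_out;
	uint8_t *data;

	if (!entry)
		return 1;
	entry->ip = 0xc0ffee;
	data = (uint8_t *)&entry[1];	/* payload starts right after the header */

	/* Each "fetch" writes its own width at its recorded offset. */
	memcpy(data + args[0].offset, &dfd, args[0].size);
	memcpy(data + args[1].offset, &flags, args[1].size);

	memcpy(&dfd_out, data + args[0].offset, args[0].size);
	memcpy(&flags_out, data + args[1].offset, args[1].size);
	printf("ip=%lx %s=%d %s=%llu\n", entry->ip, args[0].name, dfd_out,
	       args[1].name, (unsigned long long)flags_out);
	free(entry);
	return 0;
}
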
@@ -1006,39 +1093,37 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
1006 local_save_flags(irq_flags); 1093 local_save_flags(irq_flags);
1007 pc = preempt_count(); 1094 pc = preempt_count();
1008 1095
1009 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1096 size = sizeof(*entry) + tp->size;
1010 1097
1011 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1098 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1012 irq_flags, pc); 1099 size, irq_flags, pc);
1013 if (!event) 1100 if (!event)
1014 return 0; 1101 return;
1015 1102
1016 entry = ring_buffer_event_data(event); 1103 entry = ring_buffer_event_data(event);
1017 entry->nargs = tp->nr_args;
1018 entry->func = (unsigned long)tp->rp.kp.addr; 1104 entry->func = (unsigned long)tp->rp.kp.addr;
1019 entry->ret_ip = (unsigned long)ri->ret_addr; 1105 entry->ret_ip = (unsigned long)ri->ret_addr;
1106 data = (u8 *)&entry[1];
1020 for (i = 0; i < tp->nr_args; i++) 1107 for (i = 0; i < tp->nr_args; i++)
1021 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1108 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1022 1109
1023 if (!filter_current_check_discard(buffer, call, entry, event)) 1110 if (!filter_current_check_discard(buffer, call, entry, event))
1024 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1111 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
1025
1026 return 0;
1027} 1112}
1028 1113
1029/* Event entry printers */ 1114/* Event entry printers */
1030enum print_line_t 1115enum print_line_t
1031print_kprobe_event(struct trace_iterator *iter, int flags) 1116print_kprobe_event(struct trace_iterator *iter, int flags,
1117 struct trace_event *event)
1032{ 1118{
1033 struct kprobe_trace_entry *field; 1119 struct kprobe_trace_entry_head *field;
1034 struct trace_seq *s = &iter->seq; 1120 struct trace_seq *s = &iter->seq;
1035 struct trace_event *event;
1036 struct trace_probe *tp; 1121 struct trace_probe *tp;
1122 u8 *data;
1037 int i; 1123 int i;
1038 1124
1039 field = (struct kprobe_trace_entry *)iter->ent; 1125 field = (struct kprobe_trace_entry_head *)iter->ent;
1040 event = ftrace_find_event(field->ent.type); 1126 tp = container_of(event, struct trace_probe, call.event);
1041 tp = container_of(event, struct trace_probe, event);
1042 1127
1043 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1128 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1044 goto partial; 1129 goto partial;
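
Because struct trace_event is now embedded in the probe as call.event, print_kprobe_event() recovers the enclosing trace_probe with container_of() instead of calling ftrace_find_event(). A self-contained illustration of container_of() follows, with the macro reimplemented locally; in the kernel it comes from linux/kernel.h, and the structures below are simplified stand-ins.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct trace_event { int type; };

struct trace_probe {
	const char *name;
	struct trace_event event;	/* embedded, like call.event */
};

static void print_event(struct trace_event *ev)
{
	/* Walk back from the embedded member to the enclosing probe. */
	struct trace_probe *tp = container_of(ev, struct trace_probe, event);

	printf("probe %s (type %d)\n", tp->name, ev->type);
}

int main(void)
{
	struct trace_probe tp = { .name = "myprobe", .event = { .type = 42 } };

	print_event(&tp.event);
	return 0;
}
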
@@ -1049,9 +1134,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags)
1049 if (!trace_seq_puts(s, ")")) 1134 if (!trace_seq_puts(s, ")"))
1050 goto partial; 1135 goto partial;
1051 1136
1052 for (i = 0; i < field->nargs; i++) 1137 data = (u8 *)&field[1];
1053 if (!trace_seq_printf(s, " %s=%lx", 1138 for (i = 0; i < tp->nr_args; i++)
1054 tp->args[i].name, field->args[i])) 1139 if (!tp->args[i].type->print(s, tp->args[i].name,
1140 data + tp->args[i].offset))
1055 goto partial; 1141 goto partial;
1056 1142
1057 if (!trace_seq_puts(s, "\n")) 1143 if (!trace_seq_puts(s, "\n"))
@@ -1063,17 +1149,17 @@ partial:
1063} 1149}
1064 1150
1065enum print_line_t 1151enum print_line_t
1066print_kretprobe_event(struct trace_iterator *iter, int flags) 1152print_kretprobe_event(struct trace_iterator *iter, int flags,
1153 struct trace_event *event)
1067{ 1154{
1068 struct kretprobe_trace_entry *field; 1155 struct kretprobe_trace_entry_head *field;
1069 struct trace_seq *s = &iter->seq; 1156 struct trace_seq *s = &iter->seq;
1070 struct trace_event *event;
1071 struct trace_probe *tp; 1157 struct trace_probe *tp;
1158 u8 *data;
1072 int i; 1159 int i;
1073 1160
1074 field = (struct kretprobe_trace_entry *)iter->ent; 1161 field = (struct kretprobe_trace_entry_head *)iter->ent;
1075 event = ftrace_find_event(field->ent.type); 1162 tp = container_of(event, struct trace_probe, call.event);
1076 tp = container_of(event, struct trace_probe, event);
1077 1163
1078 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1164 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1079 goto partial; 1165 goto partial;
@@ -1090,9 +1176,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags)
1090 if (!trace_seq_puts(s, ")")) 1176 if (!trace_seq_puts(s, ")"))
1091 goto partial; 1177 goto partial;
1092 1178
1093 for (i = 0; i < field->nargs; i++) 1179 data = (u8 *)&field[1];
1094 if (!trace_seq_printf(s, " %s=%lx", 1180 for (i = 0; i < tp->nr_args; i++)
1095 tp->args[i].name, field->args[i])) 1181 if (!tp->args[i].type->print(s, tp->args[i].name,
1182 data + tp->args[i].offset))
1096 goto partial; 1183 goto partial;
1097 1184
1098 if (!trace_seq_puts(s, "\n")) 1185 if (!trace_seq_puts(s, "\n"))
@@ -1129,8 +1216,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
1129 1216
1130static int probe_event_raw_init(struct ftrace_event_call *event_call) 1217static int probe_event_raw_init(struct ftrace_event_call *event_call)
1131{ 1218{
1132 INIT_LIST_HEAD(&event_call->fields);
1133
1134 return 0; 1219 return 0;
1135} 1220}
1136 1221
@@ -1148,242 +1233,170 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call)
1148static int kprobe_event_define_fields(struct ftrace_event_call *event_call) 1233static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1149{ 1234{
1150 int ret, i; 1235 int ret, i;
1151 struct kprobe_trace_entry field; 1236 struct kprobe_trace_entry_head field;
1152 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1237 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1153 1238
1154 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1239 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1155 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1156 /* Set argument names as fields */ 1240 /* Set argument names as fields */
1157 for (i = 0; i < tp->nr_args; i++) 1241 for (i = 0; i < tp->nr_args; i++) {
1158 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); 1242 ret = trace_define_field(event_call, tp->args[i].type->name,
1243 tp->args[i].name,
1244 sizeof(field) + tp->args[i].offset,
1245 tp->args[i].type->size,
1246 tp->args[i].type->is_signed,
1247 FILTER_OTHER);
1248 if (ret)
1249 return ret;
1250 }
1159 return 0; 1251 return 0;
1160} 1252}
1161 1253
1162static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) 1254static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1163{ 1255{
1164 int ret, i; 1256 int ret, i;
1165 struct kretprobe_trace_entry field; 1257 struct kretprobe_trace_entry_head field;
1166 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1258 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1167 1259
1168 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); 1260 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1169 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1261 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1170 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1171 /* Set argument names as fields */ 1262 /* Set argument names as fields */
1172 for (i = 0; i < tp->nr_args; i++) 1263 for (i = 0; i < tp->nr_args; i++) {
1173 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); 1264 ret = trace_define_field(event_call, tp->args[i].type->name,
1265 tp->args[i].name,
1266 sizeof(field) + tp->args[i].offset,
1267 tp->args[i].type->size,
1268 tp->args[i].type->is_signed,
1269 FILTER_OTHER);
1270 if (ret)
1271 return ret;
1272 }
1174 return 0; 1273 return 0;
1175} 1274}
1176 1275
1177static int __probe_event_show_format(struct trace_seq *s, 1276static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1178 struct trace_probe *tp, const char *fmt,
1179 const char *arg)
1180{ 1277{
1181 int i; 1278 int i;
1279 int pos = 0;
1182 1280
1183 /* Show format */ 1281 const char *fmt, *arg;
1184 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1185 return 0;
1186 1282
1187 for (i = 0; i < tp->nr_args; i++) 1283 if (!probe_is_return(tp)) {
1188 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) 1284 fmt = "(%lx)";
1189 return 0; 1285 arg = "REC->" FIELD_STRING_IP;
1286 } else {
1287 fmt = "(%lx <- %lx)";
1288 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
1289 }
1190 1290
1191 if (!trace_seq_printf(s, "\", %s", arg)) 1291 /* When len=0, we just calculate the needed length */
1192 return 0; 1292#define LEN_OR_ZERO (len ? len - pos : 0)
1193 1293
1194 for (i = 0; i < tp->nr_args; i++) 1294 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1195 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1196 return 0;
1197 1295
1198 return trace_seq_puts(s, "\n"); 1296 for (i = 0; i < tp->nr_args; i++) {
1199} 1297 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
1298 tp->args[i].name, tp->args[i].type->fmt);
1299 }
1200 1300
1201#undef SHOW_FIELD 1301 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1202#define SHOW_FIELD(type, item, name) \
1203 do { \
1204 ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \
1205 "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\
1206 (unsigned int)offsetof(typeof(field), item),\
1207 (unsigned int)sizeof(type), \
1208 is_signed_type(type)); \
1209 if (!ret) \
1210 return 0; \
1211 } while (0)
1212 1302
1213static int kprobe_event_show_format(struct ftrace_event_call *call, 1303 for (i = 0; i < tp->nr_args; i++) {
1214 struct trace_seq *s) 1304 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1215{ 1305 tp->args[i].name);
1216 struct kprobe_trace_entry field __attribute__((unused)); 1306 }
1217 int ret, i;
1218 struct trace_probe *tp = (struct trace_probe *)call->data;
1219 1307
1220 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); 1308#undef LEN_OR_ZERO
1221 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1222 1309
1223 /* Show fields */ 1310 /* return the length of print_fmt */
1224 for (i = 0; i < tp->nr_args; i++) 1311 return pos;
1225 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1226 trace_seq_puts(s, "\n");
1227
1228 return __probe_event_show_format(s, tp, "(%lx)",
1229 "REC->" FIELD_STRING_IP);
1230} 1312}
1231 1313
1232static int kretprobe_event_show_format(struct ftrace_event_call *call, 1314static int set_print_fmt(struct trace_probe *tp)
1233 struct trace_seq *s)
1234{ 1315{
1235 struct kretprobe_trace_entry field __attribute__((unused)); 1316 int len;
1236 int ret, i; 1317 char *print_fmt;
1237 struct trace_probe *tp = (struct trace_probe *)call->data;
1238 1318
1239 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); 1319 /* First: called with 0 length to calculate the needed length */
1240 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); 1320 len = __set_print_fmt(tp, NULL, 0);
1241 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); 1321 print_fmt = kmalloc(len + 1, GFP_KERNEL);
1322 if (!print_fmt)
1323 return -ENOMEM;
1242 1324
1243 /* Show fields */ 1325 /* Second: actually write the @print_fmt */
1244 for (i = 0; i < tp->nr_args; i++) 1326 __set_print_fmt(tp, print_fmt, len + 1);
1245 SHOW_FIELD(unsigned long, args[i], tp->args[i].name); 1327 tp->call.print_fmt = print_fmt;
1246 trace_seq_puts(s, "\n");
1247 1328
1248 return __probe_event_show_format(s, tp, "(%lx <- %lx)", 1329 return 0;
1249 "REC->" FIELD_STRING_FUNC
1250 ", REC->" FIELD_STRING_RETIP);
1251} 1330}
1252 1331
1253#ifdef CONFIG_EVENT_PROFILE 1332#ifdef CONFIG_PERF_EVENTS
1254 1333
1255/* Kprobe profile handler */ 1334/* Kprobe profile handler */
1256static __kprobes int kprobe_profile_func(struct kprobe *kp, 1335static __kprobes void kprobe_perf_func(struct kprobe *kp,
1257 struct pt_regs *regs) 1336 struct pt_regs *regs)
1258{ 1337{
1259 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1338 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1260 struct ftrace_event_call *call = &tp->call; 1339 struct ftrace_event_call *call = &tp->call;
1261 struct kprobe_trace_entry *entry; 1340 struct kprobe_trace_entry_head *entry;
1262 struct trace_entry *ent; 1341 struct hlist_head *head;
1263 int size, __size, i, pc, __cpu; 1342 u8 *data;
1264 unsigned long irq_flags; 1343 int size, __size, i;
1265 char *trace_buf;
1266 char *raw_data;
1267 int rctx; 1344 int rctx;
1268 1345
1269 pc = preempt_count(); 1346 __size = sizeof(*entry) + tp->size;
1270 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1271 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1347 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1272 size -= sizeof(u32); 1348 size -= sizeof(u32);
1273 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1349 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1274 "profile buffer not large enough")) 1350 "profile buffer not large enough"))
1275 return 0; 1351 return;
1276
1277 /*
1278 * Protect the non nmi buffer
1279 * This also protects the rcu read side
1280 */
1281 local_irq_save(irq_flags);
1282
1283 rctx = perf_swevent_get_recursion_context();
1284 if (rctx < 0)
1285 goto end_recursion;
1286 1352
1287 __cpu = smp_processor_id(); 1353 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1288 1354 if (!entry)
1289 if (in_nmi()) 1355 return;
1290 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1291 else
1292 trace_buf = rcu_dereference(perf_trace_buf);
1293
1294 if (!trace_buf)
1295 goto end;
1296
1297 raw_data = per_cpu_ptr(trace_buf, __cpu);
1298
1299 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1300 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1301 entry = (struct kprobe_trace_entry *)raw_data;
1302 ent = &entry->ent;
1303 1356
1304 tracing_generic_entry_update(ent, irq_flags, pc);
1305 ent->type = call->id;
1306 entry->nargs = tp->nr_args;
1307 entry->ip = (unsigned long)kp->addr; 1357 entry->ip = (unsigned long)kp->addr;
1358 data = (u8 *)&entry[1];
1308 for (i = 0; i < tp->nr_args; i++) 1359 for (i = 0; i < tp->nr_args; i++)
1309 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1360 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1310 perf_tp_event(call->id, entry->ip, 1, entry, size);
1311
1312end:
1313 perf_swevent_put_recursion_context(rctx);
1314end_recursion:
1315 local_irq_restore(irq_flags);
1316 1361
1317 return 0; 1362 head = this_cpu_ptr(call->perf_events);
1363 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
1318} 1364}
1319 1365
1320/* Kretprobe profile handler */ 1366/* Kretprobe profile handler */
1321static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, 1367static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1322 struct pt_regs *regs) 1368 struct pt_regs *regs)
1323{ 1369{
1324 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1370 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1325 struct ftrace_event_call *call = &tp->call; 1371 struct ftrace_event_call *call = &tp->call;
1326 struct kretprobe_trace_entry *entry; 1372 struct kretprobe_trace_entry_head *entry;
1327 struct trace_entry *ent; 1373 struct hlist_head *head;
1328 int size, __size, i, pc, __cpu; 1374 u8 *data;
1329 unsigned long irq_flags; 1375 int size, __size, i;
1330 char *trace_buf;
1331 char *raw_data;
1332 int rctx; 1376 int rctx;
1333 1377
1334 pc = preempt_count(); 1378 __size = sizeof(*entry) + tp->size;
1335 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1336 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1379 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1337 size -= sizeof(u32); 1380 size -= sizeof(u32);
1338 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1381 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1339 "profile buffer not large enough")) 1382 "profile buffer not large enough"))
1340 return 0; 1383 return;
1341
1342 /*
1343 * Protect the non nmi buffer
1344 * This also protects the rcu read side
1345 */
1346 local_irq_save(irq_flags);
1347 1384
1348 rctx = perf_swevent_get_recursion_context(); 1385 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1349 if (rctx < 0) 1386 if (!entry)
1350 goto end_recursion; 1387 return;
1351
1352 __cpu = smp_processor_id();
1353
1354 if (in_nmi())
1355 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1356 else
1357 trace_buf = rcu_dereference(perf_trace_buf);
1358
1359 if (!trace_buf)
1360 goto end;
1361
1362 raw_data = per_cpu_ptr(trace_buf, __cpu);
1363
1364 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1365 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1366 entry = (struct kretprobe_trace_entry *)raw_data;
1367 ent = &entry->ent;
1368 1388
1369 tracing_generic_entry_update(ent, irq_flags, pc);
1370 ent->type = call->id;
1371 entry->nargs = tp->nr_args;
1372 entry->func = (unsigned long)tp->rp.kp.addr; 1389 entry->func = (unsigned long)tp->rp.kp.addr;
1373 entry->ret_ip = (unsigned long)ri->ret_addr; 1390 entry->ret_ip = (unsigned long)ri->ret_addr;
1391 data = (u8 *)&entry[1];
1374 for (i = 0; i < tp->nr_args; i++) 1392 for (i = 0; i < tp->nr_args; i++)
1375 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1393 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1376 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1377
1378end:
1379 perf_swevent_put_recursion_context(rctx);
1380end_recursion:
1381 local_irq_restore(irq_flags);
1382 1394
1383 return 0; 1395 head = this_cpu_ptr(call->perf_events);
1396 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1384} 1397}
1385 1398
1386static int probe_profile_enable(struct ftrace_event_call *call) 1399static int probe_perf_enable(struct ftrace_event_call *call)
1387{ 1400{
1388 struct trace_probe *tp = (struct trace_probe *)call->data; 1401 struct trace_probe *tp = (struct trace_probe *)call->data;
1389 1402
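
__set_print_fmt() in the hunk above is written to be called twice: with len = 0 every snprintf() only measures, and the running pos adds up the length that set_print_fmt() must allocate before the second, real pass; the LEN_OR_ZERO macro keeps the remaining-space argument at zero while measuring. The same two-pass pattern as a standalone program; the format being assembled here is only an example.

#include <stdio.h>
#include <stdlib.h>

/* Builds "name=%d ..." into buf; returns the length needed (without NUL). */
static int build_fmt(char *buf, int len, const char *names[], int n)
{
	int i, pos = 0;

/* During the measuring pass (len == 0) nothing is written anywhere. */
#define DST		(len ? buf + pos : NULL)
#define LEN_OR_ZERO	(len ? len - pos : 0)

	pos += snprintf(DST, LEN_OR_ZERO, "\"");
	for (i = 0; i < n; i++)
		pos += snprintf(DST, LEN_OR_ZERO, "%s%s=%%d",
				i ? " " : "", names[i]);
	pos += snprintf(DST, LEN_OR_ZERO, "\"");

#undef DST
#undef LEN_OR_ZERO
	return pos;
}

int main(void)
{
	const char *names[] = { "dfd", "flags", "mode" };
	int len, n = 3;
	char *fmt;

	/* First pass measures, second pass fills the freshly sized buffer. */
	len = build_fmt(NULL, 0, names, n);
	fmt = malloc(len + 1);
	if (!fmt)
		return 1;
	build_fmt(fmt, len + 1, names, n);
	printf("print fmt: %s (needs %d bytes + NUL)\n", fmt, len);
	free(fmt);
	return 0;
}
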
@@ -1395,7 +1408,7 @@ static int probe_profile_enable(struct ftrace_event_call *call)
1395 return enable_kprobe(&tp->rp.kp); 1408 return enable_kprobe(&tp->rp.kp);
1396} 1409}
1397 1410
1398static void probe_profile_disable(struct ftrace_event_call *call) 1411static void probe_perf_disable(struct ftrace_event_call *call)
1399{ 1412{
1400 struct trace_probe *tp = (struct trace_probe *)call->data; 1413 struct trace_probe *tp = (struct trace_probe *)call->data;
1401 1414
@@ -1408,8 +1421,28 @@ static void probe_profile_disable(struct ftrace_event_call *call)
1408 disable_kprobe(&tp->rp.kp); 1421 disable_kprobe(&tp->rp.kp);
1409 } 1422 }
1410} 1423}
1411#endif /* CONFIG_EVENT_PROFILE */ 1424#endif /* CONFIG_PERF_EVENTS */
1412 1425
1426static __kprobes
1427int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1428{
1429 switch (type) {
1430 case TRACE_REG_REGISTER:
1431 return probe_event_enable(event);
1432 case TRACE_REG_UNREGISTER:
1433 probe_event_disable(event);
1434 return 0;
1435
1436#ifdef CONFIG_PERF_EVENTS
1437 case TRACE_REG_PERF_REGISTER:
1438 return probe_perf_enable(event);
1439 case TRACE_REG_PERF_UNREGISTER:
1440 probe_perf_disable(event);
1441 return 0;
1442#endif
1443 }
1444 return 0;
1445}
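
kprobe_register() replaces the separate regfunc/unregfunc/profile_enable/profile_disable pointers with a single reg() hook that switches on enum trace_reg, so the ftrace and perf paths share one entry point. A compact sketch of that dispatch style; struct event and the handlers below are stand-ins that just print, not the kernel's event call.

#include <stdio.h>

enum trace_reg {
	TRACE_REG_REGISTER,
	TRACE_REG_UNREGISTER,
	TRACE_REG_PERF_REGISTER,
	TRACE_REG_PERF_UNREGISTER,
};

struct event {
	const char *name;
	int (*reg)(struct event *ev, enum trace_reg type);
};

static int my_event_reg(struct event *ev, enum trace_reg type)
{
	switch (type) {
	case TRACE_REG_REGISTER:
		printf("%s: enable tracing\n", ev->name);
		return 0;
	case TRACE_REG_UNREGISTER:
		printf("%s: disable tracing\n", ev->name);
		return 0;
	case TRACE_REG_PERF_REGISTER:
		printf("%s: enable perf\n", ev->name);
		return 0;
	case TRACE_REG_PERF_UNREGISTER:
		printf("%s: disable perf\n", ev->name);
		return 0;
	}
	return 0;
}

int main(void)
{
	struct event ev = { .name = "kprobe_event", .reg = my_event_reg };

	/* One callback covers both the ftrace and the perf paths. */
	ev.reg(&ev, TRACE_REG_REGISTER);
	ev.reg(&ev, TRACE_REG_PERF_REGISTER);
	ev.reg(&ev, TRACE_REG_PERF_UNREGISTER);
	ev.reg(&ev, TRACE_REG_UNREGISTER);
	return 0;
}
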
1413 1446
1414static __kprobes 1447static __kprobes
1415int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) 1448int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
@@ -1418,10 +1451,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1418 1451
1419 if (tp->flags & TP_FLAG_TRACE) 1452 if (tp->flags & TP_FLAG_TRACE)
1420 kprobe_trace_func(kp, regs); 1453 kprobe_trace_func(kp, regs);
1421#ifdef CONFIG_EVENT_PROFILE 1454#ifdef CONFIG_PERF_EVENTS
1422 if (tp->flags & TP_FLAG_PROFILE) 1455 if (tp->flags & TP_FLAG_PROFILE)
1423 kprobe_profile_func(kp, regs); 1456 kprobe_perf_func(kp, regs);
1424#endif /* CONFIG_EVENT_PROFILE */ 1457#endif
1425 return 0; /* We don't tweek kernel, so just return 0 */ 1458 return 0; /* We don't tweek kernel, so just return 0 */
1426} 1459}
1427 1460
@@ -1432,13 +1465,21 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1432 1465
1433 if (tp->flags & TP_FLAG_TRACE) 1466 if (tp->flags & TP_FLAG_TRACE)
1434 kretprobe_trace_func(ri, regs); 1467 kretprobe_trace_func(ri, regs);
1435#ifdef CONFIG_EVENT_PROFILE 1468#ifdef CONFIG_PERF_EVENTS
1436 if (tp->flags & TP_FLAG_PROFILE) 1469 if (tp->flags & TP_FLAG_PROFILE)
1437 kretprobe_profile_func(ri, regs); 1470 kretprobe_perf_func(ri, regs);
1438#endif /* CONFIG_EVENT_PROFILE */ 1471#endif
1439 return 0; /* We don't tweek kernel, so just return 0 */ 1472 return 0; /* We don't tweek kernel, so just return 0 */
1440} 1473}
1441 1474
1475static struct trace_event_functions kretprobe_funcs = {
1476 .trace = print_kretprobe_event
1477};
1478
1479static struct trace_event_functions kprobe_funcs = {
1480 .trace = print_kprobe_event
1481};
1482
1442static int register_probe_event(struct trace_probe *tp) 1483static int register_probe_event(struct trace_probe *tp)
1443{ 1484{
1444 struct ftrace_event_call *call = &tp->call; 1485 struct ftrace_event_call *call = &tp->call;
@@ -1446,33 +1487,31 @@ static int register_probe_event(struct trace_probe *tp)
1446 1487
1447 /* Initialize ftrace_event_call */ 1488 /* Initialize ftrace_event_call */
1448 if (probe_is_return(tp)) { 1489 if (probe_is_return(tp)) {
1449 tp->event.trace = print_kretprobe_event; 1490 INIT_LIST_HEAD(&call->class->fields);
1450 call->raw_init = probe_event_raw_init; 1491 call->event.funcs = &kretprobe_funcs;
1451 call->show_format = kretprobe_event_show_format; 1492 call->class->raw_init = probe_event_raw_init;
1452 call->define_fields = kretprobe_event_define_fields; 1493 call->class->define_fields = kretprobe_event_define_fields;
1453 } else { 1494 } else {
1454 tp->event.trace = print_kprobe_event; 1495 INIT_LIST_HEAD(&call->class->fields);
1455 call->raw_init = probe_event_raw_init; 1496 call->event.funcs = &kprobe_funcs;
1456 call->show_format = kprobe_event_show_format; 1497 call->class->raw_init = probe_event_raw_init;
1457 call->define_fields = kprobe_event_define_fields; 1498 call->class->define_fields = kprobe_event_define_fields;
1458 } 1499 }
1459 call->event = &tp->event; 1500 if (set_print_fmt(tp) < 0)
1460 call->id = register_ftrace_event(&tp->event); 1501 return -ENOMEM;
1461 if (!call->id) 1502 ret = register_ftrace_event(&call->event);
1503 if (!ret) {
1504 kfree(call->print_fmt);
1462 return -ENODEV; 1505 return -ENODEV;
1463 call->enabled = 0; 1506 }
1464 call->regfunc = probe_event_enable; 1507 call->flags = 0;
1465 call->unregfunc = probe_event_disable; 1508 call->class->reg = kprobe_register;
1466
1467#ifdef CONFIG_EVENT_PROFILE
1468 call->profile_enable = probe_profile_enable;
1469 call->profile_disable = probe_profile_disable;
1470#endif
1471 call->data = tp; 1509 call->data = tp;
1472 ret = trace_add_event_call(call); 1510 ret = trace_add_event_call(call);
1473 if (ret) { 1511 if (ret) {
1474 pr_info("Failed to register kprobe event: %s\n", call->name); 1512 pr_info("Failed to register kprobe event: %s\n", call->name);
1475 unregister_ftrace_event(&tp->event); 1513 kfree(call->print_fmt);
1514 unregister_ftrace_event(&call->event);
1476 } 1515 }
1477 return ret; 1516 return ret;
1478} 1517}
@@ -1481,6 +1520,7 @@ static void unregister_probe_event(struct trace_probe *tp)
1481{ 1520{
1482 /* tp->event is unregistered in trace_remove_event_call() */ 1521 /* tp->event is unregistered in trace_remove_event_call() */
1483 trace_remove_event_call(&tp->call); 1522 trace_remove_event_call(&tp->call);
1523 kfree(tp->call.print_fmt);
1484} 1524}
1485 1525
1486/* Make a debugfs interface for controling probe points */ 1526/* Make a debugfs interface for controling probe points */
@@ -1523,28 +1563,67 @@ static int kprobe_trace_selftest_target(int a1, int a2, int a3,
1523 1563
1524static __init int kprobe_trace_self_tests_init(void) 1564static __init int kprobe_trace_self_tests_init(void)
1525{ 1565{
1526 int ret; 1566 int ret, warn = 0;
1527 int (*target)(int, int, int, int, int, int); 1567 int (*target)(int, int, int, int, int, int);
1568 struct trace_probe *tp;
1528 1569
1529 target = kprobe_trace_selftest_target; 1570 target = kprobe_trace_selftest_target;
1530 1571
1531 pr_info("Testing kprobe tracing: "); 1572 pr_info("Testing kprobe tracing: ");
1532 1573
1533 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " 1574 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1534 "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); 1575 "$stack $stack0 +0($stack)");
1535 if (WARN_ON_ONCE(ret)) 1576 if (WARN_ON_ONCE(ret)) {
1536 pr_warning("error enabling function entry\n"); 1577 pr_warning("error on probing function entry.\n");
1578 warn++;
1579 } else {
1580 /* Enable trace point */
1581 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM);
1582 if (WARN_ON_ONCE(tp == NULL)) {
1583 pr_warning("error on getting new probe.\n");
1584 warn++;
1585 } else
1586 probe_event_enable(&tp->call);
1587 }
1537 1588
1538 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 1589 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1539 "$retval"); 1590 "$retval");
1540 if (WARN_ON_ONCE(ret)) 1591 if (WARN_ON_ONCE(ret)) {
1541 pr_warning("error enabling function return\n"); 1592 pr_warning("error on probing function return.\n");
1593 warn++;
1594 } else {
1595 /* Enable trace point */
1596 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM);
1597 if (WARN_ON_ONCE(tp == NULL)) {
1598 pr_warning("error on getting new probe.\n");
1599 warn++;
1600 } else
1601 probe_event_enable(&tp->call);
1602 }
1603
1604 if (warn)
1605 goto end;
1542 1606
1543 ret = target(1, 2, 3, 4, 5, 6); 1607 ret = target(1, 2, 3, 4, 5, 6);
1544 1608
1545 cleanup_all_probes(); 1609 ret = command_trace_probe("-:testprobe");
1610 if (WARN_ON_ONCE(ret)) {
1611 pr_warning("error on deleting a probe.\n");
1612 warn++;
1613 }
1614
1615 ret = command_trace_probe("-:testprobe2");
1616 if (WARN_ON_ONCE(ret)) {
1617 pr_warning("error on deleting a probe.\n");
1618 warn++;
1619 }
1546 1620
1547 pr_cont("OK\n"); 1621end:
1622 cleanup_all_probes();
1623 if (warn)
1624 pr_cont("NG: Some tests are failed. Please check them.\n");
1625 else
1626 pr_cont("OK\n");
1548 return 0; 1627 return 0;
1549} 1628}
1550 1629
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 94103cdcf9d8..8eaf00749b65 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -23,6 +23,7 @@
23#include <linux/debugfs.h> 23#include <linux/debugfs.h>
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/slab.h>
26#include <linux/fs.h> 27#include <linux/fs.h>
27 28
28#include "trace_output.h" 29#include "trace_output.h"
@@ -33,12 +34,6 @@
33 34
34#include <asm/atomic.h> 35#include <asm/atomic.h>
35 36
36/*
37 * For now, let us restrict the no. of symbols traced simultaneously to number
38 * of available hardware breakpoint registers.
39 */
40#define KSYM_TRACER_MAX HBP_NUM
41
42#define KSYM_TRACER_OP_LEN 3 /* rw- */ 37#define KSYM_TRACER_OP_LEN 3 /* rw- */
43 38
44struct trace_ksym { 39struct trace_ksym {
@@ -52,7 +47,6 @@ struct trace_ksym {
52 47
53static struct trace_array *ksym_trace_array; 48static struct trace_array *ksym_trace_array;
54 49
55static unsigned int ksym_filter_entry_count;
56static unsigned int ksym_tracing_enabled; 50static unsigned int ksym_tracing_enabled;
57 51
58static HLIST_HEAD(ksym_filter_head); 52static HLIST_HEAD(ksym_filter_head);
@@ -180,13 +174,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
180 struct trace_ksym *entry; 174 struct trace_ksym *entry;
181 int ret = -ENOMEM; 175 int ret = -ENOMEM;
182 176
183 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
184 printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
185 " new requests for tracing can be accepted now.\n",
186 KSYM_TRACER_MAX);
187 return -ENOSPC;
188 }
189
190 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); 177 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
191 if (!entry) 178 if (!entry)
192 return -ENOMEM; 179 return -ENOMEM;
@@ -202,13 +189,17 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
202 189
203 if (IS_ERR(entry->ksym_hbp)) { 190 if (IS_ERR(entry->ksym_hbp)) {
204 ret = PTR_ERR(entry->ksym_hbp); 191 ret = PTR_ERR(entry->ksym_hbp);
205 printk(KERN_INFO "ksym_tracer request failed. Try again" 192 if (ret == -ENOSPC) {
206 " later!!\n"); 193 printk(KERN_ERR "ksym_tracer: Maximum limit reached."
194 " No new requests for tracing can be accepted now.\n");
195 } else {
196 printk(KERN_INFO "ksym_tracer request failed. Try again"
197 " later!!\n");
198 }
207 goto err; 199 goto err;
208 } 200 }
209 201
210 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); 202 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
211 ksym_filter_entry_count++;
212 203
213 return 0; 204 return 0;
214 205
@@ -264,7 +255,6 @@ static void __ksym_trace_reset(void)
264 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, 255 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
265 ksym_hlist) { 256 ksym_hlist) {
266 unregister_wide_hw_breakpoint(entry->ksym_hbp); 257 unregister_wide_hw_breakpoint(entry->ksym_hbp);
267 ksym_filter_entry_count--;
268 hlist_del_rcu(&(entry->ksym_hlist)); 258 hlist_del_rcu(&(entry->ksym_hlist));
269 synchronize_rcu(); 259 synchronize_rcu();
270 kfree(entry); 260 kfree(entry);
@@ -337,7 +327,6 @@ static ssize_t ksym_trace_filter_write(struct file *file,
337 goto out_unlock; 327 goto out_unlock;
338 } 328 }
339 /* Error or "symbol:---" case: drop it */ 329 /* Error or "symbol:---" case: drop it */
340 ksym_filter_entry_count--;
341 hlist_del_rcu(&(entry->ksym_hlist)); 330 hlist_del_rcu(&(entry->ksym_hlist));
342 synchronize_rcu(); 331 synchronize_rcu();
343 kfree(entry); 332 kfree(entry);
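
The ksym tracer stops counting entries against its own KSYM_TRACER_MAX and instead lets the hw-breakpoint layer enforce the limit, decoding -ENOSPC out of the pointer returned by register_wide_hw_breakpoint(). Here is a sketch of the ERR_PTR/IS_ERR/PTR_ERR convention that makes this possible; the helpers are reimplemented locally and register_breakpoint() is a made-up stand-in, not the kernel API.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Error codes folded into the pointer value, like the kernel's ERR_PTR(). */
#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Stand-in for a registration call that fails with -ENOSPC when full. */
static void *register_breakpoint(int slots_left)
{
	if (!slots_left)
		return ERR_PTR(-ENOSPC);
	return malloc(16);	/* any real object */
}

int main(void)
{
	void *bp = register_breakpoint(0);

	if (IS_ERR(bp)) {
		long err = PTR_ERR(bp);

		if (err == -ENOSPC)
			printf("maximum breakpoints reached\n");
		else
			printf("registration failed: %ld\n", err);
		return 1;
	}
	free(bp);
	return 0;
}
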
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0acd834659ed..017fa376505d 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,7 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/mmiotrace.h> 10#include <linux/mmiotrace.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/slab.h>
12#include <linux/time.h> 13#include <linux/time.h>
13 14
14#include <asm/atomic.h> 15#include <asm/atomic.h>
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8e46b3323cdc..57c1b4596470 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -209,6 +209,7 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
209 209
210 return 1; 210 return 1;
211} 211}
212EXPORT_SYMBOL(trace_seq_putc);
212 213
213int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) 214int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
214{ 215{
@@ -253,7 +254,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
253 void *ret; 254 void *ret;
254 255
255 if (s->full) 256 if (s->full)
256 return 0; 257 return NULL;
257 258
258 if (len > ((PAGE_SIZE - 1) - s->len)) { 259 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1; 260 s->full = 1;
@@ -355,6 +356,21 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
355} 356}
356EXPORT_SYMBOL(ftrace_print_symbols_seq); 357EXPORT_SYMBOL(ftrace_print_symbols_seq);
357 358
359const char *
360ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
361{
362 int i;
363 const char *ret = p->buffer + p->len;
364
365 for (i = 0; i < buf_len; i++)
366 trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
367
368 trace_seq_putc(p, 0);
369
370 return ret;
371}
372EXPORT_SYMBOL(ftrace_print_hex_seq);
373
358#ifdef CONFIG_KRETPROBES 374#ifdef CONFIG_KRETPROBES
359static inline const char *kretprobed(const char *name) 375static inline const char *kretprobed(const char *name)
360{ 376{
@@ -726,6 +742,9 @@ int register_ftrace_event(struct trace_event *event)
726 if (WARN_ON(!event)) 742 if (WARN_ON(!event))
727 goto out; 743 goto out;
728 744
745 if (WARN_ON(!event->funcs))
746 goto out;
747
729 INIT_LIST_HEAD(&event->list); 748 INIT_LIST_HEAD(&event->list);
730 749
731 if (!event->type) { 750 if (!event->type) {
@@ -758,14 +777,14 @@ int register_ftrace_event(struct trace_event *event)
758 goto out; 777 goto out;
759 } 778 }
760 779
761 if (event->trace == NULL) 780 if (event->funcs->trace == NULL)
762 event->trace = trace_nop_print; 781 event->funcs->trace = trace_nop_print;
763 if (event->raw == NULL) 782 if (event->funcs->raw == NULL)
764 event->raw = trace_nop_print; 783 event->funcs->raw = trace_nop_print;
765 if (event->hex == NULL) 784 if (event->funcs->hex == NULL)
766 event->hex = trace_nop_print; 785 event->funcs->hex = trace_nop_print;
767 if (event->binary == NULL) 786 if (event->funcs->binary == NULL)
768 event->binary = trace_nop_print; 787 event->funcs->binary = trace_nop_print;
769 788
770 key = event->type & (EVENT_HASHSIZE - 1); 789 key = event->type & (EVENT_HASHSIZE - 1);
771 790
@@ -807,13 +826,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
807 * Standard events 826 * Standard events
808 */ 827 */
809 828
810enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) 829enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
830 struct trace_event *event)
811{ 831{
812 return TRACE_TYPE_HANDLED; 832 return TRACE_TYPE_HANDLED;
813} 833}
814 834
815/* TRACE_FN */ 835/* TRACE_FN */
816static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) 836static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
837 struct trace_event *event)
817{ 838{
818 struct ftrace_entry *field; 839 struct ftrace_entry *field;
819 struct trace_seq *s = &iter->seq; 840 struct trace_seq *s = &iter->seq;
@@ -840,7 +861,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
840 return TRACE_TYPE_PARTIAL_LINE; 861 return TRACE_TYPE_PARTIAL_LINE;
841} 862}
842 863
843static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) 864static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
865 struct trace_event *event)
844{ 866{
845 struct ftrace_entry *field; 867 struct ftrace_entry *field;
846 868
@@ -854,7 +876,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
854 return TRACE_TYPE_HANDLED; 876 return TRACE_TYPE_HANDLED;
855} 877}
856 878
857static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) 879static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
880 struct trace_event *event)
858{ 881{
859 struct ftrace_entry *field; 882 struct ftrace_entry *field;
860 struct trace_seq *s = &iter->seq; 883 struct trace_seq *s = &iter->seq;
@@ -867,7 +890,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
867 return TRACE_TYPE_HANDLED; 890 return TRACE_TYPE_HANDLED;
868} 891}
869 892
870static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) 893static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
894 struct trace_event *event)
871{ 895{
872 struct ftrace_entry *field; 896 struct ftrace_entry *field;
873 struct trace_seq *s = &iter->seq; 897 struct trace_seq *s = &iter->seq;
@@ -880,14 +904,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
880 return TRACE_TYPE_HANDLED; 904 return TRACE_TYPE_HANDLED;
881} 905}
882 906
883static struct trace_event trace_fn_event = { 907static struct trace_event_functions trace_fn_funcs = {
884 .type = TRACE_FN,
885 .trace = trace_fn_trace, 908 .trace = trace_fn_trace,
886 .raw = trace_fn_raw, 909 .raw = trace_fn_raw,
887 .hex = trace_fn_hex, 910 .hex = trace_fn_hex,
888 .binary = trace_fn_bin, 911 .binary = trace_fn_bin,
889}; 912};
890 913
914static struct trace_event trace_fn_event = {
915 .type = TRACE_FN,
916 .funcs = &trace_fn_funcs,
917};
918
891/* TRACE_CTX an TRACE_WAKE */ 919/* TRACE_CTX an TRACE_WAKE */
892static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, 920static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
893 char *delim) 921 char *delim)
@@ -916,13 +944,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
916 return TRACE_TYPE_HANDLED; 944 return TRACE_TYPE_HANDLED;
917} 945}
918 946
919static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags) 947static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
948 struct trace_event *event)
920{ 949{
921 return trace_ctxwake_print(iter, "==>"); 950 return trace_ctxwake_print(iter, "==>");
922} 951}
923 952
924static enum print_line_t trace_wake_print(struct trace_iterator *iter, 953static enum print_line_t trace_wake_print(struct trace_iterator *iter,
925 int flags) 954 int flags, struct trace_event *event)
926{ 955{
927 return trace_ctxwake_print(iter, " +"); 956 return trace_ctxwake_print(iter, " +");
928} 957}
@@ -950,12 +979,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
950 return TRACE_TYPE_HANDLED; 979 return TRACE_TYPE_HANDLED;
951} 980}
952 981
953static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags) 982static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
983 struct trace_event *event)
954{ 984{
955 return trace_ctxwake_raw(iter, 0); 985 return trace_ctxwake_raw(iter, 0);
956} 986}
957 987
958static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags) 988static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags,
989 struct trace_event *event)
959{ 990{
960 return trace_ctxwake_raw(iter, '+'); 991 return trace_ctxwake_raw(iter, '+');
961} 992}
@@ -984,18 +1015,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
984 return TRACE_TYPE_HANDLED; 1015 return TRACE_TYPE_HANDLED;
985} 1016}
986 1017
987static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags) 1018static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
1019 struct trace_event *event)
988{ 1020{
989 return trace_ctxwake_hex(iter, 0); 1021 return trace_ctxwake_hex(iter, 0);
990} 1022}
991 1023
992static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags) 1024static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags,
1025 struct trace_event *event)
993{ 1026{
994 return trace_ctxwake_hex(iter, '+'); 1027 return trace_ctxwake_hex(iter, '+');
995} 1028}
996 1029
997static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, 1030static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
998 int flags) 1031 int flags, struct trace_event *event)
999{ 1032{
1000 struct ctx_switch_entry *field; 1033 struct ctx_switch_entry *field;
1001 struct trace_seq *s = &iter->seq; 1034 struct trace_seq *s = &iter->seq;
@@ -1012,25 +1045,33 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
1012 return TRACE_TYPE_HANDLED; 1045 return TRACE_TYPE_HANDLED;
1013} 1046}
1014 1047
1015static struct trace_event trace_ctx_event = { 1048static struct trace_event_functions trace_ctx_funcs = {
1016 .type = TRACE_CTX,
1017 .trace = trace_ctx_print, 1049 .trace = trace_ctx_print,
1018 .raw = trace_ctx_raw, 1050 .raw = trace_ctx_raw,
1019 .hex = trace_ctx_hex, 1051 .hex = trace_ctx_hex,
1020 .binary = trace_ctxwake_bin, 1052 .binary = trace_ctxwake_bin,
1021}; 1053};
1022 1054
1023static struct trace_event trace_wake_event = { 1055static struct trace_event trace_ctx_event = {
1024 .type = TRACE_WAKE, 1056 .type = TRACE_CTX,
1057 .funcs = &trace_ctx_funcs,
1058};
1059
1060static struct trace_event_functions trace_wake_funcs = {
1025 .trace = trace_wake_print, 1061 .trace = trace_wake_print,
1026 .raw = trace_wake_raw, 1062 .raw = trace_wake_raw,
1027 .hex = trace_wake_hex, 1063 .hex = trace_wake_hex,
1028 .binary = trace_ctxwake_bin, 1064 .binary = trace_ctxwake_bin,
1029}; 1065};
1030 1066
1067static struct trace_event trace_wake_event = {
1068 .type = TRACE_WAKE,
1069 .funcs = &trace_wake_funcs,
1070};
1071
1031/* TRACE_SPECIAL */ 1072/* TRACE_SPECIAL */
1032static enum print_line_t trace_special_print(struct trace_iterator *iter, 1073static enum print_line_t trace_special_print(struct trace_iterator *iter,
1033 int flags) 1074 int flags, struct trace_event *event)
1034{ 1075{
1035 struct special_entry *field; 1076 struct special_entry *field;
1036 1077
@@ -1046,7 +1087,7 @@ static enum print_line_t trace_special_print(struct trace_iterator *iter,
1046} 1087}
1047 1088
1048static enum print_line_t trace_special_hex(struct trace_iterator *iter, 1089static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1049 int flags) 1090 int flags, struct trace_event *event)
1050{ 1091{
1051 struct special_entry *field; 1092 struct special_entry *field;
1052 struct trace_seq *s = &iter->seq; 1093 struct trace_seq *s = &iter->seq;
@@ -1061,7 +1102,7 @@ static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1061} 1102}
1062 1103
1063static enum print_line_t trace_special_bin(struct trace_iterator *iter, 1104static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1064 int flags) 1105 int flags, struct trace_event *event)
1065{ 1106{
1066 struct special_entry *field; 1107 struct special_entry *field;
1067 struct trace_seq *s = &iter->seq; 1108 struct trace_seq *s = &iter->seq;
@@ -1075,18 +1116,22 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1075 return TRACE_TYPE_HANDLED; 1116 return TRACE_TYPE_HANDLED;
1076} 1117}
1077 1118
1078static struct trace_event trace_special_event = { 1119static struct trace_event_functions trace_special_funcs = {
1079 .type = TRACE_SPECIAL,
1080 .trace = trace_special_print, 1120 .trace = trace_special_print,
1081 .raw = trace_special_print, 1121 .raw = trace_special_print,
1082 .hex = trace_special_hex, 1122 .hex = trace_special_hex,
1083 .binary = trace_special_bin, 1123 .binary = trace_special_bin,
1084}; 1124};
1085 1125
1126static struct trace_event trace_special_event = {
1127 .type = TRACE_SPECIAL,
1128 .funcs = &trace_special_funcs,
1129};
1130
1086/* TRACE_STACK */ 1131/* TRACE_STACK */
1087 1132
1088static enum print_line_t trace_stack_print(struct trace_iterator *iter, 1133static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1089 int flags) 1134 int flags, struct trace_event *event)
1090{ 1135{
1091 struct stack_entry *field; 1136 struct stack_entry *field;
1092 struct trace_seq *s = &iter->seq; 1137 struct trace_seq *s = &iter->seq;
@@ -1114,17 +1159,21 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1114 return TRACE_TYPE_PARTIAL_LINE; 1159 return TRACE_TYPE_PARTIAL_LINE;
1115} 1160}
1116 1161
1117static struct trace_event trace_stack_event = { 1162static struct trace_event_functions trace_stack_funcs = {
1118 .type = TRACE_STACK,
1119 .trace = trace_stack_print, 1163 .trace = trace_stack_print,
1120 .raw = trace_special_print, 1164 .raw = trace_special_print,
1121 .hex = trace_special_hex, 1165 .hex = trace_special_hex,
1122 .binary = trace_special_bin, 1166 .binary = trace_special_bin,
1123}; 1167};
1124 1168
1169static struct trace_event trace_stack_event = {
1170 .type = TRACE_STACK,
1171 .funcs = &trace_stack_funcs,
1172};
1173
1125/* TRACE_USER_STACK */ 1174/* TRACE_USER_STACK */
1126static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, 1175static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1127 int flags) 1176 int flags, struct trace_event *event)
1128{ 1177{
1129 struct userstack_entry *field; 1178 struct userstack_entry *field;
1130 struct trace_seq *s = &iter->seq; 1179 struct trace_seq *s = &iter->seq;
@@ -1143,17 +1192,22 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1143 return TRACE_TYPE_PARTIAL_LINE; 1192 return TRACE_TYPE_PARTIAL_LINE;
1144} 1193}
1145 1194
1146static struct trace_event trace_user_stack_event = { 1195static struct trace_event_functions trace_user_stack_funcs = {
1147 .type = TRACE_USER_STACK,
1148 .trace = trace_user_stack_print, 1196 .trace = trace_user_stack_print,
1149 .raw = trace_special_print, 1197 .raw = trace_special_print,
1150 .hex = trace_special_hex, 1198 .hex = trace_special_hex,
1151 .binary = trace_special_bin, 1199 .binary = trace_special_bin,
1152}; 1200};
1153 1201
1202static struct trace_event trace_user_stack_event = {
1203 .type = TRACE_USER_STACK,
1204 .funcs = &trace_user_stack_funcs,
1205};
1206
1154/* TRACE_BPRINT */ 1207/* TRACE_BPRINT */
1155static enum print_line_t 1208static enum print_line_t
1156trace_bprint_print(struct trace_iterator *iter, int flags) 1209trace_bprint_print(struct trace_iterator *iter, int flags,
1210 struct trace_event *event)
1157{ 1211{
1158 struct trace_entry *entry = iter->ent; 1212 struct trace_entry *entry = iter->ent;
1159 struct trace_seq *s = &iter->seq; 1213 struct trace_seq *s = &iter->seq;
@@ -1178,7 +1232,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags)
1178 1232
1179 1233
1180static enum print_line_t 1234static enum print_line_t
1181trace_bprint_raw(struct trace_iterator *iter, int flags) 1235trace_bprint_raw(struct trace_iterator *iter, int flags,
1236 struct trace_event *event)
1182{ 1237{
1183 struct bprint_entry *field; 1238 struct bprint_entry *field;
1184 struct trace_seq *s = &iter->seq; 1239 struct trace_seq *s = &iter->seq;
@@ -1197,16 +1252,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags)
1197 return TRACE_TYPE_PARTIAL_LINE; 1252 return TRACE_TYPE_PARTIAL_LINE;
1198} 1253}
1199 1254
1255static struct trace_event_functions trace_bprint_funcs = {
1256 .trace = trace_bprint_print,
1257 .raw = trace_bprint_raw,
1258};
1200 1259
1201static struct trace_event trace_bprint_event = { 1260static struct trace_event trace_bprint_event = {
1202 .type = TRACE_BPRINT, 1261 .type = TRACE_BPRINT,
1203 .trace = trace_bprint_print, 1262 .funcs = &trace_bprint_funcs,
1204 .raw = trace_bprint_raw,
1205}; 1263};
1206 1264
1207/* TRACE_PRINT */ 1265/* TRACE_PRINT */
1208static enum print_line_t trace_print_print(struct trace_iterator *iter, 1266static enum print_line_t trace_print_print(struct trace_iterator *iter,
1209 int flags) 1267 int flags, struct trace_event *event)
1210{ 1268{
1211 struct print_entry *field; 1269 struct print_entry *field;
1212 struct trace_seq *s = &iter->seq; 1270 struct trace_seq *s = &iter->seq;
@@ -1225,7 +1283,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
1225 return TRACE_TYPE_PARTIAL_LINE; 1283 return TRACE_TYPE_PARTIAL_LINE;
1226} 1284}
1227 1285
1228static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) 1286static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
1287 struct trace_event *event)
1229{ 1288{
1230 struct print_entry *field; 1289 struct print_entry *field;
1231 1290
@@ -1240,12 +1299,16 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
1240 return TRACE_TYPE_PARTIAL_LINE; 1299 return TRACE_TYPE_PARTIAL_LINE;
1241} 1300}
1242 1301
1243static struct trace_event trace_print_event = { 1302static struct trace_event_functions trace_print_funcs = {
1244 .type = TRACE_PRINT,
1245 .trace = trace_print_print, 1303 .trace = trace_print_print,
1246 .raw = trace_print_raw, 1304 .raw = trace_print_raw,
1247}; 1305};
1248 1306
1307static struct trace_event trace_print_event = {
1308 .type = TRACE_PRINT,
1309 .funcs = &trace_print_funcs,
1310};
1311
1249 1312
1250static struct trace_event *events[] __initdata = { 1313static struct trace_event *events[] __initdata = {
1251 &trace_fn_event, 1314 &trace_fn_event,
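
The hunks above all apply one conversion: the per-event output callbacks move out of struct trace_event into a shared struct trace_event_functions, and struct trace_event keeps only the type id plus a pointer to that table (the callbacks themselves also gain a struct trace_event * argument). A minimal sketch of the resulting shape, written as if it sat next to the code in this diff and using the types and the TRACE_STACK id that appear here; the my_* names are illustrative:

/* Sketch: wiring an event's output callbacks after the funcs split. */
static enum print_line_t my_stack_print(struct trace_iterator *iter, int flags,
                                        struct trace_event *event)
{
        /* the extra 'event' argument lets one callback serve several events */
        return TRACE_TYPE_HANDLED;
}

static struct trace_event_functions my_stack_funcs = {
        .trace  = my_stack_print,
        /* .raw, .hex and .binary can be filled in the same way when needed */
};

static struct trace_event my_stack_event = {
        .type   = TRACE_STACK,          /* event type id, as before */
        .funcs  = &my_stack_funcs,      /* shared callback table */
};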
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 9d91c72ba38b..c038eba0492b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -25,7 +25,7 @@ extern void trace_event_read_unlock(void);
25extern struct trace_event *ftrace_find_event(int type); 25extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags, struct trace_event *event);
29extern int 29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); 30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
31 31
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 5fca0f51fde4..8f758d070c43 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
50} 50}
51 51
52static void 52static void
53probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
54 struct task_struct *next)
55{ 54{
56 struct trace_array_cpu *data; 55 struct trace_array_cpu *data;
57 unsigned long flags; 56 unsigned long flags;
@@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
109} 108}
110 109
111static void 110static void
112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 111probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
113{ 112{
114 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
115 unsigned long flags; 114 unsigned long flags;
@@ -139,21 +138,21 @@ static int tracing_sched_register(void)
139{ 138{
140 int ret; 139 int ret;
141 140
142 ret = register_trace_sched_wakeup(probe_sched_wakeup); 141 ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
143 if (ret) { 142 if (ret) {
144 pr_info("wakeup trace: Couldn't activate tracepoint" 143 pr_info("wakeup trace: Couldn't activate tracepoint"
145 " probe to kernel_sched_wakeup\n"); 144 " probe to kernel_sched_wakeup\n");
146 return ret; 145 return ret;
147 } 146 }
148 147
149 ret = register_trace_sched_wakeup_new(probe_sched_wakeup); 148 ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
150 if (ret) { 149 if (ret) {
151 pr_info("wakeup trace: Couldn't activate tracepoint" 150 pr_info("wakeup trace: Couldn't activate tracepoint"
152 " probe to kernel_sched_wakeup_new\n"); 151 " probe to kernel_sched_wakeup_new\n");
153 goto fail_deprobe; 152 goto fail_deprobe;
154 } 153 }
155 154
156 ret = register_trace_sched_switch(probe_sched_switch); 155 ret = register_trace_sched_switch(probe_sched_switch, NULL);
157 if (ret) { 156 if (ret) {
158 pr_info("sched trace: Couldn't activate tracepoint" 157 pr_info("sched trace: Couldn't activate tracepoint"
159 " probe to kernel_sched_switch\n"); 158 " probe to kernel_sched_switch\n");
@@ -162,17 +161,17 @@ static int tracing_sched_register(void)
162 161
163 return ret; 162 return ret;
164fail_deprobe_wake_new: 163fail_deprobe_wake_new:
165 unregister_trace_sched_wakeup_new(probe_sched_wakeup); 164 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
166fail_deprobe: 165fail_deprobe:
167 unregister_trace_sched_wakeup(probe_sched_wakeup); 166 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
168 return ret; 167 return ret;
169} 168}
170 169
171static void tracing_sched_unregister(void) 170static void tracing_sched_unregister(void)
172{ 171{
173 unregister_trace_sched_switch(probe_sched_switch); 172 unregister_trace_sched_switch(probe_sched_switch, NULL);
174 unregister_trace_sched_wakeup_new(probe_sched_wakeup); 173 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
175 unregister_trace_sched_wakeup(probe_sched_wakeup); 174 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
176} 175}
177 176
178static void tracing_start_sched_switch(void) 177static void tracing_start_sched_switch(void)
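
The mechanical change in this file — every probe gains a leading void * parameter and every register/unregister call passes an extra argument — follows from tracepoints now carrying per-probe private data (see the kernel/tracepoint.c hunks later in this diff). A hedged sketch of a caller after the conversion; the probe body and the my_* names are illustrative, while the NULL data pointer mirrors what this file does:

/* Sketch: attaching to sched_switch under the new (probe, data) convention. */
#include <linux/kernel.h>
#include <trace/events/sched.h>

static void
my_probe_sched_switch(void *data, struct task_struct *prev,
                      struct task_struct *next)
{
        /* 'data' is whatever was handed to register_trace_sched_switch() */
}

static int my_attach(void)
{
        int ret;

        ret = register_trace_sched_switch(my_probe_sched_switch, NULL);
        if (ret)
                pr_info("could not attach to the sched_switch tracepoint\n");
        return ret;
}

static void my_detach(void)
{
        /* unregistration must pass the same (probe, data) pair */
        unregister_trace_sched_switch(my_probe_sched_switch, NULL);
}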
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0271742abb8d..0e73bc2ef8c5 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -98,7 +98,8 @@ static int report_latency(cycle_t delta)
98 return 1; 98 return 1;
99} 99}
100 100
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) 101static void
102probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
102{ 103{
103 if (task != wakeup_task) 104 if (task != wakeup_task)
104 return; 105 return;
@@ -107,8 +108,8 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
107} 108}
108 109
109static void notrace 110static void notrace
110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 111probe_wakeup_sched_switch(void *ignore,
111 struct task_struct *next) 112 struct task_struct *prev, struct task_struct *next)
112{ 113{
113 struct trace_array_cpu *data; 114 struct trace_array_cpu *data;
114 cycle_t T0, T1, delta; 115 cycle_t T0, T1, delta;
@@ -200,7 +201,7 @@ static void wakeup_reset(struct trace_array *tr)
200} 201}
201 202
202static void 203static void
203probe_wakeup(struct rq *rq, struct task_struct *p, int success) 204probe_wakeup(void *ignore, struct task_struct *p, int success)
204{ 205{
205 struct trace_array_cpu *data; 206 struct trace_array_cpu *data;
206 int cpu = smp_processor_id(); 207 int cpu = smp_processor_id();
@@ -264,28 +265,28 @@ static void start_wakeup_tracer(struct trace_array *tr)
264{ 265{
265 int ret; 266 int ret;
266 267
267 ret = register_trace_sched_wakeup(probe_wakeup); 268 ret = register_trace_sched_wakeup(probe_wakeup, NULL);
268 if (ret) { 269 if (ret) {
269 pr_info("wakeup trace: Couldn't activate tracepoint" 270 pr_info("wakeup trace: Couldn't activate tracepoint"
270 " probe to kernel_sched_wakeup\n"); 271 " probe to kernel_sched_wakeup\n");
271 return; 272 return;
272 } 273 }
273 274
274 ret = register_trace_sched_wakeup_new(probe_wakeup); 275 ret = register_trace_sched_wakeup_new(probe_wakeup, NULL);
275 if (ret) { 276 if (ret) {
276 pr_info("wakeup trace: Couldn't activate tracepoint" 277 pr_info("wakeup trace: Couldn't activate tracepoint"
277 " probe to kernel_sched_wakeup_new\n"); 278 " probe to kernel_sched_wakeup_new\n");
278 goto fail_deprobe; 279 goto fail_deprobe;
279 } 280 }
280 281
281 ret = register_trace_sched_switch(probe_wakeup_sched_switch); 282 ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL);
282 if (ret) { 283 if (ret) {
283 pr_info("sched trace: Couldn't activate tracepoint" 284 pr_info("sched trace: Couldn't activate tracepoint"
284 " probe to kernel_sched_switch\n"); 285 " probe to kernel_sched_switch\n");
285 goto fail_deprobe_wake_new; 286 goto fail_deprobe_wake_new;
286 } 287 }
287 288
288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task); 289 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
289 if (ret) { 290 if (ret) {
290 pr_info("wakeup trace: Couldn't activate tracepoint" 291 pr_info("wakeup trace: Couldn't activate tracepoint"
291 " probe to kernel_sched_migrate_task\n"); 292 " probe to kernel_sched_migrate_task\n");
@@ -312,19 +313,19 @@ static void start_wakeup_tracer(struct trace_array *tr)
312 313
313 return; 314 return;
314fail_deprobe_wake_new: 315fail_deprobe_wake_new:
315 unregister_trace_sched_wakeup_new(probe_wakeup); 316 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
316fail_deprobe: 317fail_deprobe:
317 unregister_trace_sched_wakeup(probe_wakeup); 318 unregister_trace_sched_wakeup(probe_wakeup, NULL);
318} 319}
319 320
320static void stop_wakeup_tracer(struct trace_array *tr) 321static void stop_wakeup_tracer(struct trace_array *tr)
321{ 322{
322 tracer_enabled = 0; 323 tracer_enabled = 0;
323 unregister_ftrace_function(&trace_ops); 324 unregister_ftrace_function(&trace_ops);
324 unregister_trace_sched_switch(probe_wakeup_sched_switch); 325 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
325 unregister_trace_sched_wakeup_new(probe_wakeup); 326 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
326 unregister_trace_sched_wakeup(probe_wakeup); 327 unregister_trace_sched_wakeup(probe_wakeup, NULL);
327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task); 328 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
328} 329}
329 330
330static int __wakeup_tracer_init(struct trace_array *tr) 331static int __wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 280fea470d67..250e7f9bd2f0 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -3,6 +3,7 @@
3#include <linux/stringify.h> 3#include <linux/stringify.h>
4#include <linux/kthread.h> 4#include <linux/kthread.h>
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/slab.h>
6 7
7static inline int trace_valid_entry(struct trace_entry *entry) 8static inline int trace_valid_entry(struct trace_entry *entry)
8{ 9{
@@ -16,7 +17,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
16 case TRACE_BRANCH: 17 case TRACE_BRANCH:
17 case TRACE_GRAPH_ENT: 18 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 19 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES:
20 case TRACE_KSYM: 20 case TRACE_KSYM:
21 return 1; 21 return 1;
22 } 22 }
@@ -29,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
29 struct trace_entry *entry; 29 struct trace_entry *entry;
30 unsigned int loops = 0; 30 unsigned int loops = 0;
31 31
32 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { 32 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
33 entry = ring_buffer_event_data(event); 33 entry = ring_buffer_event_data(event);
34 34
35 /* 35 /*
@@ -255,7 +255,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
255/* Maximum number of functions to trace before diagnosing a hang */ 255/* Maximum number of functions to trace before diagnosing a hang */
256#define GRAPH_MAX_FUNC_TEST 100000000 256#define GRAPH_MAX_FUNC_TEST 100000000
257 257
258static void __ftrace_dump(bool disable_tracing); 258static void
259__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
259static unsigned int graph_hang_thresh; 260static unsigned int graph_hang_thresh;
260 261
261/* Wrap the real function entry probe to avoid possible hanging */ 262/* Wrap the real function entry probe to avoid possible hanging */
@@ -266,7 +267,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
266 ftrace_graph_stop(); 267 ftrace_graph_stop();
267 printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); 268 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
268 if (ftrace_dump_on_oops) 269 if (ftrace_dump_on_oops)
269 __ftrace_dump(false); 270 __ftrace_dump(false, DUMP_ALL);
270 return 0; 271 return 0;
271 } 272 }
272 273
@@ -754,62 +755,6 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
754} 755}
755#endif /* CONFIG_BRANCH_TRACER */ 756#endif /* CONFIG_BRANCH_TRACER */
756 757
757#ifdef CONFIG_HW_BRANCH_TRACER
758int
759trace_selftest_startup_hw_branches(struct tracer *trace,
760 struct trace_array *tr)
761{
762 struct trace_iterator *iter;
763 struct tracer tracer;
764 unsigned long count;
765 int ret;
766
767 if (!trace->open) {
768 printk(KERN_CONT "missing open function...");
769 return -1;
770 }
771
772 ret = tracer_init(trace, tr);
773 if (ret) {
774 warn_failed_init_tracer(trace, ret);
775 return ret;
776 }
777
778 /*
779 * The hw-branch tracer needs to collect the trace from the various
780 * cpu trace buffers - before tracing is stopped.
781 */
782 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
783 if (!iter)
784 return -ENOMEM;
785
786 memcpy(&tracer, trace, sizeof(tracer));
787
788 iter->trace = &tracer;
789 iter->tr = tr;
790 iter->pos = -1;
791 mutex_init(&iter->mutex);
792
793 trace->open(iter);
794
795 mutex_destroy(&iter->mutex);
796 kfree(iter);
797
798 tracing_stop();
799
800 ret = trace_test_buffer(tr, &count);
801 trace->reset(tr);
802 tracing_start();
803
804 if (!ret && !count) {
805 printk(KERN_CONT "no entries found..");
806 ret = -1;
807 }
808
809 return ret;
810}
811#endif /* CONFIG_HW_BRANCH_TRACER */
812
813#ifdef CONFIG_KSYM_TRACER 758#ifdef CONFIG_KSYM_TRACER
814static int ksym_selftest_dummy; 759static int ksym_selftest_dummy;
815 760
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 678a5120ee30..f4bc9b27de5f 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
157 unsigned long val, flags; 157 unsigned long val, flags;
158 char buf[64]; 158 char buf[64];
159 int ret; 159 int ret;
160 int cpu;
160 161
161 if (count >= sizeof(buf)) 162 if (count >= sizeof(buf))
162 return -EINVAL; 163 return -EINVAL;
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
171 return ret; 172 return ret;
172 173
173 local_irq_save(flags); 174 local_irq_save(flags);
175
176 /*
177 * In case we trace inside arch_spin_lock() or after (NMI),
178 * we will cause circular lock, so we also need to increase
179 * the percpu trace_active here.
180 */
181 cpu = smp_processor_id();
182 per_cpu(trace_active, cpu)++;
183
174 arch_spin_lock(&max_stack_lock); 184 arch_spin_lock(&max_stack_lock);
175 *ptr = val; 185 *ptr = val;
176 arch_spin_unlock(&max_stack_lock); 186 arch_spin_unlock(&max_stack_lock);
187
188 per_cpu(trace_active, cpu)--;
177 local_irq_restore(flags); 189 local_irq_restore(flags);
178 190
179 return count; 191 return count;
@@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
206 218
207static void *t_start(struct seq_file *m, loff_t *pos) 219static void *t_start(struct seq_file *m, loff_t *pos)
208{ 220{
221 int cpu;
222
209 local_irq_disable(); 223 local_irq_disable();
224
225 cpu = smp_processor_id();
226 per_cpu(trace_active, cpu)++;
227
210 arch_spin_lock(&max_stack_lock); 228 arch_spin_lock(&max_stack_lock);
211 229
212 if (*pos == 0) 230 if (*pos == 0)
@@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
217 235
218static void t_stop(struct seq_file *m, void *p) 236static void t_stop(struct seq_file *m, void *p)
219{ 237{
238 int cpu;
239
220 arch_spin_unlock(&max_stack_lock); 240 arch_spin_unlock(&max_stack_lock);
241
242 cpu = smp_processor_id();
243 per_cpu(trace_active, cpu)--;
244
221 local_irq_enable(); 245 local_irq_enable();
222} 246}
223 247
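
All three trace_stack.c hunks add the same guard: the per-cpu trace_active counter is bumped before max_stack_lock is taken, so a stack-trace callback that fires inside arch_spin_lock() (or from an NMI) sees the counter non-zero and returns instead of trying to take the lock again. Condensed into one illustrative helper (trace_active and max_stack_lock are the symbols this file uses; the function itself is a sketch):

/* Sketch: keeping the stack tracer out while we hold its own lock. */
static void set_max_under_lock(unsigned long *ptr, unsigned long val)
{
        unsigned long flags;
        int cpu;

        local_irq_save(flags);

        /*
         * Mark this CPU as already inside the stack tracer so that a
         * nested callback (from arch_spin_lock() or an NMI) bails out
         * early instead of deadlocking on max_stack_lock.
         */
        cpu = smp_processor_id();
        per_cpu(trace_active, cpu)++;

        arch_spin_lock(&max_stack_lock);
        *ptr = val;
        arch_spin_unlock(&max_stack_lock);

        per_cpu(trace_active, cpu)--;
        local_irq_restore(flags);
}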
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index a4bb239eb987..96cffb269e73 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -10,6 +10,7 @@
10 10
11 11
12#include <linux/list.h> 12#include <linux/list.h>
13#include <linux/slab.h>
13#include <linux/rbtree.h> 14#include <linux/rbtree.h>
14#include <linux/debugfs.h> 15#include <linux/debugfs.h>
15#include "trace_stat.h" 16#include "trace_stat.h"
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 75289f372dd2..34e35804304b 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,6 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/slab.h>
3#include <linux/kernel.h> 4#include <linux/kernel.h>
4#include <linux/ftrace.h> 5#include <linux/ftrace.h>
5#include <linux/perf_event.h> 6#include <linux/perf_event.h>
@@ -14,6 +15,54 @@ static int sys_refcount_exit;
14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 15static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
16 17
18static int syscall_enter_register(struct ftrace_event_call *event,
19 enum trace_reg type);
20static int syscall_exit_register(struct ftrace_event_call *event,
21 enum trace_reg type);
22
23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25
26static struct list_head *
27syscall_get_enter_fields(struct ftrace_event_call *call)
28{
29 struct syscall_metadata *entry = call->data;
30
31 return &entry->enter_fields;
32}
33
34static struct list_head *
35syscall_get_exit_fields(struct ftrace_event_call *call)
36{
37 struct syscall_metadata *entry = call->data;
38
39 return &entry->exit_fields;
40}
41
42struct trace_event_functions enter_syscall_print_funcs = {
43 .trace = print_syscall_enter,
44};
45
46struct trace_event_functions exit_syscall_print_funcs = {
47 .trace = print_syscall_exit,
48};
49
50struct ftrace_event_class event_class_syscall_enter = {
51 .system = "syscalls",
52 .reg = syscall_enter_register,
53 .define_fields = syscall_enter_define_fields,
54 .get_fields = syscall_get_enter_fields,
55 .raw_init = init_syscall_trace,
56};
57
58struct ftrace_event_class event_class_syscall_exit = {
59 .system = "syscalls",
60 .reg = syscall_exit_register,
61 .define_fields = syscall_exit_define_fields,
62 .get_fields = syscall_get_exit_fields,
63 .raw_init = init_syscall_trace,
64};
65
17extern unsigned long __start_syscalls_metadata[]; 66extern unsigned long __start_syscalls_metadata[];
18extern unsigned long __stop_syscalls_metadata[]; 67extern unsigned long __stop_syscalls_metadata[];
19 68
@@ -52,7 +101,8 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
52} 101}
53 102
54enum print_line_t 103enum print_line_t
55print_syscall_enter(struct trace_iterator *iter, int flags) 104print_syscall_enter(struct trace_iterator *iter, int flags,
105 struct trace_event *event)
56{ 106{
57 struct trace_seq *s = &iter->seq; 107 struct trace_seq *s = &iter->seq;
58 struct trace_entry *ent = iter->ent; 108 struct trace_entry *ent = iter->ent;
@@ -67,7 +117,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
67 if (!entry) 117 if (!entry)
68 goto end; 118 goto end;
69 119
70 if (entry->enter_event->id != ent->type) { 120 if (entry->enter_event->event.type != ent->type) {
71 WARN_ON_ONCE(1); 121 WARN_ON_ONCE(1);
72 goto end; 122 goto end;
73 } 123 }
@@ -104,7 +154,8 @@ end:
104} 154}
105 155
106enum print_line_t 156enum print_line_t
107print_syscall_exit(struct trace_iterator *iter, int flags) 157print_syscall_exit(struct trace_iterator *iter, int flags,
158 struct trace_event *event)
108{ 159{
109 struct trace_seq *s = &iter->seq; 160 struct trace_seq *s = &iter->seq;
110 struct trace_entry *ent = iter->ent; 161 struct trace_entry *ent = iter->ent;
@@ -122,7 +173,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
122 return TRACE_TYPE_HANDLED; 173 return TRACE_TYPE_HANDLED;
123 } 174 }
124 175
125 if (entry->exit_event->id != ent->type) { 176 if (entry->exit_event->event.type != ent->type) {
126 WARN_ON_ONCE(1); 177 WARN_ON_ONCE(1);
127 return TRACE_TYPE_UNHANDLED; 178 return TRACE_TYPE_UNHANDLED;
128 } 179 }
@@ -143,73 +194,68 @@ extern char *__bad_type_size(void);
143 #type, #name, offsetof(typeof(trace), name), \ 194 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type) 195 sizeof(trace.name), is_signed_type(type)
145 196
146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 197static
198int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
147{ 199{
148 int i; 200 int i;
149 int ret; 201 int pos = 0;
150 struct syscall_metadata *entry = call->data;
151 struct syscall_trace_enter trace;
152 int offset = offsetof(struct syscall_trace_enter, args);
153 202
154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 203 /* When len=0, we just calculate the needed length */
155 "\tsigned:%u;\n", 204#define LEN_OR_ZERO (len ? len - pos : 0)
156 SYSCALL_FIELD(int, nr));
157 if (!ret)
158 return 0;
159 205
206 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
160 for (i = 0; i < entry->nb_args; i++) { 207 for (i = 0; i < entry->nb_args; i++) {
161 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], 208 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
162 entry->args[i]); 209 entry->args[i], sizeof(unsigned long),
163 if (!ret) 210 i == entry->nb_args - 1 ? "" : ", ");
164 return 0;
165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
169 if (!ret)
170 return 0;
171 offset += sizeof(unsigned long);
172 } 211 }
212 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
173 213
174 trace_seq_puts(s, "\nprint fmt: \"");
175 for (i = 0; i < entry->nb_args; i++) { 214 for (i = 0; i < entry->nb_args; i++) {
176 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], 215 pos += snprintf(buf + pos, LEN_OR_ZERO,
177 sizeof(unsigned long), 216 ", ((unsigned long)(REC->%s))", entry->args[i]);
178 i == entry->nb_args - 1 ? "" : ", ");
179 if (!ret)
180 return 0;
181 } 217 }
182 trace_seq_putc(s, '"');
183 218
184 for (i = 0; i < entry->nb_args; i++) { 219#undef LEN_OR_ZERO
185 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
186 entry->args[i]);
187 if (!ret)
188 return 0;
189 }
190 220
191 return trace_seq_putc(s, '\n'); 221 /* return the length of print_fmt */
222 return pos;
192} 223}
193 224
194int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) 225static int set_syscall_print_fmt(struct ftrace_event_call *call)
195{ 226{
196 int ret; 227 char *print_fmt;
197 struct syscall_trace_exit trace; 228 int len;
229 struct syscall_metadata *entry = call->data;
198 230
199 ret = trace_seq_printf(s, 231 if (entry->enter_event != call) {
200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 232 call->print_fmt = "\"0x%lx\", REC->ret";
201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
204 SYSCALL_FIELD(int, nr),
205 SYSCALL_FIELD(long, ret));
206 if (!ret)
207 return 0; 233 return 0;
234 }
235
236 /* First: called with 0 length to calculate the needed length */
237 len = __set_enter_print_fmt(entry, NULL, 0);
238
239 print_fmt = kmalloc(len + 1, GFP_KERNEL);
240 if (!print_fmt)
241 return -ENOMEM;
208 242
209 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); 243 /* Second: actually write the @print_fmt */
244 __set_enter_print_fmt(entry, print_fmt, len + 1);
245 call->print_fmt = print_fmt;
246
247 return 0;
248}
249
250static void free_syscall_print_fmt(struct ftrace_event_call *call)
251{
252 struct syscall_metadata *entry = call->data;
253
254 if (entry->enter_event == call)
255 kfree(call->print_fmt);
210} 256}
211 257
212int syscall_enter_define_fields(struct ftrace_event_call *call) 258static int syscall_enter_define_fields(struct ftrace_event_call *call)
213{ 259{
214 struct syscall_trace_enter trace; 260 struct syscall_trace_enter trace;
215 struct syscall_metadata *meta = call->data; 261 struct syscall_metadata *meta = call->data;
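
The replacement of syscall_enter_format() by __set_enter_print_fmt()/set_syscall_print_fmt() in the hunk above is the standard two-pass snprintf idiom: run the formatter once with a zero length to measure, allocate exactly that much, then run it again to fill the buffer; the LEN_OR_ZERO macro is what turns the first pass into a pure measurement. The same idiom in a self-contained user-space sketch (the names and example arguments are illustrative):

/* Sketch of the measure-then-fill snprintf pattern used above. */
#include <stdio.h>
#include <stdlib.h>

static int build_fmt(const char **args, int nargs, char *buf, int len)
{
        int i, pos = 0;

/* On the measuring pass buf is NULL and len is 0. */
#define BUF_OR_NULL (len ? buf + pos : NULL)
#define LEN_OR_ZERO (len ? len - pos : 0)
        pos += snprintf(BUF_OR_NULL, LEN_OR_ZERO, "\"");
        for (i = 0; i < nargs; i++)
                pos += snprintf(BUF_OR_NULL, LEN_OR_ZERO, "%s: 0x%%08lx%s",
                                args[i], i == nargs - 1 ? "" : ", ");
        pos += snprintf(BUF_OR_NULL, LEN_OR_ZERO, "\"");
#undef BUF_OR_NULL
#undef LEN_OR_ZERO

        return pos;                     /* length needed, without the NUL */
}

int main(void)
{
        const char *args[] = { "fd", "buf", "count" };
        int len = build_fmt(args, 3, NULL, 0);  /* pass 1: measure */
        char *fmt = malloc(len + 1);

        if (!fmt)
                return 1;
        build_fmt(args, 3, fmt, len + 1);       /* pass 2: fill */
        printf("%s\n", fmt);  /* "fd: 0x%08lx, buf: 0x%08lx, count: 0x%08lx" */
        free(fmt);
        return 0;
}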
@@ -232,7 +278,7 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
232 return ret; 278 return ret;
233} 279}
234 280
235int syscall_exit_define_fields(struct ftrace_event_call *call) 281static int syscall_exit_define_fields(struct ftrace_event_call *call)
236{ 282{
237 struct syscall_trace_exit trace; 283 struct syscall_trace_exit trace;
238 int ret; 284 int ret;
@@ -247,7 +293,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
247 return ret; 293 return ret;
248} 294}
249 295
250void ftrace_syscall_enter(struct pt_regs *regs, long id) 296void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
251{ 297{
252 struct syscall_trace_enter *entry; 298 struct syscall_trace_enter *entry;
253 struct syscall_metadata *sys_data; 299 struct syscall_metadata *sys_data;
@@ -269,7 +315,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
269 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 315 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
270 316
271 event = trace_current_buffer_lock_reserve(&buffer, 317 event = trace_current_buffer_lock_reserve(&buffer,
272 sys_data->enter_event->id, size, 0, 0); 318 sys_data->enter_event->event.type, size, 0, 0);
273 if (!event) 319 if (!event)
274 return; 320 return;
275 321
@@ -282,7 +328,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
282 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 328 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
283} 329}
284 330
285void ftrace_syscall_exit(struct pt_regs *regs, long ret) 331void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
286{ 332{
287 struct syscall_trace_exit *entry; 333 struct syscall_trace_exit *entry;
288 struct syscall_metadata *sys_data; 334 struct syscall_metadata *sys_data;
@@ -301,7 +347,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
301 return; 347 return;
302 348
303 event = trace_current_buffer_lock_reserve(&buffer, 349 event = trace_current_buffer_lock_reserve(&buffer,
304 sys_data->exit_event->id, sizeof(*entry), 0, 0); 350 sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
305 if (!event) 351 if (!event)
306 return; 352 return;
307 353
@@ -324,7 +370,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
324 return -ENOSYS; 370 return -ENOSYS;
325 mutex_lock(&syscall_trace_lock); 371 mutex_lock(&syscall_trace_lock);
326 if (!sys_refcount_enter) 372 if (!sys_refcount_enter)
327 ret = register_trace_sys_enter(ftrace_syscall_enter); 373 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
328 if (!ret) { 374 if (!ret) {
329 set_bit(num, enabled_enter_syscalls); 375 set_bit(num, enabled_enter_syscalls);
330 sys_refcount_enter++; 376 sys_refcount_enter++;
@@ -344,7 +390,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
344 sys_refcount_enter--; 390 sys_refcount_enter--;
345 clear_bit(num, enabled_enter_syscalls); 391 clear_bit(num, enabled_enter_syscalls);
346 if (!sys_refcount_enter) 392 if (!sys_refcount_enter)
347 unregister_trace_sys_enter(ftrace_syscall_enter); 393 unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
348 mutex_unlock(&syscall_trace_lock); 394 mutex_unlock(&syscall_trace_lock);
349} 395}
350 396
@@ -358,7 +404,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
358 return -ENOSYS; 404 return -ENOSYS;
359 mutex_lock(&syscall_trace_lock); 405 mutex_lock(&syscall_trace_lock);
360 if (!sys_refcount_exit) 406 if (!sys_refcount_exit)
361 ret = register_trace_sys_exit(ftrace_syscall_exit); 407 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
362 if (!ret) { 408 if (!ret) {
363 set_bit(num, enabled_exit_syscalls); 409 set_bit(num, enabled_exit_syscalls);
364 sys_refcount_exit++; 410 sys_refcount_exit++;
@@ -378,7 +424,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
378 sys_refcount_exit--; 424 sys_refcount_exit--;
379 clear_bit(num, enabled_exit_syscalls); 425 clear_bit(num, enabled_exit_syscalls);
380 if (!sys_refcount_exit) 426 if (!sys_refcount_exit)
381 unregister_trace_sys_exit(ftrace_syscall_exit); 427 unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
382 mutex_unlock(&syscall_trace_lock); 428 mutex_unlock(&syscall_trace_lock);
383} 429}
384 430
@@ -386,12 +432,22 @@ int init_syscall_trace(struct ftrace_event_call *call)
386{ 432{
387 int id; 433 int id;
388 434
389 id = register_ftrace_event(call->event); 435 if (set_syscall_print_fmt(call) < 0)
390 if (!id) 436 return -ENOMEM;
391 return -ENODEV; 437
392 call->id = id; 438 id = trace_event_raw_init(call);
393 INIT_LIST_HEAD(&call->fields); 439
394 return 0; 440 if (id < 0) {
441 free_syscall_print_fmt(call);
442 return id;
443 }
444
445 return id;
446}
447
448unsigned long __init arch_syscall_addr(int nr)
449{
450 return (unsigned long)sys_call_table[nr];
395} 451}
396 452
397int __init init_ftrace_syscalls(void) 453int __init init_ftrace_syscalls(void)
@@ -421,27 +477,24 @@ int __init init_ftrace_syscalls(void)
421} 477}
422core_initcall(init_ftrace_syscalls); 478core_initcall(init_ftrace_syscalls);
423 479
424#ifdef CONFIG_EVENT_PROFILE 480#ifdef CONFIG_PERF_EVENTS
425 481
426static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); 482static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
427static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); 483static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
428static int sys_prof_refcount_enter; 484static int sys_perf_refcount_enter;
429static int sys_prof_refcount_exit; 485static int sys_perf_refcount_exit;
430 486
431static void prof_syscall_enter(struct pt_regs *regs, long id) 487static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
432{ 488{
433 struct syscall_metadata *sys_data; 489 struct syscall_metadata *sys_data;
434 struct syscall_trace_enter *rec; 490 struct syscall_trace_enter *rec;
435 unsigned long flags; 491 struct hlist_head *head;
436 char *trace_buf;
437 char *raw_data;
438 int syscall_nr; 492 int syscall_nr;
439 int rctx; 493 int rctx;
440 int size; 494 int size;
441 int cpu;
442 495
443 syscall_nr = syscall_get_nr(current, regs); 496 syscall_nr = syscall_get_nr(current, regs);
444 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 497 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
445 return; 498 return;
446 499
447 sys_data = syscall_nr_to_meta(syscall_nr); 500 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -453,44 +506,24 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
453 size = ALIGN(size + sizeof(u32), sizeof(u64)); 506 size = ALIGN(size + sizeof(u32), sizeof(u64));
454 size -= sizeof(u32); 507 size -= sizeof(u32);
455 508
456 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 509 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
457 "profile buffer not large enough")) 510 "perf buffer not large enough"))
458 return; 511 return;
459 512
460 /* Protect the per cpu buffer, begin the rcu read side */ 513 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
461 local_irq_save(flags); 514 sys_data->enter_event->event.type, regs, &rctx);
462 515 if (!rec)
463 rctx = perf_swevent_get_recursion_context(); 516 return;
464 if (rctx < 0)
465 goto end_recursion;
466
467 cpu = smp_processor_id();
468
469 trace_buf = rcu_dereference(perf_trace_buf);
470
471 if (!trace_buf)
472 goto end;
473
474 raw_data = per_cpu_ptr(trace_buf, cpu);
475
476 /* zero the dead bytes from align to not leak stack to user */
477 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
478 517
479 rec = (struct syscall_trace_enter *) raw_data;
480 tracing_generic_entry_update(&rec->ent, 0, 0);
481 rec->ent.type = sys_data->enter_event->id;
482 rec->nr = syscall_nr; 518 rec->nr = syscall_nr;
483 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 519 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
484 (unsigned long *)&rec->args); 520 (unsigned long *)&rec->args);
485 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
486 521
487end: 522 head = this_cpu_ptr(sys_data->enter_event->perf_events);
488 perf_swevent_put_recursion_context(rctx); 523 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
489end_recursion:
490 local_irq_restore(flags);
491} 524}
492 525
493int prof_sysenter_enable(struct ftrace_event_call *call) 526int perf_sysenter_enable(struct ftrace_event_call *call)
494{ 527{
495 int ret = 0; 528 int ret = 0;
496 int num; 529 int num;
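
The perf_syscall_enter()/perf_syscall_exit() rewrites in the hunk above drop the hand-rolled recursion-context and per-cpu buffer handling in favour of the perf_trace_buf_prepare()/perf_trace_buf_submit() pair, with the destination hlist of perf events now hanging off the event itself. A compressed sketch of the resulting probe shape, using only the call signatures visible in this diff; the record layout, my_event_type and my_event_call are purely illustrative placeholders:

/* Sketch: a perf tracepoint probe built on the prepare/submit helpers. */
static void my_perf_probe(void *ignore, struct pt_regs *regs, long id)
{
        struct my_trace_record *rec;    /* starts with struct trace_entry */
        struct hlist_head *head;
        int rctx;
        int size;

        /* perf prepends a u32 header; keep the total u64-aligned */
        size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)) - sizeof(u32);

        rec = (struct my_trace_record *)perf_trace_buf_prepare(size,
                        my_event_type, regs, &rctx);
        if (!rec)
                return;

        /* fill in the payload fields of 'rec' here */

        head = this_cpu_ptr(my_event_call->perf_events);
        perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}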
@@ -498,47 +531,44 @@ int prof_sysenter_enable(struct ftrace_event_call *call)
498 num = ((struct syscall_metadata *)call->data)->syscall_nr; 531 num = ((struct syscall_metadata *)call->data)->syscall_nr;
499 532
500 mutex_lock(&syscall_trace_lock); 533 mutex_lock(&syscall_trace_lock);
501 if (!sys_prof_refcount_enter) 534 if (!sys_perf_refcount_enter)
502 ret = register_trace_sys_enter(prof_syscall_enter); 535 ret = register_trace_sys_enter(perf_syscall_enter, NULL);
503 if (ret) { 536 if (ret) {
504 pr_info("event trace: Could not activate" 537 pr_info("event trace: Could not activate"
505 "syscall entry trace point"); 538 "syscall entry trace point");
506 } else { 539 } else {
507 set_bit(num, enabled_prof_enter_syscalls); 540 set_bit(num, enabled_perf_enter_syscalls);
508 sys_prof_refcount_enter++; 541 sys_perf_refcount_enter++;
509 } 542 }
510 mutex_unlock(&syscall_trace_lock); 543 mutex_unlock(&syscall_trace_lock);
511 return ret; 544 return ret;
512} 545}
513 546
514void prof_sysenter_disable(struct ftrace_event_call *call) 547void perf_sysenter_disable(struct ftrace_event_call *call)
515{ 548{
516 int num; 549 int num;
517 550
518 num = ((struct syscall_metadata *)call->data)->syscall_nr; 551 num = ((struct syscall_metadata *)call->data)->syscall_nr;
519 552
520 mutex_lock(&syscall_trace_lock); 553 mutex_lock(&syscall_trace_lock);
521 sys_prof_refcount_enter--; 554 sys_perf_refcount_enter--;
522 clear_bit(num, enabled_prof_enter_syscalls); 555 clear_bit(num, enabled_perf_enter_syscalls);
523 if (!sys_prof_refcount_enter) 556 if (!sys_perf_refcount_enter)
524 unregister_trace_sys_enter(prof_syscall_enter); 557 unregister_trace_sys_enter(perf_syscall_enter, NULL);
525 mutex_unlock(&syscall_trace_lock); 558 mutex_unlock(&syscall_trace_lock);
526} 559}
527 560
528static void prof_syscall_exit(struct pt_regs *regs, long ret) 561static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
529{ 562{
530 struct syscall_metadata *sys_data; 563 struct syscall_metadata *sys_data;
531 struct syscall_trace_exit *rec; 564 struct syscall_trace_exit *rec;
532 unsigned long flags; 565 struct hlist_head *head;
533 int syscall_nr; 566 int syscall_nr;
534 char *trace_buf;
535 char *raw_data;
536 int rctx; 567 int rctx;
537 int size; 568 int size;
538 int cpu;
539 569
540 syscall_nr = syscall_get_nr(current, regs); 570 syscall_nr = syscall_get_nr(current, regs);
541 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 571 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
542 return; 572 return;
543 573
544 sys_data = syscall_nr_to_meta(syscall_nr); 574 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -553,45 +583,23 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
553 * Impossible, but be paranoid with the future 583 * Impossible, but be paranoid with the future
554 * How to put this check outside runtime? 584 * How to put this check outside runtime?
555 */ 585 */
556 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 586 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
557 "exit event has grown above profile buffer size")) 587 "exit event has grown above perf buffer size"))
558 return; 588 return;
559 589
560 /* Protect the per cpu buffer, begin the rcu read side */ 590 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
561 local_irq_save(flags); 591 sys_data->exit_event->event.type, regs, &rctx);
562 592 if (!rec)
563 rctx = perf_swevent_get_recursion_context(); 593 return;
564 if (rctx < 0)
565 goto end_recursion;
566
567 cpu = smp_processor_id();
568
569 trace_buf = rcu_dereference(perf_trace_buf);
570
571 if (!trace_buf)
572 goto end;
573
574 raw_data = per_cpu_ptr(trace_buf, cpu);
575
576 /* zero the dead bytes from align to not leak stack to user */
577 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
578
579 rec = (struct syscall_trace_exit *)raw_data;
580 594
581 tracing_generic_entry_update(&rec->ent, 0, 0);
582 rec->ent.type = sys_data->exit_event->id;
583 rec->nr = syscall_nr; 595 rec->nr = syscall_nr;
584 rec->ret = syscall_get_return_value(current, regs); 596 rec->ret = syscall_get_return_value(current, regs);
585 597
586 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); 598 head = this_cpu_ptr(sys_data->exit_event->perf_events);
587 599 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
588end:
589 perf_swevent_put_recursion_context(rctx);
590end_recursion:
591 local_irq_restore(flags);
592} 600}
593 601
594int prof_sysexit_enable(struct ftrace_event_call *call) 602int perf_sysexit_enable(struct ftrace_event_call *call)
595{ 603{
596 int ret = 0; 604 int ret = 0;
597 int num; 605 int num;
@@ -599,33 +607,73 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
599 num = ((struct syscall_metadata *)call->data)->syscall_nr; 607 num = ((struct syscall_metadata *)call->data)->syscall_nr;
600 608
601 mutex_lock(&syscall_trace_lock); 609 mutex_lock(&syscall_trace_lock);
602 if (!sys_prof_refcount_exit) 610 if (!sys_perf_refcount_exit)
603 ret = register_trace_sys_exit(prof_syscall_exit); 611 ret = register_trace_sys_exit(perf_syscall_exit, NULL);
604 if (ret) { 612 if (ret) {
605 pr_info("event trace: Could not activate" 613 pr_info("event trace: Could not activate"
606 "syscall entry trace point"); 614 "syscall exit trace point");
607 } else { 615 } else {
608 set_bit(num, enabled_prof_exit_syscalls); 616 set_bit(num, enabled_perf_exit_syscalls);
609 sys_prof_refcount_exit++; 617 sys_perf_refcount_exit++;
610 } 618 }
611 mutex_unlock(&syscall_trace_lock); 619 mutex_unlock(&syscall_trace_lock);
612 return ret; 620 return ret;
613} 621}
614 622
615void prof_sysexit_disable(struct ftrace_event_call *call) 623void perf_sysexit_disable(struct ftrace_event_call *call)
616{ 624{
617 int num; 625 int num;
618 626
619 num = ((struct syscall_metadata *)call->data)->syscall_nr; 627 num = ((struct syscall_metadata *)call->data)->syscall_nr;
620 628
621 mutex_lock(&syscall_trace_lock); 629 mutex_lock(&syscall_trace_lock);
622 sys_prof_refcount_exit--; 630 sys_perf_refcount_exit--;
623 clear_bit(num, enabled_prof_exit_syscalls); 631 clear_bit(num, enabled_perf_exit_syscalls);
624 if (!sys_prof_refcount_exit) 632 if (!sys_perf_refcount_exit)
625 unregister_trace_sys_exit(prof_syscall_exit); 633 unregister_trace_sys_exit(perf_syscall_exit, NULL);
626 mutex_unlock(&syscall_trace_lock); 634 mutex_unlock(&syscall_trace_lock);
627} 635}
628 636
637#endif /* CONFIG_PERF_EVENTS */
638
639static int syscall_enter_register(struct ftrace_event_call *event,
640 enum trace_reg type)
641{
642 switch (type) {
643 case TRACE_REG_REGISTER:
644 return reg_event_syscall_enter(event);
645 case TRACE_REG_UNREGISTER:
646 unreg_event_syscall_enter(event);
647 return 0;
648
649#ifdef CONFIG_PERF_EVENTS
650 case TRACE_REG_PERF_REGISTER:
651 return perf_sysenter_enable(event);
652 case TRACE_REG_PERF_UNREGISTER:
653 perf_sysenter_disable(event);
654 return 0;
629#endif 655#endif
656 }
657 return 0;
658}
630 659
660static int syscall_exit_register(struct ftrace_event_call *event,
661 enum trace_reg type)
662{
663 switch (type) {
664 case TRACE_REG_REGISTER:
665 return reg_event_syscall_exit(event);
666 case TRACE_REG_UNREGISTER:
667 unreg_event_syscall_exit(event);
668 return 0;
631 669
670#ifdef CONFIG_PERF_EVENTS
671 case TRACE_REG_PERF_REGISTER:
672 return perf_sysexit_enable(event);
673 case TRACE_REG_PERF_UNREGISTER:
674 perf_sysexit_disable(event);
675 return 0;
676#endif
677 }
678 return 0;
679}
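
The block appended to trace_syscalls.c is the other half of the event-layer rework seen throughout this diff: each event class now supplies a single ->reg() callback that switches on enum trace_reg, so ftrace and perf registration go through one entry point instead of separate profile hooks. A compressed sketch of that dispatch shape (ftrace_event_call, ftrace_event_class and the TRACE_REG_* values are the ones used here; the my_* helpers are hypothetical):

/* Sketch: one ->reg() callback multiplexing tracing and perf registration. */
static int my_event_register(struct ftrace_event_call *event,
                             enum trace_reg type)
{
        switch (type) {
        case TRACE_REG_REGISTER:
                return my_tracing_enable(event);
        case TRACE_REG_UNREGISTER:
                my_tracing_disable(event);
                return 0;
#ifdef CONFIG_PERF_EVENTS
        case TRACE_REG_PERF_REGISTER:
                return my_perf_enable(event);
        case TRACE_REG_PERF_UNREGISTER:
                my_perf_disable(event);
                return 0;
#endif
        }
        return 0;
}

static struct ftrace_event_class my_event_class = {
        .system = "my_subsys",          /* directory under events/ */
        .reg    = my_event_register,    /* single registration hook */
};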
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 40cafb07dffd..a7cc3793baf6 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
9#include <trace/events/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/slab.h>
12#include <linux/kref.h> 13#include <linux/kref.h>
13#include "trace_stat.h" 14#include "trace_stat.h"
14#include "trace.h" 15#include "trace.h"
@@ -48,7 +49,8 @@ static void cpu_workqueue_stat_free(struct kref *kref)
48 49
49/* Insertion of a work */ 50/* Insertion of a work */
50static void 51static void
51probe_workqueue_insertion(struct task_struct *wq_thread, 52probe_workqueue_insertion(void *ignore,
53 struct task_struct *wq_thread,
52 struct work_struct *work) 54 struct work_struct *work)
53{ 55{
54 int cpu = cpumask_first(&wq_thread->cpus_allowed); 56 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -69,7 +71,8 @@ found:
69 71
70/* Execution of a work */ 72/* Execution of a work */
71static void 73static void
72probe_workqueue_execution(struct task_struct *wq_thread, 74probe_workqueue_execution(void *ignore,
75 struct task_struct *wq_thread,
73 struct work_struct *work) 76 struct work_struct *work)
74{ 77{
75 int cpu = cpumask_first(&wq_thread->cpus_allowed); 78 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -89,7 +92,8 @@ found:
89} 92}
90 93
91/* Creation of a cpu workqueue thread */ 94/* Creation of a cpu workqueue thread */
92static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) 95static void probe_workqueue_creation(void *ignore,
96 struct task_struct *wq_thread, int cpu)
93{ 97{
94 struct cpu_workqueue_stats *cws; 98 struct cpu_workqueue_stats *cws;
95 unsigned long flags; 99 unsigned long flags;
@@ -113,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
113} 117}
114 118
115/* Destruction of a cpu workqueue thread */ 119/* Destruction of a cpu workqueue thread */
116static void probe_workqueue_destruction(struct task_struct *wq_thread) 120static void
121probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
117{ 122{
118 /* Workqueue only execute on one cpu */ 123 /* Workqueue only execute on one cpu */
119 int cpu = cpumask_first(&wq_thread->cpus_allowed); 124 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -258,19 +263,19 @@ int __init trace_workqueue_early_init(void)
258{ 263{
259 int ret, cpu; 264 int ret, cpu;
260 265
261 ret = register_trace_workqueue_insertion(probe_workqueue_insertion); 266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
262 if (ret) 267 if (ret)
263 goto out; 268 goto out;
264 269
265 ret = register_trace_workqueue_execution(probe_workqueue_execution); 270 ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
266 if (ret) 271 if (ret)
267 goto no_insertion; 272 goto no_insertion;
268 273
269 ret = register_trace_workqueue_creation(probe_workqueue_creation); 274 ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
270 if (ret) 275 if (ret)
271 goto no_execution; 276 goto no_execution;
272 277
273 ret = register_trace_workqueue_destruction(probe_workqueue_destruction); 278 ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
274 if (ret) 279 if (ret)
275 goto no_creation; 280 goto no_creation;
276 281
@@ -282,11 +287,11 @@ int __init trace_workqueue_early_init(void)
282 return 0; 287 return 0;
283 288
284no_creation: 289no_creation:
285 unregister_trace_workqueue_creation(probe_workqueue_creation); 290 unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
286no_execution: 291no_execution:
287 unregister_trace_workqueue_execution(probe_workqueue_execution); 292 unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
288no_insertion: 293no_insertion:
289 unregister_trace_workqueue_insertion(probe_workqueue_insertion); 294 unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
290out: 295out:
291 pr_warning("trace_workqueue: unable to trace workqueues\n"); 296 pr_warning("trace_workqueue: unable to trace workqueues\n");
292 297
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index cc89be5bc0f8..c77f3eceea25 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -54,7 +54,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
54 */ 54 */
55struct tracepoint_entry { 55struct tracepoint_entry {
56 struct hlist_node hlist; 56 struct hlist_node hlist;
57 void **funcs; 57 struct tracepoint_func *funcs;
58 int refcount; /* Number of times armed. 0 if disarmed. */ 58 int refcount; /* Number of times armed. 0 if disarmed. */
59 char name[0]; 59 char name[0];
60}; 60};
@@ -64,12 +64,12 @@ struct tp_probes {
64 struct rcu_head rcu; 64 struct rcu_head rcu;
65 struct list_head list; 65 struct list_head list;
66 } u; 66 } u;
67 void *probes[0]; 67 struct tracepoint_func probes[0];
68}; 68};
69 69
70static inline void *allocate_probes(int count) 70static inline void *allocate_probes(int count)
71{ 71{
72 struct tp_probes *p = kmalloc(count * sizeof(void *) 72 struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func)
73 + sizeof(struct tp_probes), GFP_KERNEL); 73 + sizeof(struct tp_probes), GFP_KERNEL);
74 return p == NULL ? NULL : p->probes; 74 return p == NULL ? NULL : p->probes;
75} 75}
@@ -79,7 +79,7 @@ static void rcu_free_old_probes(struct rcu_head *head)
79 kfree(container_of(head, struct tp_probes, u.rcu)); 79 kfree(container_of(head, struct tp_probes, u.rcu));
80} 80}
81 81
82static inline void release_probes(void *old) 82static inline void release_probes(struct tracepoint_func *old)
83{ 83{
84 if (old) { 84 if (old) {
85 struct tp_probes *tp_probes = container_of(old, 85 struct tp_probes *tp_probes = container_of(old,
@@ -95,15 +95,16 @@ static void debug_print_probes(struct tracepoint_entry *entry)
95 if (!tracepoint_debug || !entry->funcs) 95 if (!tracepoint_debug || !entry->funcs)
96 return; 96 return;
97 97
98 for (i = 0; entry->funcs[i]; i++) 98 for (i = 0; entry->funcs[i].func; i++)
99 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]); 99 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func);
100} 100}
101 101
102static void * 102static struct tracepoint_func *
103tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) 103tracepoint_entry_add_probe(struct tracepoint_entry *entry,
104 void *probe, void *data)
104{ 105{
105 int nr_probes = 0; 106 int nr_probes = 0;
106 void **old, **new; 107 struct tracepoint_func *old, *new;
107 108
108 WARN_ON(!probe); 109 WARN_ON(!probe);
109 110
@@ -111,8 +112,9 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
111 old = entry->funcs; 112 old = entry->funcs;
112 if (old) { 113 if (old) {
113 /* (N -> N+1), (N != 0, 1) probes */ 114 /* (N -> N+1), (N != 0, 1) probes */
114 for (nr_probes = 0; old[nr_probes]; nr_probes++) 115 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
115 if (old[nr_probes] == probe) 116 if (old[nr_probes].func == probe &&
117 old[nr_probes].data == data)
116 return ERR_PTR(-EEXIST); 118 return ERR_PTR(-EEXIST);
117 } 119 }
118 /* + 2 : one for new probe, one for NULL func */ 120 /* + 2 : one for new probe, one for NULL func */
@@ -120,9 +122,10 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
120 if (new == NULL) 122 if (new == NULL)
121 return ERR_PTR(-ENOMEM); 123 return ERR_PTR(-ENOMEM);
122 if (old) 124 if (old)
123 memcpy(new, old, nr_probes * sizeof(void *)); 125 memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
124 new[nr_probes] = probe; 126 new[nr_probes].func = probe;
125 new[nr_probes + 1] = NULL; 127 new[nr_probes].data = data;
128 new[nr_probes + 1].func = NULL;
126 entry->refcount = nr_probes + 1; 129 entry->refcount = nr_probes + 1;
127 entry->funcs = new; 130 entry->funcs = new;
128 debug_print_probes(entry); 131 debug_print_probes(entry);
@@ -130,10 +133,11 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
130} 133}
131 134
132static void * 135static void *
133tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) 136tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
137 void *probe, void *data)
134{ 138{
135 int nr_probes = 0, nr_del = 0, i; 139 int nr_probes = 0, nr_del = 0, i;
136 void **old, **new; 140 struct tracepoint_func *old, *new;
137 141
138 old = entry->funcs; 142 old = entry->funcs;
139 143
@@ -142,8 +146,10 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
142 146
143 debug_print_probes(entry); 147 debug_print_probes(entry);
144 /* (N -> M), (N > 1, M >= 0) probes */ 148 /* (N -> M), (N > 1, M >= 0) probes */
145 for (nr_probes = 0; old[nr_probes]; nr_probes++) { 149 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
146 if ((!probe || old[nr_probes] == probe)) 150 if (!probe ||
151 (old[nr_probes].func == probe &&
152 old[nr_probes].data == data))
147 nr_del++; 153 nr_del++;
148 } 154 }
149 155
@@ -160,10 +166,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
160 new = allocate_probes(nr_probes - nr_del + 1); 166 new = allocate_probes(nr_probes - nr_del + 1);
161 if (new == NULL) 167 if (new == NULL)
162 return ERR_PTR(-ENOMEM); 168 return ERR_PTR(-ENOMEM);
163 for (i = 0; old[i]; i++) 169 for (i = 0; old[i].func; i++)
164 if ((probe && old[i] != probe)) 170 if (probe &&
171 (old[i].func != probe || old[i].data != data))
165 new[j++] = old[i]; 172 new[j++] = old[i];
166 new[nr_probes - nr_del] = NULL; 173 new[nr_probes - nr_del].func = NULL;
167 entry->refcount = nr_probes - nr_del; 174 entry->refcount = nr_probes - nr_del;
168 entry->funcs = new; 175 entry->funcs = new;
169 } 176 }
@@ -315,18 +322,19 @@ static void tracepoint_update_probes(void)
315 module_update_tracepoints(); 322 module_update_tracepoints();
316} 323}
317 324
318static void *tracepoint_add_probe(const char *name, void *probe) 325static struct tracepoint_func *
326tracepoint_add_probe(const char *name, void *probe, void *data)
319{ 327{
320 struct tracepoint_entry *entry; 328 struct tracepoint_entry *entry;
321 void *old; 329 struct tracepoint_func *old;
322 330
323 entry = get_tracepoint(name); 331 entry = get_tracepoint(name);
324 if (!entry) { 332 if (!entry) {
325 entry = add_tracepoint(name); 333 entry = add_tracepoint(name);
326 if (IS_ERR(entry)) 334 if (IS_ERR(entry))
327 return entry; 335 return (struct tracepoint_func *)entry;
328 } 336 }
329 old = tracepoint_entry_add_probe(entry, probe); 337 old = tracepoint_entry_add_probe(entry, probe, data);
330 if (IS_ERR(old) && !entry->refcount) 338 if (IS_ERR(old) && !entry->refcount)
331 remove_tracepoint(entry); 339 remove_tracepoint(entry);
332 return old; 340 return old;
@@ -340,12 +348,12 @@ static void *tracepoint_add_probe(const char *name, void *probe)
340 * Returns 0 if ok, error value on error. 348 * Returns 0 if ok, error value on error.
341 * The probe address must at least be aligned on the architecture pointer size. 349 * The probe address must at least be aligned on the architecture pointer size.
342 */ 350 */
343int tracepoint_probe_register(const char *name, void *probe) 351int tracepoint_probe_register(const char *name, void *probe, void *data)
344{ 352{
345 void *old; 353 struct tracepoint_func *old;
346 354
347 mutex_lock(&tracepoints_mutex); 355 mutex_lock(&tracepoints_mutex);
348 old = tracepoint_add_probe(name, probe); 356 old = tracepoint_add_probe(name, probe, data);
349 mutex_unlock(&tracepoints_mutex); 357 mutex_unlock(&tracepoints_mutex);
350 if (IS_ERR(old)) 358 if (IS_ERR(old))
351 return PTR_ERR(old); 359 return PTR_ERR(old);
@@ -356,15 +364,16 @@ int tracepoint_probe_register(const char *name, void *probe)
356} 364}
357EXPORT_SYMBOL_GPL(tracepoint_probe_register); 365EXPORT_SYMBOL_GPL(tracepoint_probe_register);
358 366
359static void *tracepoint_remove_probe(const char *name, void *probe) 367static struct tracepoint_func *
368tracepoint_remove_probe(const char *name, void *probe, void *data)
360{ 369{
361 struct tracepoint_entry *entry; 370 struct tracepoint_entry *entry;
362 void *old; 371 struct tracepoint_func *old;
363 372
364 entry = get_tracepoint(name); 373 entry = get_tracepoint(name);
365 if (!entry) 374 if (!entry)
366 return ERR_PTR(-ENOENT); 375 return ERR_PTR(-ENOENT);
367 old = tracepoint_entry_remove_probe(entry, probe); 376 old = tracepoint_entry_remove_probe(entry, probe, data);
368 if (IS_ERR(old)) 377 if (IS_ERR(old))
369 return old; 378 return old;
370 if (!entry->refcount) 379 if (!entry->refcount)
@@ -382,12 +391,12 @@ static void *tracepoint_remove_probe(const char *name, void *probe)
382 * itself uses stop_machine(), which insures that every preempt disabled section 391 * itself uses stop_machine(), which insures that every preempt disabled section
383 * have finished. 392 * have finished.
384 */ 393 */
385int tracepoint_probe_unregister(const char *name, void *probe) 394int tracepoint_probe_unregister(const char *name, void *probe, void *data)
386{ 395{
387 void *old; 396 struct tracepoint_func *old;
388 397
389 mutex_lock(&tracepoints_mutex); 398 mutex_lock(&tracepoints_mutex);
390 old = tracepoint_remove_probe(name, probe); 399 old = tracepoint_remove_probe(name, probe, data);
391 mutex_unlock(&tracepoints_mutex); 400 mutex_unlock(&tracepoints_mutex);
392 if (IS_ERR(old)) 401 if (IS_ERR(old))
393 return PTR_ERR(old); 402 return PTR_ERR(old);
@@ -418,12 +427,13 @@ static void tracepoint_add_old_probes(void *old)
418 * 427 *
419 * caller must call tracepoint_probe_update_all() 428 * caller must call tracepoint_probe_update_all()
420 */ 429 */
421int tracepoint_probe_register_noupdate(const char *name, void *probe) 430int tracepoint_probe_register_noupdate(const char *name, void *probe,
431 void *data)
422{ 432{
423 void *old; 433 struct tracepoint_func *old;
424 434
425 mutex_lock(&tracepoints_mutex); 435 mutex_lock(&tracepoints_mutex);
426 old = tracepoint_add_probe(name, probe); 436 old = tracepoint_add_probe(name, probe, data);
427 if (IS_ERR(old)) { 437 if (IS_ERR(old)) {
428 mutex_unlock(&tracepoints_mutex); 438 mutex_unlock(&tracepoints_mutex);
429 return PTR_ERR(old); 439 return PTR_ERR(old);
@@ -441,12 +451,13 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
441 * 451 *
442 * caller must call tracepoint_probe_update_all() 452 * caller must call tracepoint_probe_update_all()
443 */ 453 */
444int tracepoint_probe_unregister_noupdate(const char *name, void *probe) 454int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
455 void *data)
445{ 456{
446 void *old; 457 struct tracepoint_func *old;
447 458
448 mutex_lock(&tracepoints_mutex); 459 mutex_lock(&tracepoints_mutex);
449 old = tracepoint_remove_probe(name, probe); 460 old = tracepoint_remove_probe(name, probe, data);
450 if (IS_ERR(old)) { 461 if (IS_ERR(old)) {
451 mutex_unlock(&tracepoints_mutex); 462 mutex_unlock(&tracepoints_mutex);
452 return PTR_ERR(old); 463 return PTR_ERR(old);
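
Underneath all of the probe-signature changes in this diff, kernel/tracepoint.c swaps its bare void ** probe arrays for arrays of struct tracepoint_func, pairing each callback with its private data; the arrays stay NULL-terminated (by .func) and probes are matched on the (func, data) pair rather than the function pointer alone. A small sketch of how such an array is walked, mirroring the loops above (the struct is repeated here only so the sketch stands alone; find_probe is illustrative):

/* Sketch: scanning a NULL-terminated array of tracepoint_func entries. */
struct tracepoint_func {
        void *func;     /* probe callback */
        void *data;     /* private data handed back to the probe */
};

/* Return the index of the (probe, data) pair, or -1 if it is not registered. */
static int find_probe(const struct tracepoint_func *funcs,
                      void *probe, void *data)
{
        int i;

        for (i = 0; funcs[i].func; i++)         /* .func == NULL terminates */
                if (funcs[i].func == probe && funcs[i].data == data)
                        return i;
        return -1;
}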
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 00d59d048edf..0a67e041edf8 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -21,6 +21,7 @@
21#include <linux/tsacct_kern.h> 21#include <linux/tsacct_kern.h>
22#include <linux/acct.h> 22#include <linux/acct.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/mm.h>
24 25
25/* 26/*
26 * fill in basic accounting fields 27 * fill in basic accounting fields
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..7e72614b736d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,6 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include "cred-internals.h"
20 19
21struct user_namespace init_user_ns = { 20struct user_namespace init_user_ns = {
22 .kref = { 21 .kref = {
@@ -56,9 +55,6 @@ struct user_struct root_user = {
56 .sigpending = ATOMIC_INIT(0), 55 .sigpending = ATOMIC_INIT(0),
57 .locked_shm = 0, 56 .locked_shm = 0,
58 .user_ns = &init_user_ns, 57 .user_ns = &init_user_ns,
59#ifdef CONFIG_USER_SCHED
60 .tg = &init_task_group,
61#endif
62}; 58};
63 59
64/* 60/*
@@ -75,268 +71,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 71 put_user_ns(up->user_ns);
76} 72}
77 73
78#ifdef CONFIG_USER_SCHED
79
80static void sched_destroy_user(struct user_struct *up)
81{
82 sched_destroy_group(up->tg);
83}
84
85static int sched_create_user(struct user_struct *up)
86{
87 int rc = 0;
88
89 up->tg = sched_create_group(&root_task_group);
90 if (IS_ERR(up->tg))
91 rc = -ENOMEM;
92
93 set_tg_uid(up);
94
95 return rc;
96}
97
98#else /* CONFIG_USER_SCHED */
99
100static void sched_destroy_user(struct user_struct *up) { }
101static int sched_create_user(struct user_struct *up) { return 0; }
102
103#endif /* CONFIG_USER_SCHED */
104
105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
125static DEFINE_MUTEX(uids_mutex);
126
127static inline void uids_mutex_lock(void)
128{
129 mutex_lock(&uids_mutex);
130}
131
132static inline void uids_mutex_unlock(void)
133{
134 mutex_unlock(&uids_mutex);
135}
136
137/* uid directory attributes */
138#ifdef CONFIG_FAIR_GROUP_SCHED
139static ssize_t cpu_shares_show(struct kobject *kobj,
140 struct kobj_attribute *attr,
141 char *buf)
142{
143 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144
145 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
146}
147
148static ssize_t cpu_shares_store(struct kobject *kobj,
149 struct kobj_attribute *attr,
150 const char *buf, size_t size)
151{
152 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
153 unsigned long shares;
154 int rc;
155
156 sscanf(buf, "%lu", &shares);
157
158 rc = sched_group_set_shares(up->tg, shares);
159
160 return (rc ? rc : size);
161}
162
163static struct kobj_attribute cpu_share_attr =
164 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
165#endif
166
167#ifdef CONFIG_RT_GROUP_SCHED
168static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 char *buf)
171{
172 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
173
174 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
175}
176
177static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
178 struct kobj_attribute *attr,
179 const char *buf, size_t size)
180{
181 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
182 unsigned long rt_runtime;
183 int rc;
184
185 sscanf(buf, "%ld", &rt_runtime);
186
187 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
188
189 return (rc ? rc : size);
190}
191
192static struct kobj_attribute cpu_rt_runtime_attr =
193 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
194
195static ssize_t cpu_rt_period_show(struct kobject *kobj,
196 struct kobj_attribute *attr,
197 char *buf)
198{
199 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
200
201 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
202}
203
204static ssize_t cpu_rt_period_store(struct kobject *kobj,
205 struct kobj_attribute *attr,
206 const char *buf, size_t size)
207{
208 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
209 unsigned long rt_period;
210 int rc;
211
212 sscanf(buf, "%lu", &rt_period);
213
214 rc = sched_group_set_rt_period(up->tg, rt_period);
215
216 return (rc ? rc : size);
217}
218
219static struct kobj_attribute cpu_rt_period_attr =
220 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
221#endif
222
223/* default attributes per uid directory */
224static struct attribute *uids_attributes[] = {
225#ifdef CONFIG_FAIR_GROUP_SCHED
226 &cpu_share_attr.attr,
227#endif
228#ifdef CONFIG_RT_GROUP_SCHED
229 &cpu_rt_runtime_attr.attr,
230 &cpu_rt_period_attr.attr,
231#endif
232 NULL
233};
234
235/* the lifetime of user_struct is not managed by the core (now) */
236static void uids_release(struct kobject *kobj)
237{
238 return;
239}
240
241static struct kobj_type uids_ktype = {
242 .sysfs_ops = &kobj_sysfs_ops,
243 .default_attrs = uids_attributes,
244 .release = uids_release,
245};
246
247/*
248 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
249 * We do not create this file for users in a user namespace (until
250 * sysfs tagging is implemented).
251 *
252 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
253 */
254static int uids_user_create(struct user_struct *up)
255{
256 struct kobject *kobj = &up->kobj;
257 int error;
258
259 memset(kobj, 0, sizeof(struct kobject));
260 if (up->user_ns != &init_user_ns)
261 return 0;
262 kobj->kset = uids_kset;
263 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
264 if (error) {
265 kobject_put(kobj);
266 goto done;
267 }
268
269 kobject_uevent(kobj, KOBJ_ADD);
270done:
271 return error;
272}
273
274/* create these entries in sysfs:
275 * "/sys/kernel/uids" directory
276 * "/sys/kernel/uids/0" directory (for root user)
277 * "/sys/kernel/uids/0/cpu_share" file (for root user)
278 */
279int __init uids_sysfs_init(void)
280{
281 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
282 if (!uids_kset)
283 return -ENOMEM;
284
285 return uids_user_create(&root_user);
286}
287
288/* delayed work function to remove sysfs directory for a user and free up
289 * corresponding structures.
290 */
291static void cleanup_user_struct(struct work_struct *w)
292{
293 struct user_struct *up = container_of(w, struct user_struct, work.work);
294 unsigned long flags;
295 int remove_user = 0;
296
297 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
298 * atomic.
299 */
300 uids_mutex_lock();
301
302 spin_lock_irqsave(&uidhash_lock, flags);
303 if (atomic_read(&up->__count) == 0) {
304 uid_hash_remove(up);
305 remove_user = 1;
306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
308
309 if (!remove_user)
310 goto done;
311
312 if (up->user_ns == &init_user_ns) {
313 kobject_uevent(&up->kobj, KOBJ_REMOVE);
314 kobject_del(&up->kobj);
315 kobject_put(&up->kobj);
316 }
317
318 sched_destroy_user(up);
319 key_put(up->uid_keyring);
320 key_put(up->session_keyring);
321 kmem_cache_free(uid_cachep, up);
322
323done:
324 uids_mutex_unlock();
325}
326
327/* IRQs are disabled and uidhash_lock is held upon function entry.
328 * IRQ state (as stored in flags) is restored and uidhash_lock released
329 * upon function exit.
330 */
331static void free_user(struct user_struct *up, unsigned long flags)
332{
333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336}
337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 74static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{ 75{
342 struct user_struct *user; 76 struct user_struct *user;
@@ -352,11 +86,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
352 return NULL; 86 return NULL;
353} 87}
354 88
355int uids_sysfs_init(void) { return 0; }
356static inline int uids_user_create(struct user_struct *up) { return 0; }
357static inline void uids_mutex_lock(void) { }
358static inline void uids_mutex_unlock(void) { }
359
360/* IRQs are disabled and uidhash_lock is held upon function entry. 89/* IRQs are disabled and uidhash_lock is held upon function entry.
361 * IRQ state (as stored in flags) is restored and uidhash_lock released 90 * IRQ state (as stored in flags) is restored and uidhash_lock released
362 * upon function exit. 91 * upon function exit.
@@ -365,32 +94,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
365{ 94{
366 uid_hash_remove(up); 95 uid_hash_remove(up);
367 spin_unlock_irqrestore(&uidhash_lock, flags); 96 spin_unlock_irqrestore(&uidhash_lock, flags);
368 sched_destroy_user(up);
369 key_put(up->uid_keyring); 97 key_put(up->uid_keyring);
370 key_put(up->session_keyring); 98 key_put(up->session_keyring);
371 kmem_cache_free(uid_cachep, up); 99 kmem_cache_free(uid_cachep, up);
372} 100}
373 101
374#endif
375
376#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
377/*
378 * We need to check if a setuid can take place. This function should be called
379 * before successfully completing the setuid.
380 */
381int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
382{
383
384 return sched_rt_can_attach(up->tg, tsk);
385
386}
387#else
388int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
389{
390 return 1;
391}
392#endif
393
394/* 102/*
395 * Locate the user_struct for the passed UID. If found, take a ref on it. The 103 * Locate the user_struct for the passed UID. If found, take a ref on it. The
396 * caller must undo that ref with free_uid(). 104 * caller must undo that ref with free_uid().
@@ -428,11 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
428 struct hlist_head *hashent = uidhashentry(ns, uid); 136 struct hlist_head *hashent = uidhashentry(ns, uid);
429 struct user_struct *up, *new; 137 struct user_struct *up, *new;
430 138
431 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
432 * atomic.
433 */
434 uids_mutex_lock();
435
436 spin_lock_irq(&uidhash_lock); 139 spin_lock_irq(&uidhash_lock);
437 up = uid_hash_find(uid, hashent); 140 up = uid_hash_find(uid, hashent);
438 spin_unlock_irq(&uidhash_lock); 141 spin_unlock_irq(&uidhash_lock);
@@ -445,14 +148,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
445 new->uid = uid; 148 new->uid = uid;
446 atomic_set(&new->__count, 1); 149 atomic_set(&new->__count, 1);
447 150
448 if (sched_create_user(new) < 0)
449 goto out_free_user;
450
451 new->user_ns = get_user_ns(ns); 151 new->user_ns = get_user_ns(ns);
452 152
453 if (uids_user_create(new))
454 goto out_destoy_sched;
455
456 /* 153 /*
457 * Before adding this, check whether we raced 154 * Before adding this, check whether we raced
458 * on adding the same user already.. 155 * on adding the same user already..
@@ -460,11 +157,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
460 spin_lock_irq(&uidhash_lock); 157 spin_lock_irq(&uidhash_lock);
461 up = uid_hash_find(uid, hashent); 158 up = uid_hash_find(uid, hashent);
462 if (up) { 159 if (up) {
463 /* This case is not possible when CONFIG_USER_SCHED
464 * is defined, since we serialize alloc_uid() using
465 * uids_mutex. Hence no need to call
466 * sched_destroy_user() or remove_user_sysfs_dir().
467 */
468 key_put(new->uid_keyring); 160 key_put(new->uid_keyring);
469 key_put(new->session_keyring); 161 key_put(new->session_keyring);
470 kmem_cache_free(uid_cachep, new); 162 kmem_cache_free(uid_cachep, new);
@@ -475,17 +167,9 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
475 spin_unlock_irq(&uidhash_lock); 167 spin_unlock_irq(&uidhash_lock);
476 } 168 }
477 169
478 uids_mutex_unlock();
479
480 return up; 170 return up;
481 171
482out_destoy_sched:
483 sched_destroy_user(new);
484 put_user_ns(new->user_ns);
485out_free_user:
486 kmem_cache_free(uid_cachep, new);
487out_unlock: 172out_unlock:
488 uids_mutex_unlock();
489 return NULL; 173 return NULL;
490} 174}
491 175
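
With CONFIG_USER_SCHED gone, user.c above no longer creates a per-UID task group or a /sys/kernel/uids/<uid> directory, so the delayed-work teardown, the uids_mutex serialization and the sysfs attributes all disappear and free_user() tears the user_struct down synchronously. For context, the caller side is unchanged: the final reference drop still funnels through free_uid(), roughly as below (recalled from the same file of this era, not part of this diff):

void free_uid(struct user_struct *up)
{
	unsigned long flags;

	if (!up)
		return;

	local_irq_save(flags);
	/* take uidhash_lock only if this was the last reference */
	if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
		free_user(up, flags);		/* drops the lock and frees up */
	else
		local_irq_restore(flags);
}
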
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 076c7c8215b0..b2d70d38dff4 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -54,8 +54,8 @@ int create_user_ns(struct cred *new)
54#endif 54#endif
55 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ 55 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
56 56
57 /* alloc_uid() incremented the userns refcount. Just set it to 1 */
58 kref_set(&ns->kref, 1);
 57 /* root_user holds a reference to ns, our reference can be dropped */
 58 put_user_ns(ns);
59 59
60 return 0; 60 return 0;
61} 61}
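
The create_user_ns() change replaces a forced kref_set() with an ordinary reference drop: alloc_uid() already took its own reference on the new namespace via get_user_ns(), so instead of resetting the count to 1 the creator simply releases its reference and lets root_user keep the namespace alive. A hedged sketch of the resulting ownership flow (the counts in comments describe the intended invariant, they are not quoted from the patch):

	ns = kmalloc(sizeof(*ns), GFP_KERNEL);
	kref_init(&ns->kref);			/* count == 1, held by the creator      */

	root_user = alloc_uid(ns, 0);		/* get_user_ns(ns): count == 2          */
	if (!root_user) {
		kfree(ns);
		return -ENOMEM;
	}
	/* ... install root_user in the new creds ... */
	put_user_ns(ns);			/* drop the creator's ref: count == 1,
						 * now owned by root_user               */
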
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dee48658805c..327d2deb4451 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -229,6 +229,16 @@ static inline void set_wq_data(struct work_struct *work,
229 atomic_long_set(&work->data, new); 229 atomic_long_set(&work->data, new);
230} 230}
231 231
232/*
233 * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued.
234 */
235static inline void clear_wq_data(struct work_struct *work)
236{
237 unsigned long flags = *work_data_bits(work) &
238 (1UL << WORK_STRUCT_STATIC);
239 atomic_long_set(&work->data, flags);
240}
241
232static inline 242static inline
233struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) 243struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
234{ 244{
@@ -671,7 +681,7 @@ static int __cancel_work_timer(struct work_struct *work,
671 wait_on_work(work); 681 wait_on_work(work);
672 } while (unlikely(ret < 0)); 682 } while (unlikely(ret < 0));
673 683
674 work_clear_pending(work);
 684 clear_wq_data(work);
675 return ret; 685 return ret;
676} 686}
677 687
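
The new clear_wq_data() helper differs from work_clear_pending() in that it also forgets which cpu_workqueue_struct the work item was last queued on, while keeping the WORK_STRUCT_STATIC debug-objects bit intact. The layout assumption behind it, restated here as a sketch rather than quoted from the patch:

/*
 * work->data packs flags and a pointer into one word:
 *   bit WORK_STRUCT_PENDING - the item is queued or being cancelled
 *   bit WORK_STRUCT_STATIC  - statically initialized work item (only
 *                             meaningful with CONFIG_DEBUG_OBJECTS_WORK)
 *   remaining bits          - cpu_workqueue_struct pointer stored by
 *                             set_wq_data() / read by get_wq_data()
 *
 * After a successful __cancel_work_timer(), clear_wq_data() leaves only the
 * STATIC bit set, so debugobjects still recognizes the work item while no
 * stale cwq pointer is left behind for later get_wq_data() users such as
 * the flush_delayed_work() path below.
 */
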
@@ -774,7 +784,7 @@ void flush_delayed_work(struct delayed_work *dwork)
774{ 784{
775 if (del_timer_sync(&dwork->timer)) { 785 if (del_timer_sync(&dwork->timer)) {
776 struct cpu_workqueue_struct *cwq; 786 struct cpu_workqueue_struct *cwq;
777 cwq = wq_per_cpu(keventd_wq, get_cpu());
 787 cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu());
778 __queue_work(cwq, &dwork->work); 788 __queue_work(cwq, &dwork->work);
779 put_cpu(); 789 put_cpu();
780 } 790 }
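
The flush_delayed_work() hunk fixes a wrong assumption: the old code always re-queued a timer-pending item on keventd_wq, which is only correct for work scheduled with schedule_delayed_work(). The fixed code recovers the workqueue the item was actually queued on from work->data. A hedged example of the case the old code mishandled; my_wq, my_dwork and my_work_fn are illustrative names:

#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work);	/* ordinary work handler */
static DECLARE_DELAYED_WORK(my_dwork, my_work_fn);
static struct workqueue_struct *my_wq;			/* private wq, not keventd_wq */

static void my_example(void)
{
	my_wq = create_workqueue("my_wq");
	queue_delayed_work(my_wq, &my_dwork, HZ);
	/* ... */
	flush_delayed_work(&my_dwork);	/* must run my_dwork on my_wq; the old code
					 * would have re-queued it on keventd_wq */
}
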
@@ -845,6 +855,30 @@ int schedule_on_each_cpu(work_func_t func)
845 return 0; 855 return 0;
846} 856}
847 857
858/**
859 * flush_scheduled_work - ensure that any scheduled work has run to completion.
860 *
861 * Forces execution of the kernel-global workqueue and blocks until its
862 * completion.
863 *
864 * Think twice before calling this function! It's very easy to get into
865 * trouble if you don't take great care. Either of the following situations
866 * will lead to deadlock:
867 *
868 * One of the work items currently on the workqueue needs to acquire
869 * a lock held by your code or its caller.
870 *
871 * Your code is running in the context of a work routine.
872 *
873 * They will be detected by lockdep when they occur, but the first might not
874 * occur very often. It depends on what work items are on the workqueue and
875 * what locks they need, which you have no control over.
876 *
877 * In most situations flushing the entire workqueue is overkill; you merely
878 * need to know that a particular work item isn't queued and isn't running.
879 * In such cases you should use cancel_delayed_work_sync() or
880 * cancel_work_sync() instead.
881 */
848void flush_scheduled_work(void) 882void flush_scheduled_work(void)
849{ 883{
850 flush_workqueue(keventd_wq); 884 flush_workqueue(keventd_wq);
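
The new kerneldoc above spells out when flush_scheduled_work() deadlocks. A minimal sketch of the first scenario it describes (my_mutex, my_work and my_work_fn are illustrative, not from the patch); cancel_work_sync() or cancel_delayed_work_sync() avoid the problem because they only wait for one specific item:

#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(my_mutex);

static void my_work_fn(struct work_struct *work)
{
	mutex_lock(&my_mutex);		/* waits for the flusher to drop the lock... */
	/* ... */
	mutex_unlock(&my_mutex);
}
static DECLARE_WORK(my_work, my_work_fn);

static void my_teardown(void)		/* my_work was queued earlier via schedule_work() */
{
	mutex_lock(&my_mutex);
	flush_scheduled_work();		/* ...which never happens: deadlock */
	mutex_unlock(&my_mutex);
}
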
@@ -1076,7 +1110,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
1076 unsigned int cpu = (unsigned long)hcpu; 1110 unsigned int cpu = (unsigned long)hcpu;
1077 struct cpu_workqueue_struct *cwq; 1111 struct cpu_workqueue_struct *cwq;
1078 struct workqueue_struct *wq; 1112 struct workqueue_struct *wq;
1079 int ret = NOTIFY_OK;
 1113 int err = 0;
1080 1114
1081 action &= ~CPU_TASKS_FROZEN; 1115 action &= ~CPU_TASKS_FROZEN;
1082 1116
@@ -1090,12 +1124,13 @@ undo:
1090 1124
1091 switch (action) { 1125 switch (action) {
1092 case CPU_UP_PREPARE: 1126 case CPU_UP_PREPARE:
1093 if (!create_workqueue_thread(cwq, cpu))
 1127 err = create_workqueue_thread(cwq, cpu);
1128 if (!err)
1094 break; 1129 break;
1095 printk(KERN_ERR "workqueue [%s] for %i failed\n", 1130 printk(KERN_ERR "workqueue [%s] for %i failed\n",
1096 wq->name, cpu); 1131 wq->name, cpu);
1097 action = CPU_UP_CANCELED; 1132 action = CPU_UP_CANCELED;
1098 ret = NOTIFY_BAD;
 1133 err = -ENOMEM;
1099 goto undo; 1134 goto undo;
1100 1135
1101 case CPU_ONLINE: 1136 case CPU_ONLINE:
@@ -1116,7 +1151,7 @@ undo:
1116 cpumask_clear_cpu(cpu, cpu_populated_map); 1151 cpumask_clear_cpu(cpu, cpu_populated_map);
1117 } 1152 }
1118 1153
1119 return ret;
 1154 return notifier_from_errno(err);
1120} 1155}
1121 1156
1122#ifdef CONFIG_SMP 1157#ifdef CONFIG_SMP
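
Finally, workqueue_cpu_callback() now tracks a plain -errno and converts it with notifier_from_errno() instead of returning NOTIFY_BAD directly, so callers of the notifier chain can recover the original error code. A hedged sketch of the caller side, with the surrounding CPU-hotplug plumbing simplified and my_chain used as an illustrative chain name:

	int ret, err;

	ret = raw_notifier_call_chain(&my_chain, CPU_UP_PREPARE, hcpu);
	err = notifier_to_errno(ret);	/* 0 for NOTIFY_OK, else the encoded -errno */
	if (err)
		goto rollback;		/* e.g. -ENOMEM from the hunk above */
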