164 files changed, 18314 insertions, 9955 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
new file mode 100644
index 000000000000..88c92fb44618
--- /dev/null
+++ b/kernel/Kconfig.locks
@@ -0,0 +1,202 @@
+#
+# The ARCH_INLINE foo is necessary because select ignores "depends on"
+#
+config ARCH_INLINE_SPIN_TRYLOCK
+        bool
+config ARCH_INLINE_SPIN_TRYLOCK_BH
+        bool
+config ARCH_INLINE_SPIN_LOCK
+        bool
+config ARCH_INLINE_SPIN_LOCK_BH
+        bool
+config ARCH_INLINE_SPIN_LOCK_IRQ
+        bool
+config ARCH_INLINE_SPIN_LOCK_IRQSAVE
+        bool
+config ARCH_INLINE_SPIN_UNLOCK
+        bool
+config ARCH_INLINE_SPIN_UNLOCK_BH
+        bool
+config ARCH_INLINE_SPIN_UNLOCK_IRQ
+        bool
+config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+        bool
+config ARCH_INLINE_READ_TRYLOCK
+        bool
+config ARCH_INLINE_READ_LOCK
+        bool
+config ARCH_INLINE_READ_LOCK_BH
+        bool
+config ARCH_INLINE_READ_LOCK_IRQ
+        bool
+config ARCH_INLINE_READ_LOCK_IRQSAVE
+        bool
+config ARCH_INLINE_READ_UNLOCK
+        bool
+config ARCH_INLINE_READ_UNLOCK_BH
+        bool
+config ARCH_INLINE_READ_UNLOCK_IRQ
+        bool
+config ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+        bool
+config ARCH_INLINE_WRITE_TRYLOCK
+        bool
+config ARCH_INLINE_WRITE_LOCK
+        bool
+config ARCH_INLINE_WRITE_LOCK_BH
+        bool
+config ARCH_INLINE_WRITE_LOCK_IRQ
+        bool
+config ARCH_INLINE_WRITE_LOCK_IRQSAVE
+        bool
+config ARCH_INLINE_WRITE_UNLOCK
+        bool
+config ARCH_INLINE_WRITE_UNLOCK_BH
+        bool
+config ARCH_INLINE_WRITE_UNLOCK_IRQ
+        bool
+config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+        bool
+#
+# lock_* functions are inlined when:
+#   - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
+#
+# trylock_* functions are inlined when:
+#   - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+#
+# unlock and unlock_irq functions are inlined when:
+#   - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+#  or
+#   - DEBUG_SPINLOCK=n and PREEMPT=n
+#
+# unlock_bh and unlock_irqrestore functions are inlined when:
+#   - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
+#
+config INLINE_SPIN_TRYLOCK
+        def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
+config INLINE_SPIN_TRYLOCK_BH
+        def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH
+config INLINE_SPIN_LOCK
+        def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
+config INLINE_SPIN_LOCK_BH
+        def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+                 ARCH_INLINE_SPIN_LOCK_BH
+config INLINE_SPIN_LOCK_IRQ
+        def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+                 ARCH_INLINE_SPIN_LOCK_IRQ
+config INLINE_SPIN_LOCK_IRQSAVE
+        def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+                 ARCH_INLINE_SPIN_LOCK_IRQSAVE
+config INLINE_SPIN_UNLOCK
+        def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
+config INLINE_SPIN_UNLOCK_BH
+        def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
+# unlock_irq is inlined on !PREEMPT or when the architecture opts in through
+# its own ARCH_INLINE_SPIN_UNLOCK_IRQ symbol (not the _BH one).
+config INLINE_SPIN_UNLOCK_IRQ
+        def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_IRQ)
+config INLINE_SPIN_UNLOCK_IRQRESTORE
+        def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+config INLINE_READ_TRYLOCK
+        def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK
+config INLINE_READ_LOCK
+        def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
+config INLINE_READ_LOCK_BH
+        def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+                 ARCH_INLINE_READ_LOCK_BH
+config INLINE_READ_LOCK_IRQ
+        def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+                 ARCH_INLINE_READ_LOCK_IRQ
+config INLINE_READ_LOCK_IRQSAVE
+        def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+                 ARCH_INLINE_READ_LOCK_IRQSAVE
+config INLINE_READ_UNLOCK
+        def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)
+config INLINE_READ_UNLOCK_BH
+        def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH
+# read_unlock_irq is inlined on !PREEMPT or when the architecture opts in
+# through its own ARCH_INLINE_READ_UNLOCK_IRQ symbol (not the _BH one).
+config INLINE_READ_UNLOCK_IRQ
+        def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_IRQ)
+config INLINE_READ_UNLOCK_IRQRESTORE
+        def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+config INLINE_WRITE_TRYLOCK
+        def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK
+config INLINE_WRITE_LOCK
+        def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
+config INLINE_WRITE_LOCK_BH
+        def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+                 ARCH_INLINE_WRITE_LOCK_BH
+config INLINE_WRITE_LOCK_IRQ
+        def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+                 ARCH_INLINE_WRITE_LOCK_IRQ
+config INLINE_WRITE_LOCK_IRQSAVE
+        def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
+                 ARCH_INLINE_WRITE_LOCK_IRQSAVE
+config INLINE_WRITE_UNLOCK
+        def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)
+config INLINE_WRITE_UNLOCK_BH
+        def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH
+# write_unlock_irq is inlined on !PREEMPT or when the architecture opts in
+# through its own ARCH_INLINE_WRITE_UNLOCK_IRQ symbol (not the _BH one).
+config INLINE_WRITE_UNLOCK_IRQ
+        def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_IRQ)
+config INLINE_WRITE_UNLOCK_IRQRESTORE
+        def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+config MUTEX_SPIN_ON_OWNER
+        def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES
diff --git a/kernel/Makefile b/kernel/Makefile
index d7c13d249b2d..a987aa1676b5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,13 +4,14 @@
 obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
            cpu.o exit.o itimer.o time.o softirq.o resource.o \
-            sysctl.o capability.o ptrace.o timer.o user.o \
+            sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
            signal.o sys.o kmod.o workqueue.o pid.o \
            rcupdate.o extable.o params.o posix-timers.o \
            kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
            hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
            notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
-            async.o
+            async.o range.o
+obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
 obj-y += groups.o
 ifdef CONFIG_FUNCTION_TRACER
@@ -21,6 +22,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
 endif
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -82,12 +84,16 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
 obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
+obj-$(CONFIG_TINY_RCU) += rcutiny.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
@@ -96,6 +102,9 @@ obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
 obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
+obj-$(CONFIG_PADATA) += padata.o
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 9a4715a2f6bf..e4c0e1fee9b0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -353,17 +353,18 @@ restart:
 void acct_exit_ns(struct pid_namespace *ns)
 {
-        struct bsd_acct_struct *acct;
+        struct bsd_acct_struct *acct = ns->bacct;
-        spin_lock(&acct_lock);
+        if (acct == NULL)
-        acct = ns->bacct;
+                return;
-        if (acct != NULL) {
-                if (acct->file != NULL)
-                        acct_file_reopen(acct, NULL, NULL);
-                kfree(acct);
+        del_timer_sync(&acct->timer);
-        }
+        spin_lock(&acct_lock);
+        if (acct->file != NULL)
+                acct_file_reopen(acct, NULL, NULL);
        spin_unlock(&acct_lock);
+        kfree(acct);
 }
 /*
@@ -536,7 +537,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
        do_div(elapsed, AHZ);
        ac.ac_btime = get_seconds() - elapsed;
        /* we really need to bite the bullet and change layout */
-        current_uid_gid(&ac.ac_uid, &ac.ac_gid);
+        ac.ac_uid = orig_cred->uid;
+        ac.ac_gid = orig_cred->gid;
 #if ACCT_VERSION==2
        ac.ac_ahz = AHZ;
 #endif
@@ -587,16 +589,6 @@ out:
 }
 /**
- * acct_init_pacct - initialize a new pacct_struct
- * @pacct: per-process accounting info struct to initialize
- */
-void acct_init_pacct(struct pacct_struct *pacct)
-{
-        memset(pacct, 0, sizeof(struct pacct_struct));
-        pacct->ac_utime = pacct->ac_stime = cputime_zero;
-}
-/**
 * acct_collect - collect accounting information into pacct_struct
 * @exitcode: task exit code
 * @group_dead: not 0, if this thread is the last one in the process.
diff --git a/kernel/async.c b/kernel/async.c
index 27235f5de198..15319d6c18fe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -56,6 +56,7 @@ asynchronous and synchronous parts of the kernel.
 #include <linux/init.h>
 #include <linux/kthread.h>
 #include <linux/delay.h>
+#include <linux/slab.h>
 #include <asm/atomic.h>
 static async_cookie_t next_cookie = 1;
diff --git a/kernel/audit.c b/kernel/audit.c
index 5feed232be9d..c71bd26631a2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -46,6 +46,7 @@
 #include <asm/atomic.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/kthread.h>
@@ -398,7 +399,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
        skb_get(skb);
        err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
        if (err < 0) {
-                BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
+                BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
                printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
                audit_log_lost("auditd dissapeared\n");
                audit_pid = 0;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 2451dc6f3282..46a57b57a335 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -3,6 +3,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/kthread.h>
+#include <linux/slab.h>
 struct audit_tree;
 struct audit_chunk;
@@ -277,7 +278,7 @@ static void untag_chunk(struct node *p)
                owner->root = NULL;
        }
-        for (i = j = 0; i < size; i++, j++) {
+        for (i = j = 0; j <= size; i++, j++) {
                struct audit_tree *s;
                if (&chunk->owners[j] == p) {
                        list_del_init(&p->list);
@@ -290,7 +291,7 @@ static void untag_chunk(struct node *p)
                if (!s) /* result of earlier fallback */
                        continue;
                get_tree(s);
-                list_replace_init(&chunk->owners[i].list, &new->owners[j].list);
+                list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
        }
        list_replace_rcu(&chunk->hash, &new->hash);
@@ -373,15 +374,17 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
        for (n = 0; n < old->count; n++) {
                if (old->owners[n].owner == tree) {
                        spin_unlock(&hash_lock);
-                        put_inotify_watch(watch);
+                        put_inotify_watch(&old->watch);
                        return 0;
                }
        }
        spin_unlock(&hash_lock);
        chunk = alloc_chunk(old->count + 1);
-        if (!chunk)
+        if (!chunk) {
+                put_inotify_watch(&old->watch);
                return -ENOMEM;
+        }
        mutex_lock(&inode->inotify_mutex);
        if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
@@ -425,7 +428,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
        spin_unlock(&hash_lock);
        inotify_evict_watch(&old->watch);
        mutex_unlock(&inode->inotify_mutex);
-        put_inotify_watch(&old->watch);
+        put_inotify_watch(&old->watch); /* pair to inotify_find_watch */
+        put_inotify_watch(&old->watch); /* and kill it */
        return 0;
 }
@@ -545,6 +549,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
        return 0;
 }
+/*
+ * Callback handed to iterate_mounts(): yields non-zero when the inode at
+ * the root of @mnt is the very inode passed in through @arg, zero otherwise.
+ */
+static int compare_root(struct vfsmount *mnt, void *arg)
+{
+        struct inode *root_inode = mnt->mnt_root->d_inode;
+        return root_inode == arg;
+}
 void audit_trim_trees(void)
 {
        struct list_head cursor;
@@ -556,7 +565,6 @@ void audit_trim_trees(void)
                struct path path;
                struct vfsmount *root_mnt;
                struct node *node;
-                struct list_head list;
                int err;
                tree = container_of(cursor.next, struct audit_tree, list);
@@ -574,24 +582,16 @@ void audit_trim_trees(void)
                if (!root_mnt)
                        goto skip_it;
-                list_add_tail(&list, &root_mnt->mnt_list);
                spin_lock(&hash_lock);
                list_for_each_entry(node, &tree->chunks, list) {
-                        struct audit_chunk *chunk = find_chunk(node);
+                        struct inode *inode = find_chunk(node)->watch.inode;
-                        struct inode *inode = chunk->watch.inode;
-                        struct vfsmount *mnt;
                        node->index |= 1U<<31;
-                        list_for_each_entry(mnt, &list, mnt_list) {
+                        if (iterate_mounts(compare_root, inode, root_mnt))
-                                if (mnt->mnt_root->d_inode == inode) {
+                                node->index &= ~(1U<<31);
-                                        node->index &= ~(1U<<31);
-                                        break;
-                                }
-                        }
                }
                spin_unlock(&hash_lock);
                trim_marked(tree);
                put_tree(tree);
-                list_del_init(&list);
                drop_collected_mounts(root_mnt);
 skip_it:
                mutex_lock(&audit_filter_mutex);
@@ -600,22 +600,6 @@ skip_it:
        mutex_unlock(&audit_filter_mutex);
 }
-static int is_under(struct vfsmount *mnt, struct dentry *dentry,
-                    struct path *path)
-{
-        if (mnt != path->mnt) {
-                for (;;) {
-                        if (mnt->mnt_parent == mnt)
-                                return 0;
-                        if (mnt->mnt_parent == path->mnt)
-                                        break;
-                        mnt = mnt->mnt_parent;
-                }
-                dentry = mnt->mnt_mountpoint;
-        }
-        return is_subdir(dentry, path->dentry);
-}
 int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
 {
@@ -635,13 +619,17 @@ void audit_put_tree(struct audit_tree *tree)
        put_tree(tree);
 }
+/*
+ * Callback handed to iterate_mounts(): tags the inode at the root of @mnt
+ * for the audit tree passed in through @arg and hands back whatever
+ * tag_chunk() returned.
+ */
+static int tag_mount(struct vfsmount *mnt, void *arg)
+{
+        struct audit_tree *tree = arg;
+        struct inode *root_inode = mnt->mnt_root->d_inode;
+        return tag_chunk(root_inode, tree);
+}
 /* called with audit_filter_mutex */
 int audit_add_tree_rule(struct audit_krule *rule)
 {
        struct audit_tree *seed = rule->tree, *tree;
        struct path path;
-        struct vfsmount *mnt, *p;
+        struct vfsmount *mnt;
-        struct list_head list;
        int err;
        list_for_each_entry(tree, &tree_list, list) {
@@ -667,16 +655,9 @@ int audit_add_tree_rule(struct audit_krule *rule)
                err = -ENOMEM;
                goto Err;
        }
-        list_add_tail(&list, &mnt->mnt_list);
        get_tree(tree);
-        list_for_each_entry(p, &list, mnt_list) {
+        err = iterate_mounts(tag_mount, tree, mnt);
-                err = tag_chunk(p->mnt_root->d_inode, tree);
-                if (err)
-                        break;
-        }
-        list_del(&list);
        drop_collected_mounts(mnt);
        if (!err) {
@@ -711,31 +692,23 @@ int audit_tag_tree(char *old, char *new)
 {
        struct list_head cursor, barrier;
        int failed = 0;
-        struct path path;
+        struct path path1, path2;
        struct vfsmount *tagged;
-        struct list_head list;
-        struct vfsmount *mnt;
-        struct dentry *dentry;
        int err;
-        err = kern_path(new, 0, &path);
+        err = kern_path(new, 0, &path2);
        if (err)
                return err;
-        tagged = collect_mounts(&path);
+        tagged = collect_mounts(&path2);
-        path_put(&path);
+        path_put(&path2);
        if (!tagged)
                return -ENOMEM;
-        err = kern_path(old, 0, &path);
+        err = kern_path(old, 0, &path1);
        if (err) {
                drop_collected_mounts(tagged);
                return err;
        }
-        mnt = mntget(path.mnt);
-        dentry = dget(path.dentry);
-        path_put(&path);
-        list_add_tail(&list, &tagged->mnt_list);
        mutex_lock(&audit_filter_mutex);
        list_add(&barrier, &tree_list);
@@ -743,7 +716,7 @@ int audit_tag_tree(char *old, char *new)
        while (cursor.next != &tree_list) {
                struct audit_tree *tree;
-                struct vfsmount *p;
+                int good_one = 0;
                tree = container_of(cursor.next, struct audit_tree, list);
                get_tree(tree);
@@ -751,30 +724,19 @@ int audit_tag_tree(char *old, char *new)
                list_add(&cursor, &tree->list);
                mutex_unlock(&audit_filter_mutex);
-                err = kern_path(tree->pathname, 0, &path);
+                err = kern_path(tree->pathname, 0, &path2);
-                if (err) {
+                if (!err) {
-                        put_tree(tree);
+                        good_one = path_is_under(&path1, &path2);
-                        mutex_lock(&audit_filter_mutex);
+                        path_put(&path2);
-                        continue;
                }
-                spin_lock(&vfsmount_lock);
+                if (!good_one) {
-                if (!is_under(mnt, dentry, &path)) {
-                        spin_unlock(&vfsmount_lock);
-                        path_put(&path);
                        put_tree(tree);
                        mutex_lock(&audit_filter_mutex);
                        continue;
                }
-                spin_unlock(&vfsmount_lock);
-                path_put(&path);
-                list_for_each_entry(p, &list, mnt_list) {
-                        failed = tag_chunk(p->mnt_root->d_inode, tree);
-                        if (failed)
-                                break;
-                }
+                failed = iterate_mounts(tag_mount, tree, tagged);
                if (failed) {
                        put_tree(tree);
                        mutex_lock(&audit_filter_mutex);
@@ -815,10 +777,8 @@ int audit_tag_tree(char *old, char *new)
        }
        list_del(&barrier);
        list_del(&cursor);
-        list_del(&list);
        mutex_unlock(&audit_filter_mutex);
-        dput(dentry);
+        path_put(&path1);
-        mntput(mnt);
        drop_collected_mounts(tagged);
        return failed;
 }
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index cc7e87936cbc..8df43696f4ba 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -27,6 +27,7 @@
 #include <linux/namei.h>
 #include <linux/netlink.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/inotify.h>
 #include <linux/security.h>
 #include "audit.h"
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a70604047f3c..ce08041f578d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,6 +27,7 @@
 #include <linux/namei.h>
 #include <linux/netlink.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/security.h>
 #include "audit.h"
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 267e484f0198..3828ad5fb8f1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -49,6 +49,7 @@
 #include <linux/namei.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/socket.h>
 #include <linux/mqueue.h>
@@ -250,7 +251,6 @@ struct audit_context {
 #endif
 };
-#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
 static inline int open_arg(int flags, int mask)
 {
        int n = ACC_MODE(flags);
@@ -1894,7 +1894,7 @@ static int audit_inc_name_count(struct audit_context *context,
 {
        if (context->name_count >= AUDIT_NAMES) {
                if (inode)
-                        printk(KERN_DEBUG "name_count maxed, losing inode data: "
+                        printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
                               "dev=%02x:%02x, inode=%lu\n",
                               MAJOR(inode->i_sb->s_dev),
                               MINOR(inode->i_sb->s_dev),
@@ -1989,7 +1989,6 @@ void __audit_inode(const char *name, const struct dentry *dentry)
 /**
 * audit_inode_child - collect inode info for created/removed objects
- * @dname: inode's dentry name
 * @dentry: dentry being audited
 * @parent: inode of dentry parent
 *
@@ -2001,13 +2000,14 @@ void __audit_inode(const char *name, const struct dentry *dentry)
 * must be hooked prior, in order to capture the target inode during
 * unsuccessful attempts.
 */
-void __audit_inode_child(const char *dname, const struct dentry *dentry,
+void __audit_inode_child(const struct dentry *dentry,
                         const struct inode *parent)
 {
        int idx;
        struct audit_context *context = current->audit_context;
        const char *found_parent = NULL, *found_child = NULL;
        const struct inode *inode = dentry->d_inode;
+        const char *dname = dentry->d_name.name;
        int dirlen = 0;
        if (!context->in_syscall)
@@ -2015,9 +2015,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
        if (inode)
                handle_one(inode);
-        /* determine matching parent */
-        if (!dname)
-                goto add_names;
        /* parent is more likely, look for it first */
        for (idx = 0; idx < context->name_count; idx++) {
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 3c5301381837..98a51f26c136 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -12,7 +12,7 @@
 void foo(void)
 {
-        /* The enum constants to put into include/linux/bounds.h */
+        /* The enum constants to put into include/generated/bounds.h */
        DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
        DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
        /* End of constants */
diff --git a/kernel/capability.c b/kernel/capability.c
index 4e17041963f5..9e4697e9b276 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,7 +29,6 @@ EXPORT_SYMBOL(__cap_empty_set);
 EXPORT_SYMBOL(__cap_full_set);
 EXPORT_SYMBOL(__cap_init_eff_set);
-#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
 int file_caps_enabled = 1;
 static int __init file_caps_disable(char *str)
@@ -38,7 +37,6 @@ static int __init file_caps_disable(char *str)
        return 1;
 }
 __setup("no_file_caps", file_caps_disable);
-#endif
 /*
 * More recent versions of libcap are available from:
@@ -137,7 +135,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
        if (pid && (pid != task_pid_vnr(current))) {
                struct task_struct *target;
-                read_lock(&tasklist_lock);
+                rcu_read_lock();
                target = find_task_by_vpid(pid);
                if (!target)
@@ -145,7 +143,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
                else
                        ret = security_capget(target, pEp, pIp, pPp);
-                read_unlock(&tasklist_lock);
+                rcu_read_unlock();
        } else
                ret = security_capget(current, pEp, pIp, pPp);
@@ -169,8 +167,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
        kernel_cap_t pE, pI, pP;
        ret = cap_validate_magic(header, &tocopy);
-        if (ret != 0)
+        if ((dataptr == NULL) || (ret != 0))
-                return ret;
+                return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret;
        if (get_user(pid, &header->pid))
                return -EFAULT;
@@ -238,7 +236,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
 SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
 {
        struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
-        unsigned i, tocopy;
+        unsigned i, tocopy, copybytes;
        kernel_cap_t inheritable, permitted, effective;
        struct cred *new;
        int ret;
@@ -255,8 +253,11 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
        if (pid != 0 && pid != task_pid_vnr(current))
                return -EPERM;
-        if (copy_from_user(&kdata, data,
+        copybytes = tocopy * sizeof(struct __user_cap_data_struct);
-                           tocopy * sizeof(struct __user_cap_data_struct)))
+        if (copybytes > sizeof(kdata))
+                return -EFAULT;
+        if (copy_from_user(&kdata, data, copybytes))
                return -EFAULT;
        for (i = 0; i < tocopy; i++) {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0249f4be9b5c..6d870f2d1228 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
 *  Based originally on the cpuset system, extracted by Paul Menage
 *  Copyright (C) 2006 Google, Inc
 *
+ *  Notifications support
+ *  Copyright (C) 2009 Nokia Corporation
+ *  Author: Kirill A. Shutemov
+ *
 *  Copyright notices from the original cpuset code:
 *  --------------------------------------------------
 *  Copyright (C) 2003 BULL SA.
@@ -43,6 +47,7 @@
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
+#include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
 #include <linux/hash.h>
@@ -51,15 +56,21 @@
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+#include <linux/eventfd.h>
+#include <linux/poll.h>
 #include <asm/atomic.h>
 static DEFINE_MUTEX(cgroup_mutex);
-/* Generate an array of cgroup subsystem pointers */
+/*
+ * Generate an array of cgroup subsystem pointers. At boot time, this is
+ * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
+ * registered after that. The mutable section of this array is protected by
+ * cgroup_mutex.
+ */
 #define SUBSYS(_x) &_x ## _subsys,
+static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
-static struct cgroup_subsys *subsys[] = {
 #include <linux/cgroup_subsys.h>
 };
@@ -146,6 +157,35 @@ struct css_id {
        unsigned short stack[0]; /* Array of Length (depth+1) */
 };
+/*
+ * cgroup_event represents an event which userspace wants to receive.
+ */
+struct cgroup_event {
+        /*
+         * Cgroup which the event belongs to.
+         */
+        struct cgroup *cgrp;
+        /*
+         * Control file which the event is associated with.
+         */
+        struct cftype *cft;
+        /*
+         * eventfd to signal userspace about the event.
+         */
+        struct eventfd_ctx *eventfd;
+        /*
+         * Each of these is stored in a list by the cgroup.
+         */
+        struct list_head list;
+        /*
+         * All fields below are needed to unregister the event when
+         * userspace closes the eventfd.
+         */
+        poll_table pt;
+        wait_queue_head_t *wqh;
+        wait_queue_t wait;
+        struct work_struct remove;
+};
 /* The list of hierarchy roots */
@@ -166,6 +206,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
 */
 static int need_forkexit_callback __read_mostly;
+/*
+ * Report on cgroup_mutex: the answer comes from lockdep_is_held() when
+ * CONFIG_PROVE_LOCKING is set, and from mutex_is_locked() otherwise.
+ */
+int cgroup_lock_is_held(void)
+{
+#ifdef CONFIG_PROVE_LOCKING
+        return lockdep_is_held(&cgroup_mutex);
+#else /* !CONFIG_PROVE_LOCKING */
+        return mutex_is_locked(&cgroup_mutex);
+#endif /* CONFIG_PROVE_LOCKING */
+}
+EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
 {
@@ -235,7 +289,8 @@ struct cg_cgroup_link {
 static struct css_set init_css_set;
 static struct cg_cgroup_link init_css_set_link;
-static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
+static int cgroup_init_idr(struct cgroup_subsys *ss,
+                           struct cgroup_subsys_state *css);
 /* css_set_lock protects the list of css_set objects, and the
 * chain of tasks off each css_set.  Nests outside task->alloc_lock
@@ -433,8 +488,11 @@ static struct css_set *find_existing_css_set(
        struct hlist_node *node;
        struct css_set *cg;
-        /* Built the set of subsystem state objects that we want to
+        /*
-         * see in the new css_set */
+         * Build the set of subsystem state objects that we want to see in the
+         * new css_set. While subsystems can change globally, the entries here
+         * won't change, so no need for locking.
+         */
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                if (root->subsys_bits & (1UL << i)) {
                        /* Subsystem is in this hierarchy. So we want
@@ -681,6 +739,7 @@ void cgroup_lock(void)
 {
        mutex_lock(&cgroup_mutex);
 }
+EXPORT_SYMBOL_GPL(cgroup_lock);
 /**
 * cgroup_unlock - release lock on cgroup changes
@@ -691,6 +750,7 @@ void cgroup_unlock(void)
 {
        mutex_unlock(&cgroup_mutex);
 }
+EXPORT_SYMBOL_GPL(cgroup_unlock);
 /*
 * A couple of forward declarations required, due to cyclic reference loop:
@@ -742,6 +802,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
                        if (ret)
                                break;
                }
        return ret;
 }
@@ -869,7 +930,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
        css_put(css);
 }
+/*
+ * Call with cgroup_mutex held. Drops reference counts on modules, including
+ * any duplicate ones that parse_cgroupfs_options took. If this function
+ * returns an error, no reference counts are touched.
+ */
 static int rebind_subsystems(struct cgroupfs_root *root,
                              unsigned long final_bits)
 {
@@ -877,6 +942,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
        struct cgroup *cgrp = &root->top_cgroup;
        int i;
+        BUG_ON(!mutex_is_locked(&cgroup_mutex));
        removed_bits = root->actual_subsys_bits & ~final_bits;
        added_bits = final_bits & ~root->actual_subsys_bits;
        /* Check that any added subsystems are currently free */
@@ -885,6 +952,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                struct cgroup_subsys *ss = subsys[i];
                if (!(bit & added_bits))
                        continue;
+                /*
+                 * Nobody should tell us to do a subsys that doesn't exist:
+                 * parse_cgroupfs_options should catch that case and refcounts
+                 * ensure that subsystems won't disappear once selected.
+                 */
+                BUG_ON(ss == NULL);
                if (ss->root != &rootnode) {
                        /* Subsystem isn't free */
                        return -EBUSY;
@@ -904,6 +977,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                unsigned long bit = 1UL << i;
                if (bit & added_bits) {
                        /* We're binding this subsystem to this hierarchy */
+                        BUG_ON(ss == NULL);
                        BUG_ON(cgrp->subsys[i]);
                        BUG_ON(!dummytop->subsys[i]);
                        BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -915,8 +989,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                        if (ss->bind)
                                ss->bind(ss, cgrp);
                        mutex_unlock(&ss->hierarchy_mutex);
+                        /* refcount was already taken, and we're keeping it */
                } else if (bit & removed_bits) {
                        /* We're removing this subsystem */
+                        BUG_ON(ss == NULL);
                        BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
                        BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
                        mutex_lock(&ss->hierarchy_mutex);
@@ -927,9 +1003,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                        subsys[i]->root = &rootnode;
                        list_move(&ss->sibling, &rootnode.subsys_list);
                        mutex_unlock(&ss->hierarchy_mutex);
+                        /* subsystem is now free - drop reference on module */
+                        module_put(ss->module);
                } else if (bit & final_bits) {
                        /* Subsystem state should already exist */
+                        BUG_ON(ss == NULL);
                        BUG_ON(!cgrp->subsys[i]);
+                        /*
+                         * a refcount was taken, but we already had one, so
+                         * drop the extra reference.
+                         */
+                        module_put(ss->module);
+#ifdef CONFIG_MODULE_UNLOAD
+                        BUG_ON(ss->module && !module_refcount(ss->module));
+#endif
                } else {
                        /* Subsystem state shouldn't exist */
                        BUG_ON(cgrp->subsys[i]);
@@ -971,13 +1058,20 @@ struct cgroup_sb_opts {
 };
-/* Convert a hierarchy specifier into a bitmask of subsystems and
+/*
- * flags. */
+ * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
-static int parse_cgroupfs_options(char *data,
+ * with cgroup_mutex held to protect the subsys[] array. This function takes
-                                     struct cgroup_sb_opts *opts)
+ * refcounts on subsystems to be used, unless it returns error, in which case
+ * no refcounts are taken.
+ */
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
        char *token, *o = data ?: "all";
        unsigned long mask = (unsigned long)-1;
+        int i;
+        bool module_pin_failed = false;
+        BUG_ON(!mutex_is_locked(&cgroup_mutex));
 #ifdef CONFIG_CPUSETS
        mask = ~(1UL << cpuset_subsys_id);
@@ -990,10 +1084,11 @@ static int parse_cgroupfs_options(char *data,
                        return -EINVAL;
                if (!strcmp(token, "all")) {
                        /* Add all non-disabled subsystems */
-                        int i;
                        opts->subsys_bits = 0;
                        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                                struct cgroup_subsys *ss = subsys[i];
+                                if (ss == NULL)
+                                        continue;
                                if (!ss->disabled)
                                        opts->subsys_bits |= 1ul << i;
                        }
@@ -1011,7 +1106,6 @@ static int parse_cgroupfs_options(char *data,
                        if (!opts->release_agent)
                                return -ENOMEM;
                } else if (!strncmp(token, "name=", 5)) {
-                        int i;
                        const char *name = token + 5;
                        /* Can't specify an empty name */
                        if (!strlen(name))
@@ -1035,9 +1129,10 @@ static int parse_cgroupfs_options(char *data,
                                return -ENOMEM;
                } else {
                        struct cgroup_subsys *ss;
-                        int i;
                        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                                ss = subsys[i];
+                                if (ss == NULL)
+                                        continue;
                                if (!strcmp(token, ss->name)) {
                                        if (!ss->disabled)
                                                set_bit(i, &opts->subsys_bits);
@@ -1072,9 +1167,54 @@ static int parse_cgroupfs_options(char *data,
        if (!opts->subsys_bits && !opts->name)
                return -EINVAL;
+        /*
+         * Grab references on all the modules we'll need, so the subsystems
+         * don't dance around before rebind_subsystems attaches them. This may
+         * take duplicate reference counts on a subsystem that's already used,
+         * but rebind_subsystems handles this case.
+         */
+        for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+                unsigned long bit = 1UL << i;
+                if (!(bit & opts->subsys_bits))
+                        continue;
+                if (!try_module_get(subsys[i]->module)) {
+                        module_pin_failed = true;
+                        break;
+                }
+        }
+        if (module_pin_failed) {
+                /*
+                 * oops, one of the modules was going away. this means that we
+                 * raced with a module_delete call, and to the user this is
+                 * essentially a "subsystem doesn't exist" case.
+                 */
+                for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
+                        /* drop refcounts only on the ones we took */
+                        unsigned long bit = 1UL << i;
+                        if (!(bit & opts->subsys_bits))
+                                continue;
+                        module_put(subsys[i]->module);
+                }
+                return -ENOENT;
+        }
        return 0;
 }
+/*
+ * Undo the module references taken by parse_cgroupfs_options: drop one
+ * reference on the module of every modular subsystem selected in @subsys_bits.
+ * Built-in subsystems (ids below CGROUP_BUILTIN_SUBSYS_COUNT) are not visited.
+ */
+static void drop_parsed_module_refcounts(unsigned long subsys_bits)
+{
+        int ssid = CGROUP_BUILTIN_SUBSYS_COUNT;
+        while (ssid < CGROUP_SUBSYS_COUNT) {
+                if (subsys_bits & (1UL << ssid))
+                        module_put(subsys[ssid]->module);
+                ssid++;
+        }
+}
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 {
        int ret = 0;
@@ -1091,21 +1231,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
        if (ret)
                goto out_unlock;
-        /* Don't allow flags to change at remount */
+        /* Don't allow flags or name to change at remount */
-        if (opts.flags != root->flags) {
+        if (opts.flags != root->flags ||
-                ret = -EINVAL;
+            (opts.name && strcmp(opts.name, root->name))) {
-                goto out_unlock;
-        }
-        /* Don't allow name to change at remount */
-        if (opts.name && strcmp(opts.name, root->name)) {
                ret = -EINVAL;
+                drop_parsed_module_refcounts(opts.subsys_bits);
                goto out_unlock;
        }
        ret = rebind_subsystems(root, opts.subsys_bits);
-        if (ret)
+        if (ret) {
+                drop_parsed_module_refcounts(opts.subsys_bits);
                goto out_unlock;
+        }
        /* (re)populate subsystem files */
        cgroup_populate_dir(cgrp);
@@ -1136,6 +1274,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
        INIT_LIST_HEAD(&cgrp->release_list);
        INIT_LIST_HEAD(&cgrp->pidlists);
        mutex_init(&cgrp->pidlist_mutex);
+        INIT_LIST_HEAD(&cgrp->event_list);
+        spin_lock_init(&cgrp->event_list_lock);
 }
 static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1291,7 +1431,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
        struct cgroupfs_root *new_root;
        /* First find the desired set of subsystems */
+        mutex_lock(&cgroup_mutex);
        ret = parse_cgroupfs_options(data, &opts);
+        mutex_unlock(&cgroup_mutex);
        if (ret)
                goto out_err;
@@ -1302,7 +1444,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
        new_root = cgroup_root_from_opts(&opts);
        if (IS_ERR(new_root)) {
                ret = PTR_ERR(new_root);
-                goto out_err;
+                goto drop_modules;
        }
        opts.new_root = new_root;
@@ -1311,7 +1453,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
        if (IS_ERR(sb)) {
                ret = PTR_ERR(sb);
                cgroup_drop_root(opts.new_root);
-                goto out_err;
+                goto drop_modules;
        }
        root = sb->s_fs_info;
@@ -1367,6 +1509,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                        free_cg_links(&tmp_cg_links);
                        goto drop_new_super;
                }
+                /*
+                 * There must be no failure case after here, since rebinding
+                 * takes care of subsystems' refcounts, which are explicitly
+                 * dropped in the failure exit path.
+                 */
                /* EBUSY should be the only error here */
                BUG_ON(ret);
@@ -1405,6 +1552,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                 * any) is not needed
                 */
                cgroup_drop_root(opts.new_root);
+                /* no subsys rebinding, so refcounts don't change */
+                drop_parsed_module_refcounts(opts.subsys_bits);
        }
        simple_set_mnt(mnt, sb);
@@ -1414,6 +1563,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 drop_new_super:
        deactivate_locked_super(sb);
+ drop_modules:
+        drop_parsed_module_refcounts(opts.subsys_bits);
 out_err:
        kfree(opts.release_agent);
        kfree(opts.name);
@@ -1495,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
        char *start;
-        struct dentry *dentry = rcu_dereference(cgrp->dentry);
+        struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
+                                                      rcu_read_lock_held() ||
+                                                      cgroup_lock_is_held());
        if (!dentry || cgrp == dummytop) {
                /*
@@ -1511,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
        *--start = '\0';
        for (;;) {
                int len = dentry->d_name.len;
                if ((start -= len) < buf)
                        return -ENAMETOOLONG;
-                memcpy(start, cgrp->dentry->d_name.name, len);
+                memcpy(start, dentry->d_name.name, len);
                cgrp = cgrp->parent;
                if (!cgrp)
                        break;
-                dentry = rcu_dereference(cgrp->dentry);
+                dentry = rcu_dereference_check(cgrp->dentry,
+                                               rcu_read_lock_held() ||
+                                               cgroup_lock_is_held());
                if (!cgrp->parent)
                        continue;
                if (--start < buf)
@@ -1527,6 +1684,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
        memmove(buf, start, buf + buflen - start);
        return 0;
 }
+EXPORT_SYMBOL_GPL(cgroup_path);
 /**
 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1539,7 +1697,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
        int retval = 0;
-        struct cgroup_subsys *ss;
+        struct cgroup_subsys *ss, *failed_ss = NULL;
        struct cgroup *oldcgrp;
        struct css_set *cg;
        struct css_set *newcg;
@@ -1553,8 +1711,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
        for_each_subsys(root, ss) {
                if (ss->can_attach) {
                        retval = ss->can_attach(ss, cgrp, tsk, false);
-                        if (retval)
+                        if (retval) {
-                                return retval;
+                                /*
+                                 * Remember on which subsystem the can_attach()
+                                 * failed, so that we only call cancel_attach()
+                                 * against the subsystems whose can_attach()
+                                 * succeeded. (See below)
+                                 */
+                                failed_ss = ss;
+                                goto out;
+                        }
                }
        }
@@ -1568,14 +1734,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
         */
        newcg = find_css_set(cg, cgrp);
        put_css_set(cg);
-        if (!newcg)
+        if (!newcg) {
-                return -ENOMEM;
+                retval = -ENOMEM;
+                goto out;
+        }
        task_lock(tsk);
        if (tsk->flags & PF_EXITING) {
                task_unlock(tsk);
                put_css_set(newcg);
-                return -ESRCH;
+                retval = -ESRCH;
+                goto out;
        }
        rcu_assign_pointer(tsk->cgroups, newcg);
        task_unlock(tsk);
@@ -1601,7 +1770,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
         * is no longer empty.
         */
        cgroup_wakeup_rmdir_waiter(cgrp);
-        return 0;
+out:
+        if (retval) {
+                for_each_subsys(root, ss) {
+                        if (ss == failed_ss)
+                                /*
+                                 * This subsystem was the one that failed the
+                                 * can_attach() check earlier, so we don't need
+                                 * to call cancel_attach() against it or any
+                                 * remaining subsystems.
+                                 */
+                                break;
+                        if (ss->cancel_attach)
+                                ss->cancel_attach(ss, cgrp, tsk, false);
+                }
+        }
+        return retval;
 }
 /*
@@ -1667,6 +1851,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
        }
        return true;
 }
+EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
 static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
                                      const char *buffer)
@@ -1935,6 +2120,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
        .rename = cgroup_rename,
 };
+/*
+ * Map an open file to its cftype, or return ERR_PTR(-EINVAL) when the file is
+ * not a cgroup control file (its inode's i_fop is not cgroup_file_operations).
+ */
+static inline struct cftype *__file_cft(struct file *file)
+{
+        struct dentry *dentry = file->f_dentry;
+        if (dentry->d_inode->i_fop == &cgroup_file_operations)
+                return __d_cft(dentry);
+        return ERR_PTR(-EINVAL);
+}
 static int cgroup_create_file(struct dentry *dentry, mode_t mode,
                                struct super_block *sb)
 {
@@ -2054,6 +2249,7 @@ int cgroup_add_file(struct cgroup *cgrp,
                error = PTR_ERR(dentry);
        return error;
 }
+EXPORT_SYMBOL_GPL(cgroup_add_file);
 int cgroup_add_files(struct cgroup *cgrp,
                        struct cgroup_subsys *subsys,
@@ -2068,6 +2264,7 @@ int cgroup_add_files(struct cgroup *cgrp,
        }
        return 0;
 }
+EXPORT_SYMBOL_GPL(cgroup_add_files);
 /**
 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2453,7 +2650,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
 {
        struct cgroup_pidlist *l;
        /* don't need task_nsproxy() if we're looking at ourself */
-        struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
+        struct pid_namespace *ns = current->nsproxy->pid_ns;
        /*
         * We can't drop the pidlist_mutex before taking the l->mutex in case
         * the last ref-holder is trying to remove l from the list at the same
@@ -2463,12 +2661,9 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
        mutex_lock(&cgrp->pidlist_mutex);
        list_for_each_entry(l, &cgrp->pidlists, links) {
                if (l->key.type == type && l->key.ns == ns) {
-                        /* found a matching list - drop the extra refcount */
-                        put_pid_ns(ns);
                        /* make sure l doesn't vanish out from under us */
                        down_write(&l->mutex);
                        mutex_unlock(&cgrp->pidlist_mutex);
-                        l->use_count++;
                        return l;
                }
        }
@@ -2476,13 +2671,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
        l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
        if (!l) {
                mutex_unlock(&cgrp->pidlist_mutex);
-                put_pid_ns(ns);
                return l;
        }
        init_rwsem(&l->mutex);
        down_write(&l->mutex);
        l->key.type = type;
-        l->key.ns = ns;
+        l->key.ns = get_pid_ns(ns);
        l->use_count = 0; /* don't increment here */
        l->list = NULL;
        l->owner = cgrp;
@@ -2790,6 +2984,174 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
 }
 /*
+ * Tear down a cgroup event: unregister it from its control file, drop the
+ * eventfd context reference, free the event and release the reference on the
+ * cgroup directory dentry.
+ *
+ * Runs from a workqueue.
+ */
+static void cgroup_event_remove(struct work_struct *work)
+{
+        struct cgroup_event *ev;
+        struct cgroup *owner;
+        ev = container_of(work, struct cgroup_event, remove);
+        owner = ev->cgrp;
+        /* TODO: check return code */
+        ev->cft->unregister_event(owner, ev->cft, ev->eventfd);
+        eventfd_ctx_put(ev->eventfd);
+        kfree(ev);
+        /* owner was read out of the event before it was freed */
+        dput(owner->dentry);
+}
+/*
+ * Wait-queue callback for the eventfd. On POLLHUP (the user closed the
+ * eventfd) the event is detached from the wait queue and from the cgroup's
+ * event list, and its teardown is deferred to a workqueue. Always returns 0.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
+                int sync, void *key)
+{
+        struct cgroup_event *ev;
+        struct cgroup *owner;
+        unsigned long poll_flags = (unsigned long)key;
+        ev = container_of(wait, struct cgroup_event, wait);
+        if (!(poll_flags & POLLHUP))
+                return 0;
+        owner = ev->cgrp;
+        remove_wait_queue_locked(ev->wqh, &ev->wait);
+        spin_lock(&owner->event_list_lock);
+        list_del(&ev->list);
+        spin_unlock(&owner->event_list_lock);
+        /*
+         * This is atomic context, but cgroup_event_remove() may sleep,
+         * so it has to be run from a workqueue.
+         */
+        schedule_work(&ev->remove);
+        return 0;
+}
+/*
+ * poll_table queue callback: remember which wait queue head the polled file
+ * handed us and hook the event's wait entry onto it.
+ */
+static void cgroup_event_ptable_queue_proc(struct file *file,
+                wait_queue_head_t *wqh, poll_table *pt)
+{
+        struct cgroup_event *ev;
+        ev = container_of(pt, struct cgroup_event, pt);
+        ev->wqh = wqh;
+        add_wait_queue(wqh, &ev->wait);
+}
+/*
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation; <args>
+ * may be omitted, in which case an empty string is passed on.
+ *
+ * Returns 0 on success (also when the eventfd already reports POLLHUP, in
+ * which case the event is unregistered again), -EINVAL on malformed input or
+ * when the control file has no event support, -ENOMEM, -EBADF, or the error
+ * returned by the eventfd, permission or register_event helpers.
+ */
+static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
+                                      const char *buffer)
+{
+        struct cgroup_event *event = NULL;
+        unsigned int efd, cfd;
+        struct file *efile = NULL;
+        struct file *cfile = NULL;
+        char *endp;
+        int ret;
+        efd = simple_strtoul(buffer, &endp, 10);
+        if (*endp != ' ')
+                return -EINVAL;
+        buffer = endp + 1;
+        cfd = simple_strtoul(buffer, &endp, 10);
+        /*
+         * When the input ends right after <control_fd>, leave buffer on the
+         * terminating NUL (empty args) instead of stepping past the end of
+         * the string.
+         */
+        if (*endp == '\0')
+                buffer = endp;
+        else if (*endp == ' ')
+                buffer = endp + 1;
+        else
+                return -EINVAL;
+        event = kzalloc(sizeof(*event), GFP_KERNEL);
+        if (!event)
+                return -ENOMEM;
+        event->cgrp = cgrp;
+        INIT_LIST_HEAD(&event->list);
+        init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
+        init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
+        INIT_WORK(&event->remove, cgroup_event_remove);
+        efile = eventfd_fget(efd);
+        if (IS_ERR(efile)) {
+                ret = PTR_ERR(efile);
+                goto fail;
+        }
+        event->eventfd = eventfd_ctx_fileget(efile);
+        if (IS_ERR(event->eventfd)) {
+                ret = PTR_ERR(event->eventfd);
+                goto fail;
+        }
+        cfile = fget(cfd);
+        if (!cfile) {
+                ret = -EBADF;
+                goto fail;
+        }
+        /* the process needs read permission on the control file */
+        ret = file_permission(cfile, MAY_READ);
+        if (ret < 0)
+                goto fail;
+        event->cft = __file_cft(cfile);
+        if (IS_ERR(event->cft)) {
+                ret = PTR_ERR(event->cft);
+                goto fail;
+        }
+        /* both callbacks are required: unregister_event is used on teardown */
+        if (!event->cft->register_event || !event->cft->unregister_event) {
+                ret = -EINVAL;
+                goto fail;
+        }
+        ret = event->cft->register_event(cgrp, event->cft,
+                        event->eventfd, buffer);
+        if (ret)
+                goto fail;
+        /*
+         * Polling with event->pt runs cgroup_event_ptable_queue_proc, which
+         * hooks event->wait onto the eventfd's wait queue. If the eventfd is
+         * already hung up, undo the registration and report success.
+         */
+        if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
+                event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+                ret = 0;
+                goto fail;
+        }
+        /*
+         * Events should be removed after rmdir of cgroup directory, but before
+         * destroying subsystem state objects. Let's take reference to cgroup
+         * directory dentry to do that.
+         */
+        dget(cgrp->dentry);
+        spin_lock(&cgrp->event_list_lock);
+        list_add(&event->list, &cgrp->event_list);
+        spin_unlock(&cgrp->event_list_lock);
+        fput(cfile);
+        fput(efile);
+        return 0;
+fail:
+        /* release only what was actually acquired before the failure */
+        if (cfile)
+                fput(cfile);
+        if (event && event->eventfd && !IS_ERR(event->eventfd))
+                eventfd_ctx_put(event->eventfd);
+        if (!IS_ERR_OR_NULL(efile))
+                fput(efile);
+        kfree(event);
+        return ret;
+}
+/*
 * for the common functions, 'private' gives the type of file
 */
 /* for hysterical raisins, we can't put this on the older files */
@@ -2814,6 +3176,11 @@ static struct cftype files[] = {
                .read_u64 = cgroup_read_notify_on_release,
                .write_u64 = cgroup_write_notify_on_release,
        },
+        {
+                .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
+                .write_string = cgroup_write_event_control,
+                .mode = S_IWUGO,
+        },
 };
 static struct cftype cft_release_agent = {
@@ -2878,8 +3245,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
        /* We need to take each hierarchy_mutex in a consistent order */
        int i;
+        /*
+         * No worry about a race with rebind_subsystems that might mess up the
+         * locking order, since both parties are under cgroup_mutex.
+         */
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
+                if (ss == NULL)
+                        continue;
                if (ss->root == root)
                        mutex_lock(&ss->hierarchy_mutex);
        }
@@ -2891,6 +3264,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
+                if (ss == NULL)
+                        continue;
                if (ss->root == root)
                        mutex_unlock(&ss->hierarchy_mutex);
        }
@@ -2937,14 +3312,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
        for_each_subsys(root, ss) {
                struct cgroup_subsys_state *css = ss->create(ss, cgrp);
                if (IS_ERR(css)) {
                        err = PTR_ERR(css);
                        goto err_destroy;
                }
                init_cgroup_css(css, ss, cgrp);
-                if (ss->use_id)
+                if (ss->use_id) {
-                        if (alloc_css_id(ss, parent, cgrp))
+                        err = alloc_css_id(ss, parent, cgrp);
+                        if (err)
                                goto err_destroy;
+                }
                /* At error, ->destroy() callback has to free assigned ID. */
        }
@@ -3011,11 +3389,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
         * synchronization other than RCU, and the subsystem linked
         * list isn't RCU-safe */
        int i;
+        /*
+         * We won't need to lock the subsys array, because the subsystems
+         * we're concerned about aren't going anywhere since our cgroup root
+         * has a reference on them.
+         */
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
                struct cgroup_subsys_state *css;
-                /* Skip subsystems not in this hierarchy */
+                /* Skip subsystems not present or not in this hierarchy */
-                if (ss->root != cgrp->root)
+                if (ss == NULL || ss->root != cgrp->root)
                        continue;
                css = cgrp->subsys[ss->subsys_id];
                /* When called from check_for_release() it's possible
@@ -3089,6 +3472,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
        struct dentry *d;
        struct cgroup *parent;
        DEFINE_WAIT(wait);
+        struct cgroup_event *event, *tmp;
        int ret;
        /* the vfs holds both inode->i_mutex already */
@@ -3172,6 +3556,20 @@ again:
        set_bit(CGRP_RELEASABLE, &parent->flags);
        check_for_release(parent);
+        /*
+         * Unregister events and notify userspace.
+         * Notify userspace about the cgroup's removal only after rmdir of the
+         * cgroup directory, to avoid a race between userspace and kernelspace.
+         */
+        spin_lock(&cgrp->event_list_lock);
+        list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
+                list_del(&event->list);
+                remove_wait_queue(event->wqh, &event->wait);
+                eventfd_signal(event->eventfd, 1);
+                schedule_work(&event->remove);
+        }
+        spin_unlock(&cgrp->event_list_lock);
        mutex_unlock(&cgroup_mutex);
        return 0;
 }
@@ -3206,9 +3604,198 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
        mutex_init(&ss->hierarchy_mutex);
        lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
        ss->active = 1;
+        /* this function shouldn't be used with modular subsystems, since they
+         * need to register a subsys_id, among other things */
+        BUG_ON(ss->module);
 }
 /**
+ * cgroup_load_subsys: load and register a modular subsystem at runtime
+ * @ss: the subsystem to load
+ *
+ * This function should be called in a modular subsystem's initcall. If the
+ * subsystem is built as a module, it will be assigned a new subsys_id and set
+ * up for use. If the subsystem is built-in anyway, work is delegated to the
+ * simpler cgroup_init_subsys.
+ *
+ * Returns 0 on success, -EINVAL if @ss lacks a valid name or create/destroy
+ * callbacks or defines fork/exit callbacks, -EBUSY if every modular slot in
+ * subsys[] is taken, or the error from ss->create() or cgroup_init_idr().
+ */
+int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
+{
+        int i;
+        struct cgroup_subsys_state *css;
+        /* check name and function validity */
+        if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
+            ss->create == NULL || ss->destroy == NULL)
+                return -EINVAL;
+        /*
+         * we don't support callbacks in modular subsystems. this check is
+         * before the ss->module check for consistency; a subsystem that could
+         * be a module should still have no callbacks even if the user isn't
+         * compiling it as one.
+         */
+        if (ss->fork || ss->exit)
+                return -EINVAL;
+        /*
+         * an optionally modular subsystem is built-in: we want to do nothing,
+         * since cgroup_init_subsys will have already taken care of it.
+         */
+        if (ss->module == NULL) {
+                /* a few sanity checks */
+                BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
+                BUG_ON(subsys[ss->subsys_id] != ss);
+                return 0;
+        }
+        /*
+         * need to register a subsys id before anything else - for example,
+         * init_cgroup_css needs it.
+         */
+        mutex_lock(&cgroup_mutex);
+        /* find the first empty slot in the array */
+        for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+                if (subsys[i] == NULL)
+                        break;
+        }
+        if (i == CGROUP_SUBSYS_COUNT) {
+                /* maximum number of subsystems already registered! */
+                mutex_unlock(&cgroup_mutex);
+                return -EBUSY;
+        }
+        /* assign ourselves the subsys_id */
+        ss->subsys_id = i;
+        subsys[i] = ss;
+        /*
+         * no ss->create seems to need anything important in the ss struct, so
+         * this can happen first (i.e. before the rootnode attachment).
+         */
+        css = ss->create(ss, dummytop);
+        if (IS_ERR(css)) {
+                /* failure case - need to deassign the subsys[] slot. */
+                subsys[i] = NULL;
+                mutex_unlock(&cgroup_mutex);
+                return PTR_ERR(css);
+        }
+        list_add(&ss->sibling, &rootnode.subsys_list);
+        ss->root = &rootnode;
+        /* our new subsystem will be attached to the dummy hierarchy. */
+        init_cgroup_css(css, ss, dummytop);
+        /* init_idr must be after init_cgroup_css because it sets css->id. */
+        if (ss->use_id) {
+                int ret = cgroup_init_idr(ss, css);
+                if (ret) {
+                        /*
+                         * Undo everything done so far. ss->destroy has to run
+                         * while dummytop->subsys[] still points at the css,
+                         * since it uses that pointer to find its state, and
+                         * the subsystem has to come off rootnode.subsys_list
+                         * so the list keeps no link to a subsystem that
+                         * failed to load.
+                         */
+                        ss->destroy(ss, dummytop);
+                        dummytop->subsys[ss->subsys_id] = NULL;
+                        list_del(&ss->sibling);
+                        subsys[i] = NULL;
+                        mutex_unlock(&cgroup_mutex);
+                        return ret;
+                }
+        }
+        /*
+         * Now we need to entangle the css into the existing css_sets. unlike
+         * in cgroup_init_subsys, there are now multiple css_sets, so each one
+         * will need a new pointer to it; done by iterating the css_set_table.
+         * furthermore, modifying the existing css_sets will corrupt the hash
+         * table state, so each changed css_set will need its hash recomputed.
+         * this is all done under the css_set_lock.
+         */
+        write_lock(&css_set_lock);
+        for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
+                struct css_set *cg;
+                struct hlist_node *node, *tmp;
+                struct hlist_head *bucket = &css_set_table[i], *new_bucket;
+                hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
+                        /* skip entries that we already rehashed */
+                        if (cg->subsys[ss->subsys_id])
+                                continue;
+                        /* remove existing entry */
+                        hlist_del(&cg->hlist);
+                        /* set new value */
+                        cg->subsys[ss->subsys_id] = css;
+                        /* recompute hash and restore entry */
+                        new_bucket = css_set_hash(cg->subsys);
+                        hlist_add_head(&cg->hlist, new_bucket);
+                }
+        }
+        write_unlock(&css_set_lock);
+        mutex_init(&ss->hierarchy_mutex);
+        lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
+        ss->active = 1;
+        /* success! */
+        mutex_unlock(&cgroup_mutex);
+        return 0;
+}
+EXPORT_SYMBOL_GPL(cgroup_load_subsys);
+/**
+ * cgroup_unload_subsys: unload a modular subsystem
+ * @ss: the subsystem to unload
+ *
+ * Meant to be called from a modular subsystem's exitcall. By the time this
+ * runs, the refcount on the subsystem's module will be 0, so the subsystem
+ * will not be attached to any hierarchy.
+ */
+void cgroup_unload_subsys(struct cgroup_subsys *ss)
+{
+        struct cg_cgroup_link *lnk;
+        BUG_ON(ss->module == NULL);
+        /*
+         * The subsystem must not be in use when we get here, and the
+         * try_module_get in parse_cgroupfs_options should keep it from
+         * becoming used while it is being torn down.
+         */
+        BUG_ON(ss->root != &rootnode);
+        mutex_lock(&cgroup_mutex);
+        /* give the subsys[] slot back */
+        BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
+        subsys[ss->subsys_id] = NULL;
+        /* take the subsystem off rootnode's list of subsystems */
+        list_del(&ss->sibling);
+        /*
+         * Detach the css from every css_set linked to dummytop. Clearing the
+         * pointer changes the set's hash, so each set is unhashed first and
+         * re-added to the bucket for its new hash afterwards.
+         */
+        write_lock(&css_set_lock);
+        list_for_each_entry(lnk, &dummytop->css_sets, cgrp_link_list) {
+                struct css_set *cset = lnk->cg;
+                struct hlist_head *bucket;
+                hlist_del(&cset->hlist);
+                BUG_ON(!cset->subsys[ss->subsys_id]);
+                cset->subsys[ss->subsys_id] = NULL;
+                bucket = css_set_hash(cset->subsys);
+                hlist_add_head(&cset->hlist, bucket);
+        }
+        write_unlock(&css_set_lock);
+        /*
+         * Free the subsystem's css on dummytop before clearing the pointer:
+         * ss->destroy needs the cgrp->subsys pointer to find its state. This
+         * also takes care of freeing the css_id.
+         */
+        ss->destroy(ss, dummytop);
+        dummytop->subsys[ss->subsys_id] = NULL;
+        mutex_unlock(&cgroup_mutex);
+}
+EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
+/**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Initialize cgroups at system boot, and initialize any
@@ -3236,7 +3823,8 @@ int __init cgroup_init_early(void)
        for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
                INIT_HLIST_HEAD(&css_set_table[i]);
-        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+        /* at bootup time, we don't worry about modular subsystems */
+        for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
                BUG_ON(!ss->name);
@@ -3271,12 +3859,13 @@ int __init cgroup_init(void)
        if (err)
                return err;
-        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+        /* at bootup time, we don't worry about modular subsystems */
+        for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
                if (!ss->early_init)
                        cgroup_init_subsys(ss);
                if (ss->use_id)
-                        cgroup_subsys_init_idr(ss);
+                        cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
        }
        /* Add init_css_set to the hash table */
@@ -3380,9 +3969,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
        int i;
        seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+        /*
+         * ideally we don't want subsystems moving around while we do this.
+         * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+         * subsys/hierarchy state.
+         */
        mutex_lock(&cgroup_mutex);
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
+                if (ss == NULL)
+                        continue;
                seq_printf(m, "%s\t%d\t%d\t%d\n",
                           ss->name, ss->root->hierarchy_id,
                           ss->root->number_of_cgroups, !ss->disabled);
@@ -3440,7 +4036,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
 {
        if (need_forkexit_callback) {
                int i;
-                for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+                /*
+                 * forkexit callbacks are only supported for builtin
+                 * subsystems, and the builtin section of the subsys array is
+                 * immutable, so we don't need to lock the subsys array here.
+                 */
+                for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                        struct cgroup_subsys *ss = subsys[i];
                        if (ss->fork)
                                ss->fork(ss, child);
@@ -3509,7 +4110,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
        struct css_set *cg;
        if (run_callbacks && need_forkexit_callback) {
-                for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+                /*
+                 * modular subsystems can't use callbacks, so no need to lock
+                 * the subsys array
+                 */
+                for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                        struct cgroup_subsys *ss = subsys[i];
                        if (ss->exit)
                                ss->exit(ss, tsk);
@@ -3703,12 +4308,13 @@ static void check_for_release(struct cgroup *cgrp)
        }
 }
-void __css_put(struct cgroup_subsys_state *css)
+/* Caller must verify that the css is not for root cgroup */
+void __css_put(struct cgroup_subsys_state *css, int count)
 {
        struct cgroup *cgrp = css->cgroup;
        int val;
        rcu_read_lock();
-        val = atomic_dec_return(&css->refcnt);
+        val = atomic_sub_return(count, &css->refcnt);
        if (val == 1) {
                if (notify_on_release(cgrp)) {
                        set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3719,6 +4325,7 @@ void __css_put(struct cgroup_subsys_state *css)
        rcu_read_unlock();
        WARN_ON_ONCE(val < 1);
 }
+EXPORT_SYMBOL_GPL(__css_put);
 /*
 * Notify userspace when a cgroup is released, by running the
@@ -3800,8 +4407,11 @@ static int __init cgroup_disable(char *str)
        while ((token = strsep(&str, ",")) != NULL) {
                if (!*token)
                        continue;
+                /*
-                for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+                 * cgroup_disable, being at boot time, can't know about module
+                 * subsystems, so we don't worry about them.
+                 */
+                for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                        struct cgroup_subsys *ss = subsys[i];
                        if (!strcmp(token, ss->name)) {
@@ -3825,31 +4435,65 @@ __setup("cgroup_disable=", cgroup_disable);
 */
 unsigned short css_id(struct cgroup_subsys_state *css)
 {
-        struct css_id *cssid = rcu_dereference(css->id);
+        struct css_id *cssid;
+        /*
+         * css_id() returns a correct value when someone holds a refcnt on
+         * this css or the caller is under rcu_read_lock(). Once css->id is
+         * allocated, it's unchanged until freed.
+         */
+        cssid = rcu_dereference_check(css->id,
+                        rcu_read_lock_held() || atomic_read(&css->refcnt));
        if (cssid)
                return cssid->id;
        return 0;
 }
+EXPORT_SYMBOL_GPL(css_id);
 unsigned short css_depth(struct cgroup_subsys_state *css)
 {
-        struct css_id *cssid = rcu_dereference(css->id);
+        struct css_id *cssid;
+        cssid = rcu_dereference_check(css->id,
+                        rcu_read_lock_held() || atomic_read(&css->refcnt));
        if (cssid)
                return cssid->depth;
        return 0;
 }
+EXPORT_SYMBOL_GPL(css_depth);
+/**
+ * css_is_ancestor - test whether "root" css is an ancestor of "child"
+ * @child: the css to be tested.
+ * @root: the css supposed to be an ancestor of the child.
+ *
+ * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
+ * this function reads css->id, it uses rcu_dereference() and rcu_read_lock().
+ * But, considering usual usage, the csses should be valid objects after the
+ * test. Assuming that the caller will do some action to the child if this
+ * returns true, the caller must hold a reference count on "child".
+ * If "child" is a valid object and this returns true, "root" is valid, too.
+ */
 bool css_is_ancestor(struct cgroup_subsys_state *child,
                    const struct cgroup_subsys_state *root)
 {
-        struct css_id *child_id = rcu_dereference(child->id);
+        struct css_id *child_id;
-        struct css_id *root_id = rcu_dereference(root->id);
+        struct css_id *root_id;
+        bool ret = true;
-        if (!child_id || !root_id || (child_id->depth < root_id->depth))
+        rcu_read_lock();
-                return false;
+        child_id  = rcu_dereference(child->id);
-        return child_id->stack[root_id->depth] == root_id->id;
+        root_id = rcu_dereference(root->id);
+        if (!child_id
+            || !root_id
+            || (child_id->depth < root_id->depth)
+            || (child_id->stack[root_id->depth] != root_id->id))
+                ret = false;
+        rcu_read_unlock();
+        return ret;
 }
 static void __free_css_id_cb(struct rcu_head *head)
@@ -3876,6 +4520,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
        spin_unlock(&ss->id_lock);
        call_rcu(&id->rcu_head, __free_css_id_cb);
 }
+EXPORT_SYMBOL_GPL(free_css_id);
 /*
 * This is called by init or create(). Then, calls to this function are
@@ -3925,15 +4570,14 @@ err_out:
 }
-static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
+static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
+                                            struct cgroup_subsys_state *rootcss)
 {
        struct css_id *newid;
-        struct cgroup_subsys_state *rootcss;
        spin_lock_init(&ss->id_lock);
        idr_init(&ss->idr);
-        rootcss = init_css_set.subsys[ss->subsys_id];
        newid = get_new_cssid(ss, 0);
        if (IS_ERR(newid))
                return PTR_ERR(newid);
@@ -3949,13 +4593,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
 {
        int subsys_id, i, depth = 0;
        struct cgroup_subsys_state *parent_css, *child_css;
-        struct css_id *child_id, *parent_id = NULL;
+        struct css_id *child_id, *parent_id;
        subsys_id = ss->subsys_id;
        parent_css = parent->subsys[subsys_id];
        child_css = child->subsys[subsys_id];
-        depth = css_depth(parent_css) + 1;
        parent_id = parent_css->id;
+        depth = parent_id->depth + 1;   /* child sits one level below its parent */
        child_id = get_new_cssid(ss, depth);
        if (IS_ERR(child_id))
@@ -3993,6 +4637,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
        return rcu_dereference(cssid->css);
 }
+EXPORT_SYMBOL_GPL(css_lookup);
 /**
 * css_get_next - lookup next cgroup under specified hierarchy.
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 59e9ef6aab40..e5c0244962b0 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -15,6 +15,7 @@
 */
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/cgroup.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
@@ -47,17 +48,20 @@ static inline struct freezer *task_freezer(struct task_struct *task)
                            struct freezer, css);
 }
-int cgroup_frozen(struct task_struct *task)
+int cgroup_freezing_or_frozen(struct task_struct *task)
 {
        struct freezer *freezer;
        enum freezer_state state;
        task_lock(task);
        freezer = task_freezer(task);
-        state = freezer->state;
+        if (!freezer->css.cgroup->parent)
+                state = CGROUP_THAWED; /* root cgroup can't be frozen */
+        else
+                state = freezer->state;
        task_unlock(task);
-        return state == CGROUP_FROZEN;
+        return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
 }
 /*
@@ -201,9 +205,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
         * No lock is needed, since the task isn't on tasklist yet,
         * so it can't be moved to another cgroup, which means the
         * freezer won't be removed and will be valid during this
-         * function call.
+         * function call.  Nevertheless, apply RCU read-side critical
+         * section to suppress RCU lockdep false positives.
         */
+        rcu_read_lock();
        freezer = task_freezer(task);
+        rcu_read_unlock();
        /*
         * The root cgroup is non-freezable, so we can skip the
diff --git a/kernel/compat.c b/kernel/compat.c
index f6c204f07ea6..7f40e9275fd9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -25,6 +25,7 @@
 #include <linux/posix-timers.h>
 #include <linux/times.h>
 #include <linux/ptrace.h>
+#include <linux/gfp.h>
 #include <asm/uaccess.h>
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ba0f1ecb212..25bba73b1be3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -14,6 +14,7 @@
 #include <linux/kthread.h>
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
+#include <linux/gfp.h>
 #ifdef CONFIG_SMP
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -151,13 +152,13 @@ static inline void check_for_tasks(int cpu)
        write_lock_irq(&tasklist_lock);
        for_each_process(p) {
-                if (task_cpu(p) == cpu &&
+                if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
                    (!cputime_eq(p->utime, cputime_zero) ||
                     !cputime_eq(p->stime, cputime_zero)))
-                        printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
+                        printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
-                                (state = %ld, flags = %x) \n",
+                                "(state = %ld, flags = %x)\n",
-                                 p->comm, task_pid_nr(p), cpu,
+                                p->comm, task_pid_nr(p), cpu,
-                                 p->state, p->flags);
+                                p->state, p->flags);
        }
        write_unlock_irq(&tasklist_lock);
 }
@@ -209,9 +210,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
                return -ENOMEM;
        cpu_hotplug_begin();
+        set_cpu_active(cpu, false);
        err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
                                        hcpu, -1, &nr_calls);
        if (err == NOTIFY_BAD) {
+                set_cpu_active(cpu, true);
                nr_calls--;
                __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
                                          hcpu, nr_calls, NULL);
@@ -223,11 +227,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
        /* Ensure that we are not runnable on dying cpu */
        cpumask_copy(old_allowed, &current->cpus_allowed);
-        set_cpus_allowed_ptr(current,
+        set_cpus_allowed_ptr(current, cpu_active_mask);
-                             cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
        err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
        if (err) {
+                set_cpu_active(cpu, true);
                /* CPU didn't die: tell everyone.  Can't complain. */
                if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
                                            hcpu) == NOTIFY_BAD)
@@ -278,23 +282,8 @@ int __ref cpu_down(unsigned int cpu)
                goto out;
        }
-        set_cpu_active(cpu, false);
-        /*
-         * Make sure the all cpus did the reschedule and are not
-         * using stale version of the cpu_active_mask.
-         * This is not strictly necessary becuase stop_machine()
-         * that we run down the line already provides the required
-         * synchronization. But it's really a side effect and we do not
-         * want to depend on the innards of the stop_machine here.
-         */
-        synchronize_sched();
        err = _cpu_down(cpu, 0);
-        if (cpu_online(cpu))
-                set_cpu_active(cpu, true);
 out:
        cpu_maps_update_done();
        stop_machine_destroy();
@@ -350,7 +339,7 @@ int __cpuinit cpu_up(unsigned int cpu)
        if (!cpu_possible(cpu)) {
                printk(KERN_ERR "can't online cpu %d because it is not "
                        "configured as may-hotadd at boot time\n", cpu);
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+#if defined(CONFIG_IA64)
                printk(KERN_ERR "please check additional_cpus= boot "
                                "parameter\n");
 #endif
@@ -383,19 +372,20 @@ int disable_nonboot_cpus(void)
                return error;
        cpu_maps_update_begin();
        first_cpu = cpumask_first(cpu_online_mask);
-        /* We take down all of the non-boot CPUs in one shot to avoid races
+        /*
+         * We take down all of the non-boot CPUs in one shot to avoid races
         * with the userspace trying to use the CPU hotplug at the same time
         */
        cpumask_clear(frozen_cpus);
        printk("Disabling non-boot CPUs ...\n");
        for_each_online_cpu(cpu) {
                if (cpu == first_cpu)
                        continue;
                error = _cpu_down(cpu, 1);
-                if (!error) {
+                if (!error)
                        cpumask_set_cpu(cpu, frozen_cpus);
-                        printk("CPU%d is down\n", cpu);
+                else {
-                } else {
                        printk(KERN_ERR "Error taking CPU%d down: %d\n",
                                cpu, error);
                        break;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b5cb469d2545..d10946748ec2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 *      element of the partition (one sched domain) to be passed to
 *      partition_sched_domains().
 */
-/* FIXME: see the FIXME in partition_sched_domains() */
+static int generate_sched_domains(cpumask_var_t **domains,
-static int generate_sched_domains(struct cpumask **domains,
                        struct sched_domain_attr **attributes)
 {
        LIST_HEAD(q);           /* queue of cpusets to be scanned */
@@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains,
        struct cpuset **csa;    /* array of all cpuset ptrs */
        int csn;                /* how many cpuset ptrs in csa so far */
        int i, j, k;            /* indices for partition finding loops */
-        struct cpumask *doms;   /* resulting partition; i.e. sched domains */
+        cpumask_var_t *doms;    /* resulting partition; i.e. sched domains */
        struct sched_domain_attr *dattr;  /* attributes for custom domains */
        int ndoms = 0;          /* number of sched domains in result */
        int nslot;              /* next empty doms[] struct cpumask slot */
@@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains,
        /* Special case for the 99% of systems with one, full, sched domain */
        if (is_sched_load_balance(&top_cpuset)) {
-                doms = kmalloc(cpumask_size(), GFP_KERNEL);
+                ndoms = 1;
+                doms = alloc_sched_domains(ndoms);
                if (!doms)
                        goto done;
@@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains,
                        *dattr = SD_ATTR_INIT;
                        update_domain_attr_tree(dattr, &top_cpuset);
                }
-                cpumask_copy(doms, top_cpuset.cpus_allowed);
+                cpumask_copy(doms[0], top_cpuset.cpus_allowed);
-                ndoms = 1;
                goto done;
        }
@@ -636,7 +635,7 @@ restart:
         * Now we know how many domains to create.
         * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
         */
-        doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
+        doms = alloc_sched_domains(ndoms);
        if (!doms)
                goto done;
@@ -656,7 +655,7 @@ restart:
                        continue;
                }
-                dp = doms + nslot;
+                dp = doms[nslot];
                if (nslot == ndoms) {
                        static int warnings = 10;
@@ -718,7 +717,7 @@ done:
 static void do_rebuild_sched_domains(struct work_struct *unused)
 {
        struct sched_domain_attr *attr;
-        struct cpumask *doms;
+        cpumask_var_t *doms;
        int ndoms;
        get_online_cpus();
@@ -738,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
 {
 }
-static int generate_sched_domains(struct cpumask **domains,
+static int generate_sched_domains(cpumask_var_t **domains,
                        struct sched_domain_attr **attributes)
 {
        *domains = NULL;
@@ -873,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
                if (retval < 0)
                        return retval;
-                if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
+                if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
                        return -EINVAL;
        }
        retval = validate_change(cs, trialcs);
@@ -921,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 *    call to guarantee_online_mems(), as we know no one is changing
 *    our task's cpuset.
 *
- *    Hold callback_mutex around the two modifications of our tasks
- *    mems_allowed to synchronize with cpuset_mems_allowed().
- *
 *    While the mm_struct we are migrating is typically from some
 *    other task, the task_struct mems_allowed that we are hacking
 *    is for our current task, which must allocate new pages for that
@@ -974,15 +970,20 @@ static void cpuset_change_nodemask(struct task_struct *p,
        struct cpuset *cs;
        int migrate;
        const nodemask_t *oldmem = scan->data;
-        nodemask_t newmems;
+        NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
+        if (!newmems)
+                return;
        cs = cgroup_cs(scan->cg);
-        guarantee_online_mems(cs, &newmems);
+        guarantee_online_mems(cs, newmems);
        task_lock(p);
-        cpuset_change_task_nodemask(p, &newmems);
+        cpuset_change_task_nodemask(p, newmems);
        task_unlock(p);
+        NODEMASK_FREE(newmems);
        mm = get_task_mm(p);
        if (!mm)
                return;
@@ -1052,16 +1053,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
                           const char *buf)
 {
-        nodemask_t oldmem;
+        NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
        int retval;
        struct ptr_heap heap;
+        if (!oldmem)
+                return -ENOMEM;
        /*
         * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
         * it's read-only
         */
-        if (cs == &top_cpuset)
+        if (cs == &top_cpuset) {
-                return -EACCES;
+                retval = -EACCES;
+                goto done;
+        }
        /*
         * An empty mems_allowed is ok iff there are no tasks in the cpuset.
@@ -1077,11 +1083,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
                        goto done;
                if (!nodes_subset(trialcs->mems_allowed,
-                                node_states[N_HIGH_MEMORY]))
+                                node_states[N_HIGH_MEMORY])) {
-                        return -EINVAL;
+                        retval =  -EINVAL;
+                        goto done;
+                }
        }
-        oldmem = cs->mems_allowed;
+        *oldmem = cs->mems_allowed;
-        if (nodes_equal(oldmem, trialcs->mems_allowed)) {
+        if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
                retval = 0;             /* Too easy - nothing to do */
                goto done;
        }
@@ -1097,10 +1105,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
        cs->mems_allowed = trialcs->mems_allowed;
        mutex_unlock(&callback_mutex);
-        update_tasks_nodemask(cs, &oldmem, &heap);
+        update_tasks_nodemask(cs, oldmem, &heap);
        heap_free(&heap);
 done:
+        NODEMASK_FREE(oldmem);
        return retval;
 }
@@ -1385,40 +1394,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
                          struct cgroup *oldcont, struct task_struct *tsk,
                          bool threadgroup)
 {
-        nodemask_t from, to;
        struct mm_struct *mm;
        struct cpuset *cs = cgroup_cs(cont);
        struct cpuset *oldcs = cgroup_cs(oldcont);
+        NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
+        NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
+        if (from == NULL || to == NULL)
+                goto alloc_fail;
        if (cs == &top_cpuset) {
                cpumask_copy(cpus_attach, cpu_possible_mask);
-                to = node_possible_map;
        } else {
                guarantee_online_cpus(cs, cpus_attach);
-                guarantee_online_mems(cs, &to);
        }
+        guarantee_online_mems(cs, to);
        /* do per-task migration stuff possibly for each in the threadgroup */
-        cpuset_attach_task(tsk, &to, cs);
+        cpuset_attach_task(tsk, to, cs);
        if (threadgroup) {
                struct task_struct *c;
                rcu_read_lock();
                list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-                        cpuset_attach_task(c, &to, cs);
+                        cpuset_attach_task(c, to, cs);
                }
                rcu_read_unlock();
        }
        /* change mm; only needs to be done once even if threadgroup */
-        from = oldcs->mems_allowed;
+        *from = oldcs->mems_allowed;
-        to = cs->mems_allowed;
+        *to = cs->mems_allowed;
        mm = get_task_mm(tsk);
        if (mm) {
-                mpol_rebind_mm(mm, &to);
+                mpol_rebind_mm(mm, to);
                if (is_memory_migrate(cs))
-                        cpuset_migrate_mm(mm, &from, &to);
+                        cpuset_migrate_mm(mm, from, to);
                mmput(mm);
        }
+alloc_fail:
+        NODEMASK_FREE(from);
+        NODEMASK_FREE(to);
 }
 /* The various types of files and directories in a cpuset file system */
@@ -1563,13 +1579,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
-        nodemask_t mask;
+        NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
+        int retval;
+        if (mask == NULL)
+                return -ENOMEM;
        mutex_lock(&callback_mutex);
-        mask = cs->mems_allowed;
+        *mask = cs->mems_allowed;
        mutex_unlock(&callback_mutex);
-        return nodelist_scnprintf(page, PAGE_SIZE, mask);
+        retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
+        NODEMASK_FREE(mask);
+        return retval;
 }
 static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1998,7 +2022,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
        struct cpuset *cp;      /* scans cpusets being updated */
        struct cpuset *child;   /* scans child cpusets of cp */
        struct cgroup *cont;
-        nodemask_t oldmems;
+        NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
+        if (oldmems == NULL)
+                return;
        list_add_tail((struct list_head *)&root->stack_list, &queue);
@@ -2011,16 +2038,16 @@ static void scan_for_empty_cpusets(struct cpuset *root)
                }
                /* Continue past cpusets with all cpus, mems online */
-                if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
+                if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
                    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
                        continue;
-                oldmems = cp->mems_allowed;
+                *oldmems = cp->mems_allowed;
                /* Remove offline cpus and mems from this cpuset. */
                mutex_lock(&callback_mutex);
                cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
-                            cpu_online_mask);
+                            cpu_active_mask);
                nodes_and(cp->mems_allowed, cp->mems_allowed,
                                                node_states[N_HIGH_MEMORY]);
                mutex_unlock(&callback_mutex);
@@ -2031,9 +2058,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
                        remove_tasks_in_empty_cpuset(cp);
                else {
                        update_tasks_cpumask(cp, NULL);
-                        update_tasks_nodemask(cp, &oldmems, NULL);
+                        update_tasks_nodemask(cp, oldmems, NULL);
                }
        }
+        NODEMASK_FREE(oldmems);
 }
 /*
@@ -2052,14 +2080,16 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
                                unsigned long phase, void *unused_cpu)
 {
        struct sched_domain_attr *attr;
-        struct cpumask *doms;
+        cpumask_var_t *doms;
        int ndoms;
        switch (phase) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
-        case CPU_DEAD:
+        case CPU_DOWN_PREPARE:
-        case CPU_DEAD_FROZEN:
+        case CPU_DOWN_PREPARE_FROZEN:
+        case CPU_DOWN_FAILED:
+        case CPU_DOWN_FAILED_FROZEN:
                break;
        default:
@@ -2068,7 +2098,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
        cgroup_lock();
        mutex_lock(&callback_mutex);
-        cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+        cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
        mutex_unlock(&callback_mutex);
        scan_for_empty_cpusets(&top_cpuset);
        ndoms = generate_sched_domains(&doms, &attr);
@@ -2089,20 +2119,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 static int cpuset_track_online_nodes(struct notifier_block *self,
                                unsigned long action, void *arg)
 {
+        NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
+        if (oldmems == NULL)
+                return NOTIFY_DONE;
        cgroup_lock();
        switch (action) {
        case MEM_ONLINE:
-        case MEM_OFFLINE:
+                *oldmems = top_cpuset.mems_allowed;
                mutex_lock(&callback_mutex);
                top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
                mutex_unlock(&callback_mutex);
-                if (action == MEM_OFFLINE)
+                update_tasks_nodemask(&top_cpuset, oldmems, NULL);
-                        scan_for_empty_cpusets(&top_cpuset);
+                break;
+        case MEM_OFFLINE:
+                /*
+                 * needn't update top_cpuset.mems_allowed explicitly because
+                 * scan_for_empty_cpusets() will update it.
+                 */
+                scan_for_empty_cpusets(&top_cpuset);
                break;
        default:
                break;
        }
        cgroup_unlock();
+        NODEMASK_FREE(oldmems);
        return NOTIFY_OK;
 }
 #endif
@@ -2115,7 +2158,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 void __init cpuset_init_smp(void)
 {
-        cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+        cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
        hotcpu_notifier(cpuset_track_online_cpus, 0);
@@ -2537,15 +2580,9 @@ const struct file_operations proc_cpuset_operations = {
 };
 #endif /* CONFIG_PROC_PID_CPUSET */
-/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
+/* Display task mems_allowed in /proc/<pid>/status file. */
 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
 {
-        seq_printf(m, "Cpus_allowed:\t");
-        seq_cpumask(m, &task->cpus_allowed);
-        seq_printf(m, "\n");
-        seq_printf(m, "Cpus_allowed_list:\t");
-        seq_cpumask_list(m, &task->cpus_allowed);
-        seq_printf(m, "\n");
        seq_printf(m, "Mems_allowed:\t");
        seq_nodemask(m, &task->mems_allowed);
        seq_printf(m, "\n");
diff --git a/kernel/cred.c b/kernel/cred.c
index dd76cfe5f5b0..62af1816c235 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -10,6 +10,7 @@
 */
 #include <linux/module.h>
 #include <linux/cred.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/key.h>
 #include <linux/keyctl.h>
@@ -224,7 +225,7 @@ struct cred *cred_alloc_blank(void)
 #ifdef CONFIG_KEYS
        new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
        if (!new->tgcred) {
-                kfree(new);
+                kmem_cache_free(cred_jar, new);
                return NULL;
        }
        atomic_set(&new->tgcred->usage, 1);
@@ -364,7 +365,7 @@ struct cred *prepare_usermodehelper_creds(void)
        new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
        if (!new)
-                return NULL;
+                goto free_tgcred;
        kdebug("prepare_usermodehelper_creds() alloc %p", new);
@@ -398,6 +399,12 @@ struct cred *prepare_usermodehelper_creds(void)
 error:
        put_cred(new);
        return NULL;
+free_tgcred:
+#ifdef CONFIG_KEYS
+        kfree(tgcred);
+#endif
+        return NULL;
 }
 /*
@@ -786,8 +793,6 @@ bool creds_are_invalid(const struct cred *cred)
 {
        if (cred->magic != CRED_MAGIC)
                return true;
-        if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
-                return true;
 #ifdef CONFIG_SECURITY_SELINUX
        if (selinux_is_enabled()) {
                if ((unsigned long) cred->security < PAGE_SIZE)
diff --git a/kernel/early_res.c b/kernel/early_res.c
new file mode 100644
index 000000000000..31aa9332ef3f
--- /dev/null
+++ b/kernel/early_res.c
@@ -0,0 +1,584 @@
+/*
+ * early_res, could be used to replace bootmem
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
+#include <linux/early_res.h>
+/*
+ * Early reserved memory areas.
+ */
+/*
+ * need to make sure this one is big enough before
+ * find_fw_memmap_area could be used
+ */
+#define MAX_EARLY_RES_X 32
+/*
+ * One early reservation: the half-open physical range [start, end).
+ * A slot whose 'end' is 0 is unused and terminates every scan of the array.
+ */
+struct early_res {
+        u64 start, end;
+        char name[15];          /* label; copies write at most sizeof - 1 bytes */
+        char overlap_ok;        /* non-zero: later reservations may overlap this one */
+};
+/* Built-in backing store used until the array has to grow. */
+static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
+/* Capacity of the array that 'early_res' currently points at. */
+static int max_early_res __initdata = MAX_EARLY_RES_X;
+/* Active reservation array; replaced by __check_and_double_early_res(). */
+static struct early_res *early_res __initdata = &early_res_x[0];
+/* Number of reservations recorded in the active array. */
+static int early_res_count __initdata;
+/*
+ * Return the index of the first reservation that intersects [start, end).
+ * When nothing intersects, the result is the index of the first unused
+ * slot, or max_early_res if the array is completely full.
+ */
+static int __init find_overlapped_early(u64 start, u64 end)
+{
+        int idx = 0;
+        /* an entry with end == 0 marks the end of the used slots */
+        while (idx < max_early_res && early_res[idx].end) {
+                const struct early_res *cur = &early_res[idx];
+                if (cur->start < end && cur->end > start)
+                        return idx;
+                idx++;
+        }
+        return idx;
+}
+/*
+ * Drop the i-th range from the early reservation map,
+ * by copying any higher ranges down one over it, and
+ * clearing what had been the last slot.
+ */
+static void __init drop_range(int i)
+{
+        int last = i;
+        /* locate the last used slot at or after i */
+        while (last + 1 < max_early_res && early_res[last + 1].end)
+                last++;
+        /* slide the entries above i down by one ... */
+        memmove(&early_res[i], &early_res[i + 1],
+               (last - i) * sizeof(struct early_res));
+        /* ... and mark the vacated tail slot as unused */
+        early_res[last].end = 0;
+        early_res_count--;
+}
+/*
+ * Remove the part of reservation i that intersects [start, end).
+ *
+ * Depending on how the two ranges overlap, the entry is shortened at its
+ * head or tail, split in two (the tail piece is appended as a new entry),
+ * or dropped entirely.  Nothing happens when the ranges do not intersect.
+ */
+static void __init drop_range_partial(int i, u64 start, u64 end)
+{
+        u64 common_start, common_end;
+        u64 old_start, old_end;
+        old_start = early_res[i].start;
+        old_end = early_res[i].end;
+        common_start = max(old_start, start);
+        common_end = min(old_end, end);
+        /* no overlap ? */
+        if (common_start >= common_end)
+                return;
+        if (old_start < common_start) {
+                /* make head segment */
+                early_res[i].end = common_start;
+                if (old_end > common_end) {
+                        char name[sizeof(early_res[i].name)];
+                        /*
+                         * Save a local copy of the name, since the
+                         * early_res array could get resized inside
+                         * reserve_early_without_check() ->
+                         * __check_and_double_early_res(), which would
+                         * make the current name pointer invalid.
+                         */
+                        strncpy(name, early_res[i].name, sizeof(name) - 1);
+                        /* strncpy() does not terminate a full-length copy */
+                        name[sizeof(name) - 1] = '\0';
+                        /* add another for left over on tail */
+                        reserve_early_without_check(common_end, old_end, name);
+                }
+                return;
+        }
+        if (old_end > common_end) {
+                /* reuse the entry for tail left */
+                early_res[i].start = common_end;
+                return;
+        }
+        /* all covered */
+        drop_range(i);
+}
+/*
+ * Split any existing ranges that:
+ *  1) are marked 'overlap_ok', and
+ *  2) overlap with the stated range [start, end)
+ * into whatever portion (if any) of the existing range is entirely
+ * below or entirely above the stated range.  Drop the portion
+ * of the existing range that overlaps with the stated range,
+ * which will allow the caller of this routine to then add that
+ * stated range without conflicting with any existing range.
+ */
+static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
+{
+        int i;
+        struct early_res *r;
+        u64 lower_start, lower_end;
+        u64 upper_start, upper_end;
+        char name[15];
+        for (i = 0; i < max_early_res && early_res[i].end; i++) {
+                r = &early_res[i];
+                /* Continue past non-overlapping ranges */
+                if (end <= r->start || start >= r->end)
+                        continue;
+                /*
+                 * Leave non-ok overlaps as is; let caller
+                 * panic "Overlapping early reservations"
+                 * when it hits this overlap.
+                 */
+                if (!r->overlap_ok)
+                        return;
+                /*
+                 * We have an ok overlap.  We will drop it from the early
+                 * reservation map, and add back in any non-overlapping
+                 * portions (lower or upper) as separate, overlap_ok,
+                 * non-overlapping ranges.
+                 */
+                /* 1. Note any non-overlapping (lower or upper) ranges. */
+                strncpy(name, r->name, sizeof(name) - 1);
+                /*
+                 * strncpy() leaves the buffer unterminated when the source
+                 * fills it; terminate explicitly because the name may end
+                 * up in a "%s" of the overlap panic message.
+                 */
+                name[sizeof(name) - 1] = '\0';
+                lower_start = lower_end = 0;
+                upper_start = upper_end = 0;
+                if (r->start < start) {
+                        lower_start = r->start;
+                        lower_end = start;
+                }
+                if (r->end > end) {
+                        upper_start = end;
+                        upper_end = r->end;
+                }
+                /* 2. Drop the original ok overlapping range */
+                drop_range(i);
+                i--;            /* resume for-loop on copied down entry */
+                /* 3. Add back in any non-overlapping ranges. */
+                if (lower_end)
+                        reserve_early_overlap_ok(lower_start, lower_end, name);
+                if (upper_end)
+                        reserve_early_overlap_ok(upper_start, upper_end, name);
+        }
+}
+/*
+ * Record [start, end) in the first free slot of the reservation array.
+ *
+ * Panics when the array is full or when the range intersects an existing
+ * reservation.  'name' may be NULL, in which case the entry gets an empty
+ * label.  'overlap_ok' is stored verbatim in the new entry.
+ */
+static void __init __reserve_early(u64 start, u64 end, char *name,
+                                                int overlap_ok)
+{
+        int i;
+        struct early_res *r;
+        i = find_overlapped_early(start, end);
+        if (i >= max_early_res)
+                panic("Too many early reservations");
+        r = &early_res[i];
+        /* a used slot here means find_overlapped_early() hit an overlap */
+        if (r->end)
+                panic("Overlapping early reservations "
+                      "%llx-%llx %s to %llx-%llx %s\n",
+                      start, end - 1, name ? name : "", r->start,
+                      r->end - 1, r->name);
+        r->start = start;
+        r->end = end;
+        r->overlap_ok = overlap_ok;
+        /*
+         * drop_range() only clears 'end' of the slot it vacates, so a
+         * recycled slot may still carry an old label; wipe it first.
+         */
+        memset(r->name, 0, sizeof(r->name));
+        if (name)
+                strncpy(r->name, name, sizeof(r->name) - 1);
+        early_res_count++;
+}
+/*
+ * A few early reservations come here.
+ *
+ * The 'overlap_ok' in the name of this routine does -not- mean it
+ * is ok for these reservations to overlap an earlier reservation.
+ * Rather it means that it is ok for subsequent reservations to
+ * overlap this one.
+ *
+ * Use this entry point to reserve early ranges when you are doing
+ * so out of "Paranoia", reserving perhaps more memory than you need,
+ * just in case, and don't mind a subsequent overlapping reservation
+ * that is known to be needed.
+ *
+ * The drop_overlaps_that_are_ok() call here isn't really needed.
+ * It would be needed if we had two colliding 'overlap_ok'
+ * reservations, so that the second such would not panic on the
+ * overlap with the first.  We don't have any such as of this
+ * writing, but might as well tolerate such if it happens in
+ * the future.
+ */
+void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
+{
+        /* trim or split existing overlap_ok entries that intersect the range */
+        drop_overlaps_that_are_ok(start, end);
+        /* record the range with overlap_ok set (last argument) */
+        __reserve_early(start, end, name, 1);
+}
+/*
+ * Grow the reservation array to twice its capacity when it is nearly full.
+ *
+ * [ex_start, ex_end) is a range the new array must stay clear of: the
+ * search for backing memory first looks below ex_start and then between
+ * ex_end and get_max_mapped().  Panics if no room can be found.
+ *
+ * Slot 0 of a relocated array always describes the array's own memory.
+ */
+static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
+{
+        u64 start, end, size, mem;
+        struct early_res *new;
+        /* do we have enough slots left ? */
+        if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
+                return;
+        /* double it */
+        mem = -1ULL;
+        size = sizeof(struct early_res) * max_early_res * 2;
+        /* a relocated array starts its search right after its own memory */
+        if (early_res == early_res_x)
+                start = 0;
+        else
+                start = early_res[0].end;
+        end = ex_start;
+        if (start + size < end)
+                mem = find_fw_memmap_area(start, end, size,
+                                         sizeof(struct early_res));
+        if (mem == -1ULL) {
+                /* nothing below the excluded range, try above it */
+                start = ex_end;
+                end = get_max_mapped();
+                if (start + size < end)
+                        mem = find_fw_memmap_area(start, end, size,
+                                                 sizeof(struct early_res));
+        }
+        if (mem == -1ULL)
+                panic("can not find more space for early_res array");
+        new = __va(mem);
+        /*
+         * The freshly found memory has arbitrary contents; zero the
+         * self-describing entry so that its name is an empty string
+         * rather than uninitialised bytes.
+         */
+        memset(&new[0], 0, sizeof(new[0]));
+        /* save the first one for own */
+        new[0].start = mem;
+        new[0].end = mem + size;
+        new[0].overlap_ok = 0;
+        /* copy old to new */
+        if (early_res == early_res_x) {
+                /* static array has no self entry: keep every old slot */
+                memcpy(&new[1], &early_res[0],
+                         sizeof(struct early_res) * max_early_res);
+                memset(&new[max_early_res+1], 0,
+                         sizeof(struct early_res) * (max_early_res - 1));
+                /* account for the newly added self entry */
+                early_res_count++;
+        } else {
+                /* old slot 0 described the old array; new[0] replaces it */
+                memcpy(&new[1], &early_res[1],
+                         sizeof(struct early_res) * (max_early_res - 1));
+                memset(&new[max_early_res], 0,
+                         sizeof(struct early_res) * max_early_res);
+        }
+        memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
+        early_res = new;
+        max_early_res *= 2;
+        printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
+                max_early_res, mem, mem + size - 1);
+}
+/*
+ * Most early reservations come here.
+ *
+ * We first have drop_overlaps_that_are_ok() drop any pre-existing
+ * 'overlap_ok' ranges, so that we can then reserve this memory
+ * range without risk of panic'ing on an overlapping overlap_ok
+ * early reservation.
+ */
+void __init reserve_early(u64 start, u64 end, char *name)
+{
+        /* empty or inverted ranges are silently ignored */
+        if (start < end) {
+                /* make room first, then clear away tolerated overlaps */
+                __check_and_double_early_res(start, end);
+                drop_overlaps_that_are_ok(start, end);
+                __reserve_early(start, end, name, 0);
+        }
+}
+/*
+ * Append [start, end) to the reservation array without looking for
+ * overlaps with existing entries.  Empty or inverted ranges are ignored.
+ * 'name' may be NULL, in which case the entry gets an empty label.
+ */
+void __init reserve_early_without_check(u64 start, u64 end, char *name)
+{
+        struct early_res *r;
+        if (start >= end)
+                return;
+        __check_and_double_early_res(start, end);
+        /* take the slot just past the last recorded reservation */
+        r = &early_res[early_res_count];
+        r->start = start;
+        r->end = end;
+        r->overlap_ok = 0;
+        /*
+         * drop_range() only clears 'end' of the slot it vacates, so this
+         * slot may still carry an old label; wipe it first.
+         */
+        memset(r->name, 0, sizeof(r->name));
+        if (name)
+                strncpy(r->name, name, sizeof(r->name) - 1);
+        early_res_count++;
+}
+/*
+ * Release a reservation.  The range must match an existing entry
+ * exactly; anything else is treated as a fatal error.
+ */
+void __init free_early(u64 start, u64 end)
+{
+        int idx = find_overlapped_early(start, end);
+        if (idx >= max_early_res ||
+            early_res[idx].start != start || early_res[idx].end != end)
+                panic("free_early on not reserved area: %llx-%llx!",
+                         start, end - 1);
+        drop_range(idx);
+}
+/*
+ * Release whatever parts of existing reservations intersect [start, end).
+ *
+ * An empty range is a no-op; an inverted range triggers a one-time
+ * warning and is otherwise ignored.  Reservations that are only partly
+ * covered are trimmed or split rather than removed.
+ */
+void __init free_early_partial(u64 start, u64 end)
+{
+        struct early_res *r;
+        int i;
+        int contained;
+        if (start == end)
+                return;
+        if (WARN_ONCE(start > end, "  wrong range [%#llx, %#llx]\n", start, end))
+                return;
+        for (;;) {
+                i = find_overlapped_early(start, end);
+                /*
+                 * No overlap left: find_overlapped_early() then returns
+                 * either max_early_res or the first unused slot.  The
+                 * unused-slot case must stop the loop as well, otherwise
+                 * drop_range_partial() does nothing and we spin forever.
+                 */
+                if (i >= max_early_res || !early_res[i].end)
+                        return;
+                r = &early_res[i];
+                /* hole ? (range lies entirely inside this reservation) */
+                contained = r->end >= end && r->start <= start;
+                /* may relocate the array, so 'r' is not used afterwards */
+                drop_range_partial(i, start, end);
+                if (contained)
+                        return;
+        }
+}
+#ifdef CONFIG_NO_BOOTMEM
+/*
+ * Remove every early reservation, rounded out to whole page frames,
+ * from the 'az'-entry range array 'range'.
+ */
+static void __init subtract_early_res(struct range *range, int az)
+{
+        int used = 0;
+        int first;
+        int i;
+        /* count the used slots */
+        while (used < max_early_res && early_res[used].end)
+                used++;
+        /* slot 0 of a relocated array describes the array itself: skip it */
+        first = (early_res != early_res_x) ? 1 : 0;
+#define DEBUG_PRINT_EARLY_RES 1
+#if DEBUG_PRINT_EARLY_RES
+        printk(KERN_INFO "Subtract (%d early reservations)\n", used);
+#endif
+        for (i = first; i < used; i++) {
+                struct early_res *r = &early_res[i];
+                u64 pfn_lo, pfn_hi;
+#if DEBUG_PRINT_EARLY_RES
+                printk(KERN_INFO "  #%d [%010llx - %010llx] %15s\n", i,
+                        r->start, r->end, r->name);
+#endif
+                pfn_lo = PFN_DOWN(r->start);
+                pfn_hi = PFN_UP(r->end);
+                if (pfn_lo < pfn_hi)
+                        subtract_range(range, az, pfn_lo, pfn_hi);
+        }
+}
+/*
+ * Build an array of free ranges for node 'nodeid': the ranges from
+ * early_node_map minus all early reservations.  The array is stored in
+ * *rangep and the number of ranges is returned.  Panics when no memory
+ * for the array can be found.
+ *
+ * When nodeid == MAX_NUMNODES the early reservation array is wiped and
+ * disabled afterwards.
+ */
+int __init get_free_all_memory_range(struct range **rangep, int nodeid)
+{
+        int i, count;
+        u64 start = 0, end;
+        u64 size;
+        u64 mem;
+        struct range *range;
+        int nr_range;
+        /* count the used reservation slots */
+        count  = 0;
+        for (i = 0; i < max_early_res && early_res[i].end; i++)
+                count++;
+        /* size the range array at two entries per reservation */
+        count *= 2;
+        size = sizeof(struct range) * count;
+        end = get_max_mapped();
+#ifdef MAX_DMA32_PFN
+        /* if memory is mapped beyond the DMA32 limit, search above it */
+        if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
+                start = MAX_DMA32_PFN << PAGE_SHIFT;
+#endif
+        mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
+        if (mem == -1ULL)
+                panic("can not find more space for range free");
+        range = __va(mem);
+        /* use early_node_map[] and early_res to get range array at first */
+        memset(range, 0, size);
+        nr_range = 0;
+        /* need to go over early_node_map to find out good range for node */
+        nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
+#ifdef CONFIG_X86_32
+        /* drop everything from max_low_pfn upwards */
+        subtract_range(range, count, max_low_pfn, -1ULL);
+#endif
+        subtract_early_res(range, count);
+        nr_range = clean_sort_range(range, count);
+        /* need to clear it ? */
+        if (nodeid == MAX_NUMNODES) {
+                /*
+                 * NOTE(review): early_res_count is not reset here, unlike
+                 * in early_res_to_bootmem() -- confirm this is intended.
+                 */
+                memset(&early_res[0], 0,
+                         sizeof(struct early_res) * max_early_res);
+                early_res = NULL;
+                max_early_res = 0;
+        }
+        *rangep = range;
+        return nr_range;
+}
+#else
+/*
+ * Hand every early reservation, clipped to [start, end), over to bootmem
+ * and then wipe and disable the early reservation array.
+ */
+void __init early_res_to_bootmem(u64 start, u64 end)
+{
+        int used = 0;
+        int first;
+        int i;
+        /* count the used slots */
+        while (used < max_early_res && early_res[used].end)
+                used++;
+        /* slot 0 of a relocated array describes the array itself: skip it */
+        first = (early_res != early_res_x) ? 1 : 0;
+        printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
+                         used - first, max_early_res, start, end);
+        for (i = first; i < used; i++) {
+                struct early_res *r = &early_res[i];
+                u64 clip_start = max(start, r->start);
+                u64 clip_end = min(end, r->end);
+                printk(KERN_INFO "  #%d [%010llx - %010llx] %16s", i,
+                        r->start, r->end, r->name);
+                /* nothing of this reservation falls inside the window */
+                if (clip_start >= clip_end) {
+                        printk(KERN_CONT "\n");
+                        continue;
+                }
+                printk(KERN_CONT " ==> [%010llx - %010llx]\n",
+                        clip_start, clip_end);
+                reserve_bootmem_generic(clip_start, clip_end - clip_start,
+                                BOOTMEM_DEFAULT);
+        }
+        /* clear them */
+        memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
+        early_res = NULL;
+        max_early_res = 0;
+        early_res_count = 0;
+}
+#endif
+/* Check for already reserved areas */
+/*
+ * Push *addrp past every reservation that [*addrp, *addrp + size)
+ * collides with, re-aligning to 'align' after each step.
+ * Returns 1 if *addrp had to be moved, 0 if it was free as given.
+ */
+static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
+{
+        u64 addr = *addrp;
+        int moved = 0;
+        for (;;) {
+                int i = find_overlapped_early(addr, addr + size);
+                /* full array or unused slot: nothing overlaps any more */
+                if (i >= max_early_res || !early_res[i].end)
+                        break;
+                addr = round_up(early_res[i].end, align);
+                *addrp = addr;
+                moved = 1;
+        }
+        return moved;
+}
+/* Check for already reserved areas */
+/*
+ * Shrink and/or move the candidate area [*addrp, *addrp + *sizep) until it
+ * no longer intersects any reservation.  Returns 1 and updates *addrp and
+ * *sizep if the area was adjusted, 0 otherwise.
+ */
+static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
+{
+        int i;
+        u64 addr = *addrp, last;
+        u64 size = *sizep;
+        int changed = 0;
+again:
+        /* every adjustment restarts the scan from the first reservation */
+        last = addr + size;
+        for (i = 0; i < max_early_res && early_res[i].end; i++) {
+                struct early_res *r = &early_res[i];
+                /* reservation starts inside the area: cut the area short */
+                if (last > r->start && addr < r->start) {
+                        size = r->start - addr;
+                        changed = 1;
+                        goto again;
+                }
+                /* reservation covers the head of the area: start after it */
+                if (last > r->end && addr < r->end) {
+                        addr = round_up(r->end, align);
+                        /*
+                         * NOTE(review): if rounding up moves addr beyond
+                         * 'last' this subtraction wraps -- confirm callers
+                         * cannot hit that.
+                         */
+                        size = last - addr;
+                        changed = 1;
+                        goto again;
+                }
+                /*
+                 * Area lies entirely inside a reservation.  The caller's
+                 * size is bumped by one and 0 is returned without storing
+                 * the adjusted addr/size; presumably this makes the range
+                 * check in find_early_area_size() fail -- TODO confirm.
+                 */
+                if (last <= r->end && addr >= r->start) {
+                        (*sizep)++;
+                        return 0;
+                }
+        }
+        if (changed) {
+                *addrp = addr;
+                *sizep = size;
+        }
+        return changed;
+}
+/*
+ * Find a free area with the specified alignment in a specific range.
+ * Only the area between start and end that lies inside an active range
+ * from early_node_map is considered, since such ranges are good RAM.
+ */
+u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
+                         u64 size, u64 align)
+{
+        u64 addr = round_up(ei_start, align);
+        /* never begin below the caller's lower bound */
+        if (addr < start)
+                addr = round_up(start, align);
+        if (addr >= ei_last)
+                return -1ULL;
+        /* step over reservations while the candidate still fits */
+        while (bad_addr(&addr, size, align) && addr + size <= ei_last)
+                ;
+        /* the result must fit both the active range and the search window */
+        if (addr + size > ei_last || addr + size > end)
+                return -1ULL;
+        return addr;
+}
+/*
+ * Like find_early_area(), but the size is an output: *sizep receives the
+ * size of the free area found at the returned address.
+ * Returns -1ULL when nothing suitable exists.
+ */
+u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
+                         u64 *sizep, u64 align)
+{
+        u64 addr = round_up(ei_start, align);
+        /* never begin below the caller's lower bound */
+        if (addr < start)
+                addr = round_up(start, align);
+        if (addr >= ei_last)
+                return -1ULL;
+        /* start with everything up to ei_last; bad_addr_size() adjusts it */
+        *sizep = ei_last - addr;
+        while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
+                ;
+        if (addr + *sizep > ei_last)
+                return -1ULL;
+        return addr;
+}
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
new file mode 100644
index 000000000000..ff915efef66d
--- /dev/null
+++ b/kernel/elfcore.c
@@ -0,0 +1,28 @@
+#include <linux/elf.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <asm/elf.h>
+/* Weak default: reports zero extra program headers. */
+Elf_Half __weak elf_core_extra_phdrs(void)
+{
+        return 0;
+}
+/*
+ * Weak default: writes nothing and returns 1.
+ * NOTE(review): presumably non-zero means "success" to the core dump
+ * writer -- verify against the caller.
+ */
+int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
+                                      unsigned long limit)
+{
+        return 1;
+}
+/*
+ * Weak default: writes nothing and returns 1.
+ * NOTE(review): presumably non-zero means "success" to the core dump
+ * writer -- verify against the caller.
+ */
+int __weak elf_core_write_extra_data(struct file *file, size_t *size,
+                                     unsigned long limit)
+{
+        return 1;
+}
+/* Weak default: reports zero bytes of extra data. */
+size_t __weak elf_core_extra_data_size(void)
+{
+        return 0;
+}
diff --git a/kernel/exit.c b/kernel/exit.c
index 3da04257aeaf..256ce8c2ebc8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,6 +49,7 @@
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
 #include <trace/events/sched.h>
+#include <linux/hw_breakpoint.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -69,10 +70,10 @@ static void __unhash_process(struct task_struct *p)
                detach_pid(p, PIDTYPE_SID);
                list_del_rcu(&p->tasks);
+                list_del_init(&p->sibling);
                __get_cpu_var(process_counts)--;
        }
        list_del_rcu(&p->thread_group);
-        list_del_init(&p->sibling);
 }
 /*
@@ -86,7 +87,9 @@ static void __exit_signal(struct task_struct *tsk)
        BUG_ON(!sig);
        BUG_ON(!atomic_read(&sig->count));
-        sighand = rcu_dereference(tsk->sighand);
+        sighand = rcu_dereference_check(tsk->sighand,
+                                        rcu_read_lock_held() ||
+                                        lockdep_tasklist_lock_is_held());
        spin_lock(&sighand->siglock);
        posix_cpu_timers_exit(tsk);
@@ -112,9 +115,9 @@ static void __exit_signal(struct task_struct *tsk)
                 * We won't ever get here for the group leader, since it
                 * will have been the last reference on the signal_struct.
                 */
-                sig->utime = cputime_add(sig->utime, task_utime(tsk));
+                sig->utime = cputime_add(sig->utime, tsk->utime);
-                sig->stime = cputime_add(sig->stime, task_stime(tsk));
+                sig->stime = cputime_add(sig->stime, tsk->stime);
-                sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
+                sig->gtime = cputime_add(sig->gtime, tsk->gtime);
                sig->min_flt += tsk->min_flt;
                sig->maj_flt += tsk->maj_flt;
                sig->nvcsw += tsk->nvcsw;
@@ -171,8 +174,10 @@ void release_task(struct task_struct * p)
 repeat:
        tracehook_prepare_release_task(p);
        /* don't need to get the RCU readlock here - the process is dead and
-         * can't be modifying its own credentials */
+         * can't be modifying its own credentials. But shut RCU-lockdep up */
+        rcu_read_lock();
        atomic_dec(&__task_cred(p)->user->processes);
+        rcu_read_unlock();
        proc_flush_task(p);
@@ -474,9 +479,11 @@ static void close_files(struct files_struct * files)
        /*
         * It is safe to dereference the fd table without RCU or
         * ->file_lock because this is the last reference to the
-         * files structure.
+         * files structure.  But use RCU to shut RCU-lockdep up.
         */
+        rcu_read_lock();
        fdt = files_fdtable(files);
+        rcu_read_unlock();
        for (;;) {
                unsigned long set;
                i = j * __NFDBITS;
@@ -522,10 +529,12 @@ void put_files_struct(struct files_struct *files)
                 * at the end of the RCU grace period. Otherwise,
                 * you can free files immediately.
                 */
+                rcu_read_lock();
                fdt = files_fdtable(files);
                if (fdt != &files->fdtab)
                        kmem_cache_free(files_cachep, files);
                free_fdtable(fdt);
+                rcu_read_unlock();
        }
 }
@@ -737,12 +746,9 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
 /*
 * Any that need to be release_task'd are put on the @dead list.
 */
-static void reparent_thread(struct task_struct *father, struct task_struct *p,
+static void reparent_leader(struct task_struct *father, struct task_struct *p,
                                struct list_head *dead)
 {
-        if (p->pdeath_signal)
-                group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
        list_move_tail(&p->sibling, &p->real_parent->children);
        if (task_detached(p))
@@ -781,12 +787,18 @@ static void forget_original_parent(struct task_struct *father)
        reaper = find_new_reaper(father);
        list_for_each_entry_safe(p, n, &father->children, sibling) {
-                p->real_parent = reaper;
+                struct task_struct *t = p;
-                if (p->parent == father) {
+                do {
-                        BUG_ON(task_ptrace(p));
+                        t->real_parent = reaper;
-                        p->parent = p->real_parent;
+                        if (t->parent == father) {
-                }
+                                BUG_ON(task_ptrace(t));
-                reparent_thread(father, p, &dead_children);
+                                t->parent = t->real_parent;
+                        }
+                        if (t->pdeath_signal)
+                                group_send_sig_info(t->pdeath_signal,
+                                                    SEND_SIG_NOINFO, t);
+                } while_each_thread(p, t);
+                reparent_leader(father, p, &dead_children);
        }
        write_unlock_irq(&tasklist_lock);
@@ -934,7 +946,7 @@ NORET_TYPE void do_exit(long code)
         * an exiting task cleaning up the robust pi futexes.
         */
        smp_mb();
-        spin_unlock_wait(&tsk->pi_lock);
+        raw_spin_unlock_wait(&tsk->pi_lock);
        if (unlikely(in_atomic()))
                printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -942,7 +954,9 @@ NORET_TYPE void do_exit(long code)
                                preempt_count());
        acct_update_integrals(tsk);
+        /* sync mm's RSS info before statistics gathering */
+        if (tsk->mm)
+                sync_mm_rss(tsk, tsk->mm);
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
                hrtimer_cancel(&tsk->signal->real_timer);
@@ -974,7 +988,7 @@ NORET_TYPE void do_exit(long code)
        exit_thread();
        cgroup_exit(tsk, 1);
-        if (group_dead && tsk->signal->leader)
+        if (group_dead)
                disassociate_ctty(1);
        module_put(task_thread_info(tsk)->exec_domain->module);
@@ -982,6 +996,10 @@ NORET_TYPE void do_exit(long code)
        proc_exit_connector(tsk);
        /*
+         * FIXME: do that only when needed, using sched_exit tracepoint
+         */
+        flush_ptrace_hw_breakpoint(tsk);
+        /*
         * Flush inherited counters to the parent - before the parent
         * gets woken up by child-exit notifications.
         */
@@ -1008,7 +1026,7 @@ NORET_TYPE void do_exit(long code)
        tsk->flags |= PF_EXITPIDONE;
        if (tsk->io_context)
-                exit_io_context();
+                exit_io_context(tsk);
        if (tsk->splice_pipe)
                __free_pipe_info(tsk->splice_pipe);
@@ -1176,7 +1194,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
        if (unlikely(wo->wo_flags & WNOWAIT)) {
                int exit_code = p->exit_code;
-                int why, status;
+                int why;
                get_task_struct(p);
                read_unlock(&tasklist_lock);
@@ -1209,6 +1227,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
                struct signal_struct *psig;
                struct signal_struct *sig;
                unsigned long maxrss;
+                cputime_t tgutime, tgstime;
                /*
                 * The resource counters for the group leader are in its
@@ -1224,20 +1243,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
                 * need to protect the access to parent->signal fields,
                 * as other threads in the parent group can be right
                 * here reaping other children at the same time.
+                 *
+                 * We use thread_group_times() to get times for the thread
+                 * group, which consolidates times for all threads in the
+                 * group including the group leader.
                 */
+                thread_group_times(p, &tgutime, &tgstime);
                spin_lock_irq(&p->real_parent->sighand->siglock);
                psig = p->real_parent->signal;
                sig = p->signal;
                psig->cutime =
                        cputime_add(psig->cutime,
-                        cputime_add(p->utime,
+                        cputime_add(tgutime,
-                        cputime_add(sig->utime,
+                                    sig->cutime));
-                                    sig->cutime)));
                psig->cstime =
                        cputime_add(psig->cstime,
-                        cputime_add(p->stime,
+                        cputime_add(tgstime,
-                        cputime_add(sig->stime,
+                                    sig->cstime));
-                                    sig->cstime)));
                psig->cgtime =
                        cputime_add(psig->cgtime,
                        cputime_add(p->gtime,
@@ -1546,14 +1568,9 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
        struct task_struct *p;
        list_for_each_entry(p, &tsk->children, sibling) {
-                /*
+                int ret = wait_consider_task(wo, 0, p);
-                 * Do not consider detached threads.
+                if (ret)
-                 */
+                        return ret;
-                if (!task_detached(p)) {
-                        int ret = wait_consider_task(wo, 0, p);
-                        if (ret)
-                                return ret;
-                }
        }
        return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 9fad346d7029..166eb780dd7d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -64,6 +64,7 @@
 #include <linux/magic.h>
 #include <linux/perf_event.h>
 #include <linux/posix-timers.h>
+#include <linux/user-return-notifier.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -89,6 +90,14 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
+#ifdef CONFIG_PROVE_RCU
+/*
+ * Lets code outside this file ask lockdep whether tasklist_lock is held,
+ * e.g. as a condition passed to rcu_dereference_check().
+ */
+int lockdep_tasklist_lock_is_held(void)
+{
+        return lockdep_is_held(&tasklist_lock);
+}
+EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
+#endif /* #ifdef CONFIG_PROVE_RCU */
 int nr_processes(void)
 {
        int cpu;
@@ -256,6 +265,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
                goto out;
        setup_thread_stack(tsk, orig);
+        clear_user_return_notifier(tsk);
        stackend = end_of_stack(tsk);
        *stackend = STACK_END_MAGIC;    /* for overflow detection */
@@ -333,15 +343,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                if (!tmp)
                        goto fail_nomem;
                *tmp = *mpnt;
+                INIT_LIST_HEAD(&tmp->anon_vma_chain);
                pol = mpol_dup(vma_policy(mpnt));
                retval = PTR_ERR(pol);
                if (IS_ERR(pol))
                        goto fail_nomem_policy;
                vma_set_policy(tmp, pol);
+                if (anon_vma_fork(tmp, mpnt))
+                        goto fail_nomem_anon_vma_fork;
                tmp->vm_flags &= ~VM_LOCKED;
                tmp->vm_mm = mm;
                tmp->vm_next = NULL;
-                anon_vma_link(tmp);
                file = tmp->vm_file;
                if (file) {
                        struct inode *inode = file->f_path.dentry->d_inode;
@@ -396,6 +408,8 @@ out:
        flush_tlb_mm(oldmm);
        up_write(&oldmm->mmap_sem);
        return retval;
+fail_nomem_anon_vma_fork:
+        mpol_put(pol);
 fail_nomem_policy:
        kmem_cache_free(vm_area_cachep, tmp);
 fail_nomem:
@@ -459,8 +473,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
                (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
        mm->core_state = NULL;
        mm->nr_ptes = 0;
-        set_mm_counter(mm, file_rss, 0);
+        memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
-        set_mm_counter(mm, anon_rss, 0);
        spin_lock_init(&mm->page_table_lock);
        mm->free_area_cache = TASK_UNMAPPED_BASE;
        mm->cached_hole_size = ~0UL;
@@ -829,23 +842,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
 */
 static void posix_cpu_timers_init_group(struct signal_struct *sig)
 {
+        unsigned long cpu_limit;
        /* Thread group counters. */
        thread_group_cputime_init(sig);
-        /* Expiration times and increments. */
+        cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
-        sig->it[CPUCLOCK_PROF].expires = cputime_zero;
+        if (cpu_limit != RLIM_INFINITY) {
-        sig->it[CPUCLOCK_PROF].incr = cputime_zero;
+                sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
-        sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
-        sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
-        /* Cached expiration times. */
-        sig->cputime_expires.prof_exp = cputime_zero;
-        sig->cputime_expires.virt_exp = cputime_zero;
-        sig->cputime_expires.sched_exp = 0;
-        if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
-                sig->cputime_expires.prof_exp =
-                        secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
                sig->cputimer.running = 1;
        }
@@ -862,7 +866,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        if (clone_flags & CLONE_THREAD)
                return 0;
-        sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
+        sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
        tsk->signal = sig;
        if (!sig)
                return -ENOMEM;
@@ -870,43 +874,21 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        atomic_set(&sig->count, 1);
        atomic_set(&sig->live, 1);
        init_waitqueue_head(&sig->wait_chldexit);
-        sig->flags = 0;
        if (clone_flags & CLONE_NEWPID)
                sig->flags |= SIGNAL_UNKILLABLE;
-        sig->group_exit_code = 0;
-        sig->group_exit_task = NULL;
-        sig->group_stop_count = 0;
        sig->curr_target = tsk;
        init_sigpending(&sig->shared_pending);
        INIT_LIST_HEAD(&sig->posix_timers);
        hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-        sig->it_real_incr.tv64 = 0;
        sig->real_timer.function = it_real_fn;
-        sig->leader = 0;        /* session leadership doesn't inherit */
-        sig->tty_old_pgrp = NULL;
-        sig->tty = NULL;
-        sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
-        sig->gtime = cputime_zero;
-        sig->cgtime = cputime_zero;
-        sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
-        sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
-        sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
-        sig->maxrss = sig->cmaxrss = 0;
-        task_io_accounting_init(&sig->ioac);
-        sig->sum_sched_runtime = 0;
-        taskstats_tgid_init(sig);
        task_lock(current->group_leader);
        memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
        task_unlock(current->group_leader);
        posix_cpu_timers_init_group(sig);
-        acct_init_pacct(&sig->pacct);
        tty_audit_fork(sig);
        sig->oom_adj = current->signal->oom_adj;
@@ -941,9 +923,9 @@ SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
 static void rt_mutex_init_task(struct task_struct *p)
 {
-        spin_lock_init(&p->pi_lock);
+        raw_spin_lock_init(&p->pi_lock);
 #ifdef CONFIG_RT_MUTEXES
-        plist_head_init(&p->pi_waiters, &p->pi_lock);
+        plist_head_init_raw(&p->pi_waiters, &p->pi_lock);
        p->pi_blocked_on = NULL;
 #endif
 }
@@ -1035,7 +1017,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #endif
        retval = -EAGAIN;
        if (atomic_read(&p->real_cred->user->processes) >=
-                        p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
+                        task_rlimit(p, RLIMIT_NPROC)) {
                if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
                    p->real_cred->user != INIT_USER)
                        goto bad_fork_free;
@@ -1073,8 +1055,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        p->gtime = cputime_zero;
        p->utimescaled = cputime_zero;
        p->stimescaled = cputime_zero;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
        p->prev_utime = cputime_zero;
        p->prev_stime = cputime_zero;
+#endif
+#if defined(SPLIT_RSS_COUNTING)
+        memset(&p->rss_stat, 0, sizeof(p->rss_stat));
+#endif
        p->default_timer_slack_ns = current->timer_slack_ns;
@@ -1127,11 +1114,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
        p->blocked_on = NULL; /* not blocked yet */
 #endif
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+        p->memcg_batch.do_batch = 0;
+        p->memcg_batch.memcg = NULL;
+#endif
        p->bts = NULL;
-        p->stack_start = stack_start;
        /* Perform scheduler related setup. Assign this task to a CPU. */
        sched_fork(p, clone_flags);
@@ -1206,9 +1195,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                p->sas_ss_sp = p->sas_ss_size = 0;
        /*
-         * Syscall tracing should be turned off in the child regardless
+         * Syscall tracing and stepping should be turned off in the
-         * of CLONE_PTRACE.
+         * child regardless of CLONE_PTRACE.
         */
+        user_disable_single_step(p);
        clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
 #ifdef TIF_SYSCALL_EMU
        clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
@@ -1236,21 +1226,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        /* Need tasklist lock for parent etc handling! */
        write_lock_irq(&tasklist_lock);
-        /*
-         * The task hasn't been attached yet, so its cpus_allowed mask will
-         * not be changed, nor will its assigned CPU.
-         *
-         * The cpus_allowed mask of the parent may have changed after it was
-         * copied first time - so re-copy it here, then check the child's CPU
-         * to ensure it is on a valid CPU (and if not, just force it back to
-         * parent's CPU). This avoids alot of nasty races.
-         */
-        p->cpus_allowed = current->cpus_allowed;
-        p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
-        if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
-                        !cpu_online(task_cpu(p))))
-                set_task_cpu(p, smp_processor_id());
        /* CLONE_PARENT re-uses the old parent */
        if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
                p->real_parent = current->real_parent;
@@ -1286,7 +1261,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        }
        if (likely(p->pid)) {
-                list_add_tail(&p->sibling, &p->real_parent->children);
                tracehook_finish_clone(p, clone_flags, trace);
                if (thread_group_leader(p)) {
@@ -1298,6 +1272,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                        p->signal->tty = tty_kref_get(current->signal->tty);
                        attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
                        attach_pid(p, PIDTYPE_SID, task_session(current));
+                        list_add_tail(&p->sibling, &p->real_parent->children);
                        list_add_tail_rcu(&p->tasks, &init_task.tasks);
                        __get_cpu_var(process_counts)++;
                }
@@ -1317,7 +1292,8 @@ bad_fork_free_pid:
        if (pid != &init_struct_pid)
                free_pid(pid);
 bad_fork_cleanup_io:
-        put_io_context(p->io_context);
+        if (p->io_context)
+                exit_io_context(p);
 bad_fork_cleanup_namespaces:
        exit_task_namespaces(p);
 bad_fork_cleanup_mm:
diff --git a/kernel/futex.c b/kernel/futex.c
index fb65e822fc41..e7a35f1039e7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key)
 * @uaddr:      virtual address of the futex
 * @fshared:    0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
 * @key:        address where result is stored.
- * @rw:         mapping needs to be read/write (values: VERIFY_READ,
- *              VERIFY_WRITE)
 *
 * Returns a negative error code or 0
 * The key words are stored in *key on success.
@@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
 * lock_page() might sleep, the caller should not hold a spinlock.
 */
 static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
 {
        unsigned long address = (unsigned long)uaddr;
        struct mm_struct *mm = current->mm;
@@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
         *        but access_ok() should be faster than find_vma()
         */
        if (!fshared) {
-                if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+                if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
                        return -EFAULT;
                key->private.mm = mm;
                key->private.address = address;
@@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
        }
 again:
-        err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
+        err = get_user_pages_fast(address, 1, 1, &page);
        if (err < 0)
                return err;
@@ -304,8 +302,14 @@ void put_futex_key(int fshared, union futex_key *key)
 */
 static int fault_in_user_writeable(u32 __user *uaddr)
 {
-        int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
+        struct mm_struct *mm = current->mm;
-                                 1, 1, 0, NULL, NULL);
+        int ret;
+        down_read(&mm->mmap_sem);
+        ret = get_user_pages(current, mm, (unsigned long)uaddr,
+                             1, 1, 0, NULL, NULL);
+        up_read(&mm->mmap_sem);
        return ret < 0 ? ret : 0;
 }
@@ -397,9 +401,9 @@ static void free_pi_state(struct futex_pi_state *pi_state)
         * and has cleaned up the pi_state already
         */
        if (pi_state->owner) {
-                spin_lock_irq(&pi_state->owner->pi_lock);
+                raw_spin_lock_irq(&pi_state->owner->pi_lock);
                list_del_init(&pi_state->list);
-                spin_unlock_irq(&pi_state->owner->pi_lock);
+                raw_spin_unlock_irq(&pi_state->owner->pi_lock);
                rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
        }
@@ -464,18 +468,18 @@ void exit_pi_state_list(struct task_struct *curr)
         * pi_state_list anymore, but we have to be careful
         * versus waiters unqueueing themselves:
         */
-        spin_lock_irq(&curr->pi_lock);
+        raw_spin_lock_irq(&curr->pi_lock);
        while (!list_empty(head)) {
                next = head->next;
                pi_state = list_entry(next, struct futex_pi_state, list);
                key = pi_state->key;
                hb = hash_futex(&key);
-                spin_unlock_irq(&curr->pi_lock);
+                raw_spin_unlock_irq(&curr->pi_lock);
                spin_lock(&hb->lock);
-                spin_lock_irq(&curr->pi_lock);
+                raw_spin_lock_irq(&curr->pi_lock);
                /*
                 * We dropped the pi-lock, so re-check whether this
                 * task still owns the PI-state:
@@ -489,15 +493,15 @@ void exit_pi_state_list(struct task_struct *curr)
                WARN_ON(list_empty(&pi_state->list));
                list_del_init(&pi_state->list);
                pi_state->owner = NULL;
-                spin_unlock_irq(&curr->pi_lock);
+                raw_spin_unlock_irq(&curr->pi_lock);
                rt_mutex_unlock(&pi_state->pi_mutex);
                spin_unlock(&hb->lock);
-                spin_lock_irq(&curr->pi_lock);
+                raw_spin_lock_irq(&curr->pi_lock);
        }
-        spin_unlock_irq(&curr->pi_lock);
+        raw_spin_unlock_irq(&curr->pi_lock);
 }
 static int
@@ -526,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
                                return -EINVAL;
                        WARN_ON(!atomic_read(&pi_state->refcount));
-                        WARN_ON(pid && pi_state->owner &&
-                                pi_state->owner->pid != pid);
+                        /*
+                         * When pi_state->owner is NULL then the owner died
+                         * and another waiter is on the fly. pi_state->owner
+                         * is fixed up by the task which acquires
+                         * pi_state->pi_mutex.
+                         *
+                         * We do not check for pid == 0 which can happen when
+                         * the owner died and robust_list_exit() cleared the
+                         * TID.
+                         */
+                        if (pid && pi_state->owner) {
+                                /*
+                                 * Bail out if user space manipulated the
+                                 * futex value.
+                                 */
+                                if (pid != task_pid_vnr(pi_state->owner))
+                                        return -EINVAL;
+                        }
                        atomic_inc(&pi_state->refcount);
                        *ps = pi_state;
@@ -552,7 +573,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
         * change of the task flags, we do this protected by
         * p->pi_lock:
         */
-        spin_lock_irq(&p->pi_lock);
+        raw_spin_lock_irq(&p->pi_lock);
        if (unlikely(p->flags & PF_EXITING)) {
                /*
                 * The task is on the way out. When PF_EXITPIDONE is
@@ -561,7 +582,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
                 */
                int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
-                spin_unlock_irq(&p->pi_lock);
+                raw_spin_unlock_irq(&p->pi_lock);
                put_task_struct(p);
                return ret;
        }
@@ -580,7 +601,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
        WARN_ON(!list_empty(&pi_state->list));
        list_add(&pi_state->list, &p->pi_state_list);
        pi_state->owner = p;
-        spin_unlock_irq(&p->pi_lock);
+        raw_spin_unlock_irq(&p->pi_lock);
        put_task_struct(p);
@@ -754,7 +775,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
        if (!pi_state)
                return -EINVAL;
-        spin_lock(&pi_state->pi_mutex.wait_lock);
+        /*
+         * If current does not own the pi_state then the futex is
+         * inconsistent and user space fiddled with the futex value.
+         */
+        if (pi_state->owner != current)
+                return -EINVAL;
+        raw_spin_lock(&pi_state->pi_mutex.wait_lock);
        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
        /*
@@ -783,23 +811,23 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
                else if (curval != uval)
                        ret = -EINVAL;
                if (ret) {
-                        spin_unlock(&pi_state->pi_mutex.wait_lock);
+                        raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
                        return ret;
                }
        }
-        spin_lock_irq(&pi_state->owner->pi_lock);
+        raw_spin_lock_irq(&pi_state->owner->pi_lock);
        WARN_ON(list_empty(&pi_state->list));
        list_del_init(&pi_state->list);
-        spin_unlock_irq(&pi_state->owner->pi_lock);
+        raw_spin_unlock_irq(&pi_state->owner->pi_lock);
-        spin_lock_irq(&new_owner->pi_lock);
+        raw_spin_lock_irq(&new_owner->pi_lock);
        WARN_ON(!list_empty(&pi_state->list));
        list_add(&pi_state->list, &new_owner->pi_state_list);
        pi_state->owner = new_owner;
-        spin_unlock_irq(&new_owner->pi_lock);
+        raw_spin_unlock_irq(&new_owner->pi_lock);
-        spin_unlock(&pi_state->pi_mutex.wait_lock);
+        raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
        rt_mutex_unlock(&pi_state->pi_mutex);
        return 0;
@@ -861,7 +889,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
        if (!bitset)
                return -EINVAL;
-        ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
+        ret = get_futex_key(uaddr, fshared, &key);
        if (unlikely(ret != 0))
                goto out;
@@ -907,10 +935,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
        int ret, op_ret;
 retry:
-        ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
+        ret = get_futex_key(uaddr1, fshared, &key1);
        if (unlikely(ret != 0))
                goto out;
-        ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+        ret = get_futex_key(uaddr2, fshared, &key2);
        if (unlikely(ret != 0))
                goto out_put_key1;
@@ -1004,7 +1032,7 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
                plist_add(&q->list, &hb2->chain);
                q->lock_ptr = &hb2->lock;
 #ifdef CONFIG_DEBUG_PI_LIST
-                q->list.plist.lock = &hb2->lock;
+                q->list.plist.spinlock = &hb2->lock;
 #endif
        }
        get_futex_key_refs(key2);
@@ -1040,7 +1068,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
        q->lock_ptr = &hb->lock;
 #ifdef CONFIG_DEBUG_PI_LIST
-        q->list.plist.lock = &hb->lock;
+        q->list.plist.spinlock = &hb->lock;
 #endif
        wake_up_state(q->task, TASK_NORMAL);
@@ -1169,11 +1197,10 @@ retry:
                pi_state = NULL;
        }
-        ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
+        ret = get_futex_key(uaddr1, fshared, &key1);
        if (unlikely(ret != 0))
                goto out;
-        ret = get_futex_key(uaddr2, fshared, &key2,
+        ret = get_futex_key(uaddr2, fshared, &key2);
-                            requeue_pi ? VERIFY_WRITE : VERIFY_READ);
        if (unlikely(ret != 0))
                goto out_put_key1;
@@ -1388,7 +1415,7 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
        plist_node_init(&q->list, prio);
 #ifdef CONFIG_DEBUG_PI_LIST
-        q->list.plist.lock = &hb->lock;
+        q->list.plist.spinlock = &hb->lock;
 #endif
        plist_add(&q->list, &hb->chain);
        q->task = current;
@@ -1523,18 +1550,18 @@ retry:
         * itself.
         */
        if (pi_state->owner != NULL) {
-                spin_lock_irq(&pi_state->owner->pi_lock);
+                raw_spin_lock_irq(&pi_state->owner->pi_lock);
                WARN_ON(list_empty(&pi_state->list));
                list_del_init(&pi_state->list);
-                spin_unlock_irq(&pi_state->owner->pi_lock);
+                raw_spin_unlock_irq(&pi_state->owner->pi_lock);
        }
        pi_state->owner = newowner;
-        spin_lock_irq(&newowner->pi_lock);
+        raw_spin_lock_irq(&newowner->pi_lock);
        WARN_ON(!list_empty(&pi_state->list));
        list_add(&pi_state->list, &newowner->pi_state_list);
-        spin_unlock_irq(&newowner->pi_lock);
+        raw_spin_unlock_irq(&newowner->pi_lock);
        return 0;
        /*
@@ -1732,7 +1759,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
         */
 retry:
        q->key = FUTEX_KEY_INIT;
-        ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
+        ret = get_futex_key(uaddr, fshared, &q->key);
        if (unlikely(ret != 0))
                return ret;
@@ -1898,7 +1925,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
        q.requeue_pi_key = NULL;
 retry:
        q.key = FUTEX_KEY_INIT;
-        ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
+        ret = get_futex_key(uaddr, fshared, &q.key);
        if (unlikely(ret != 0))
                goto out;
@@ -1968,7 +1995,7 @@ retry_private:
        /* Unqueue and drop the lock */
        unqueue_me_pi(&q);
-        goto out;
+        goto out_put_key;
 out_unlock_put_key:
        queue_unlock(&q, hb);
@@ -2017,7 +2044,7 @@ retry:
        if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
                return -EPERM;
-        ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
+        ret = get_futex_key(uaddr, fshared, &key);
        if (unlikely(ret != 0))
                goto out;
@@ -2209,7 +2236,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
        rt_waiter.task = NULL;
        key2 = FUTEX_KEY_INIT;
-        ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+        ret = get_futex_key(uaddr2, fshared, &key2);
        if (unlikely(ret != 0))
                goto out;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 235716556bf1..d49afb2395e5 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -146,7 +146,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
                struct task_struct *p;
                ret = -ESRCH;
-                read_lock(&tasklist_lock);
+                rcu_read_lock();
                p = find_task_by_vpid(pid);
                if (!p)
                        goto err_unlock;
@@ -157,7 +157,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
                    !capable(CAP_SYS_PTRACE))
                        goto err_unlock;
                head = p->compat_robust_list;
-                read_unlock(&tasklist_lock);
+                rcu_read_unlock();
        }
        if (put_user(sizeof(*head), len_ptr))
@@ -165,7 +165,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
        return put_user(ptr_to_compat(head), head_ptr);
 err_unlock:
-        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        return ret;
 }
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 7b19403900ad..02e5097bf319 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -129,11 +129,11 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
        for (;;) {
                base = timer->base;
                if (likely(base != NULL)) {
-                        spin_lock_irqsave(&base->cpu_base->lock, *flags);
+                        raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
                        if (likely(base == timer->base))
                                return base;
                        /* The timer has migrated to another CPU: */
-                        spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
+                        raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
                }
                cpu_relax();
        }
@@ -210,13 +210,13 @@ again:
                /* See the comment in lock_timer_base() */
                timer->base = NULL;
-                spin_unlock(&base->cpu_base->lock);
+                raw_spin_unlock(&base->cpu_base->lock);
-                spin_lock(&new_base->cpu_base->lock);
+                raw_spin_lock(&new_base->cpu_base->lock);
                if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
                        cpu = this_cpu;
-                        spin_unlock(&new_base->cpu_base->lock);
+                        raw_spin_unlock(&new_base->cpu_base->lock);
-                        spin_lock(&base->cpu_base->lock);
+                        raw_spin_lock(&base->cpu_base->lock);
                        timer->base = base;
                        goto again;
                }
@@ -232,7 +232,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
 {
        struct hrtimer_clock_base *base = timer->base;
-        spin_lock_irqsave(&base->cpu_base->lock, *flags);
+        raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
        return base;
 }
@@ -559,7 +559,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
 static int hrtimer_reprogram(struct hrtimer *timer,
                             struct hrtimer_clock_base *base)
 {
-        ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
+        struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
        ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
        int res;
@@ -584,7 +584,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
        if (expires.tv64 < 0)
                return -ETIME;
-        if (expires.tv64 >= expires_next->tv64)
+        if (expires.tv64 >= cpu_base->expires_next.tv64)
+                return 0;
+        /*
+         * If a hang was detected in the last timer interrupt then we
+         * do not schedule a timer which is earlier than the expiry
+         * which we enforced in the hang detection. We want the system
+         * to make progress.
+         */
+        if (cpu_base->hang_detected)
                return 0;
        /*
@@ -592,7 +601,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
         */
        res = tick_program_event(expires, 0);
        if (!IS_ERR_VALUE(res))
-                *expires_next = expires;
+                cpu_base->expires_next = expires;
        return res;
 }
@@ -621,12 +630,12 @@ static void retrigger_next_event(void *arg)
        base = &__get_cpu_var(hrtimer_bases);
        /* Adjust CLOCK_REALTIME offset */
-        spin_lock(&base->lock);
+        raw_spin_lock(&base->lock);
        base->clock_base[CLOCK_REALTIME].offset =
                timespec_to_ktime(realtime_offset);
        hrtimer_force_reprogram(base, 0);
-        spin_unlock(&base->lock);
+        raw_spin_unlock(&base->lock);
 }
 /*
@@ -687,9 +696,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 {
        if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
                if (wakeup) {
-                        spin_unlock(&base->cpu_base->lock);
+                        raw_spin_unlock(&base->cpu_base->lock);
                        raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-                        spin_lock(&base->cpu_base->lock);
+                        raw_spin_lock(&base->cpu_base->lock);
                } else
                        __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
@@ -749,17 +758,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
 #endif /* CONFIG_HIGH_RES_TIMERS */
-#ifdef CONFIG_TIMER_STATS
+static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
-void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
 {
+#ifdef CONFIG_TIMER_STATS
        if (timer->start_site)
                return;
+        timer->start_site = __builtin_return_address(0);
-        timer->start_site = addr;
        memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
        timer->start_pid = current->pid;
+#endif
 }
+static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
+{
+#ifdef CONFIG_TIMER_STATS
+        timer->start_site = NULL;
+#endif
+}
+static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
+{
+#ifdef CONFIG_TIMER_STATS
+        if (likely(!timer_stats_active))
+                return;
+        timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
+                                 timer->function, timer->start_comm, 0);
 #endif
+}
 /*
 * Counterpart to lock_hrtimer_base above:
@@ -767,7 +792,7 @@ void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
 static inline
 void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
 {
-        spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
+        raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
 }
 /**
@@ -1027,9 +1052,9 @@ void hrtimer_pull(void)
        struct hrtimer_start_on_info *info;
        struct list_head *pos, *safe, list;
-        spin_lock(&base->lock);
+        raw_spin_lock(&base->lock);
        list_replace_init(&base->to_pull, &list);
-        spin_unlock(&base->lock);
+        raw_spin_unlock(&base->lock);
        list_for_each_safe(pos, safe, &list) {
                info = list_entry(pos, struct hrtimer_start_on_info, list);
@@ -1083,10 +1108,10 @@ int hrtimer_start_on(int cpu, struct hrtimer_start_on_info* info,
                } else {
                        TRACE("hrtimer_start_on: pulling to remote CPU\n");
                        base = &per_cpu(hrtimer_bases, cpu);
-                        spin_lock_irqsave(&base->lock, flags);
+                        raw_spin_lock_irqsave(&base->lock, flags);
                        was_empty = list_empty(&base->to_pull);
                        list_add(&info->list, &base->to_pull);
-                        spin_unlock_irqrestore(&base->lock, flags);
+                        raw_spin_unlock_irqrestore(&base->lock, flags);
                        if (was_empty)
                                /* only send IPI if other no else
                                 * has done so already
@@ -1179,7 +1204,7 @@ ktime_t hrtimer_get_next_event(void)
        unsigned long flags;
        int i;
-        spin_lock_irqsave(&cpu_base->lock, flags);
+        raw_spin_lock_irqsave(&cpu_base->lock, flags);
        if (!hrtimer_hres_active()) {
                for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
@@ -1196,7 +1221,7 @@ ktime_t hrtimer_get_next_event(void)
                }
        }
-        spin_unlock_irqrestore(&cpu_base->lock, flags);
+        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
        if (mindelta.tv64 < 0)
                mindelta.tv64 = 0;
@@ -1278,11 +1303,11 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
         * they get migrated to another cpu, therefore its safe to unlock
         * the timer base.
         */
-        spin_unlock(&cpu_base->lock);
+        raw_spin_unlock(&cpu_base->lock);
        trace_hrtimer_expire_entry(timer, now);
        restart = fn(timer);
        trace_hrtimer_expire_exit(timer);
-        spin_lock(&cpu_base->lock);
+        raw_spin_lock(&cpu_base->lock);
        /*
         * Note: We clear the CALLBACK bit after enqueue_hrtimer and
@@ -1298,29 +1323,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
 #ifdef CONFIG_HIGH_RES_TIMERS
-static int force_clock_reprogram;
-/*
- * After 5 iteration's attempts, we consider that hrtimer_interrupt()
- * is hanging, which could happen with something that slows the interrupt
- * such as the tracing. Then we force the clock reprogramming for each future
- * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
- * threshold that we will overwrite.
- * The next tick event will be scheduled to 3 times we currently spend on
- * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
- * 1/4 of their time to process the hrtimer interrupts. This is enough to
- * let it running without serious starvation.
- */
-static inline void
-hrtimer_interrupt_hanging(struct clock_event_device *dev,
-                        ktime_t try_time)
-{
-        force_clock_reprogram = 1;
-        dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
-        printk(KERN_WARNING "hrtimer: interrupt too slow, "
-                "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
-}
 /*
 * High resolution timer interrupt
 * Called with interrupts disabled
@@ -1329,24 +1331,18 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 {
        struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
        struct hrtimer_clock_base *base;
-        ktime_t expires_next, now;
+        ktime_t expires_next, now, entry_time, delta;
-        int nr_retries = 0;
+        int i, retries = 0;
-        int i;
        BUG_ON(!cpu_base->hres_active);
        cpu_base->nr_events++;
        dev->next_event.tv64 = KTIME_MAX;
- retry:
+        entry_time = now = ktime_get();
-        /* 5 retries is enough to notice a hang */
+retry:
-        if (!(++nr_retries % 5))
-                hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
-        now = ktime_get();
        expires_next.tv64 = KTIME_MAX;
-        spin_lock(&cpu_base->lock);
+        raw_spin_lock(&cpu_base->lock);
        /*
         * We set expires_next to KTIME_MAX here with cpu_base->lock
         * held to prevent that a timer is enqueued in our queue via
@@ -1402,13 +1398,51 @@ void hrtimer_interrupt(struct clock_event_device *dev)
         * against it.
         */
        cpu_base->expires_next = expires_next;
-        spin_unlock(&cpu_base->lock);
+        raw_spin_unlock(&cpu_base->lock);
        /* Reprogramming necessary ? */
-        if (expires_next.tv64 != KTIME_MAX) {
+        if (expires_next.tv64 == KTIME_MAX ||
-                if (tick_program_event(expires_next, force_clock_reprogram))
+            !tick_program_event(expires_next, 0)) {
-                        goto retry;
+                cpu_base->hang_detected = 0;
+                return;
        }
+        /*
+         * The next timer was already expired due to:
+         * - tracing
+         * - long lasting callbacks
+         * - being scheduled away when running in a VM
+         *
+         * We need to prevent that we loop forever in the hrtimer
+         * interrupt routine. We give it 3 attempts to avoid
+         * overreacting on some spurious event.
+         */
+        now = ktime_get();
+        cpu_base->nr_retries++;
+        if (++retries < 3)
+                goto retry;
+        /*
+         * Give the system a chance to do something else than looping
+         * here. We stored the entry time, so we know exactly how long
+         * we spent here. We schedule the next event this amount of
+         * time away.
+         */
+        cpu_base->nr_hangs++;
+        cpu_base->hang_detected = 1;
+        delta = ktime_sub(now, entry_time);
+        if (delta.tv64 > cpu_base->max_hang_time.tv64)
+                cpu_base->max_hang_time = delta;
+        /*
+         * Limit it to a sensible value as we enforce a longer
+         * delay. Give the CPU at most 100ms to catch up.
+         */
+        if (delta.tv64 > 100 * NSEC_PER_MSEC)
+                expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
+        else
+                expires_next = ktime_add(now, delta);
+        tick_program_event(expires_next, 1);
+        printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
+                    ktime_to_ns(delta));
 }
 /*
@@ -1504,7 +1538,7 @@ void hrtimer_run_queues(void)
                        gettime = 0;
                }
-                spin_lock(&cpu_base->lock);
+                raw_spin_lock(&cpu_base->lock);
                while ((node = base->first)) {
                        struct hrtimer *timer;
@@ -1516,7 +1550,7 @@ void hrtimer_run_queues(void)
                        __run_hrtimer(timer, &base->softirq_time);
                }
-                spin_unlock(&cpu_base->lock);
+                raw_spin_unlock(&cpu_base->lock);
        }
 }
@@ -1672,7 +1706,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
        struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
        int i;
-        spin_lock_init(&cpu_base->lock);
+        raw_spin_lock_init(&cpu_base->lock);
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
                cpu_base->clock_base[i].cpu_base = cpu_base;
@@ -1731,16 +1765,16 @@ static void migrate_hrtimers(int scpu)
         * The caller is globally serialized and nobody else
         * takes two locks at once, deadlock is not possible.
         */
-        spin_lock(&new_base->lock);
+        raw_spin_lock(&new_base->lock);
-        spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+        raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                migrate_hrtimer_list(&old_base->clock_base[i],
                                     &new_base->clock_base[i]);
        }
-        spin_unlock(&old_base->lock);
+        raw_spin_unlock(&old_base->lock);
-        spin_unlock(&new_base->lock);
+        raw_spin_unlock(&new_base->lock);
        /* Check, if we got expired work to do */
        __hrtimer_peek_ahead_timers();
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d4e841747400..0c642d51aac2 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -144,7 +144,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
        rcu_read_lock();
        do_each_thread(g, t) {
-                if (!--max_count)
+                if (!max_count--)
                        goto unlock;
                if (!--batch_count) {
                        batch_count = HUNG_TASK_BATCHING;
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..03808ed342a6
--- /dev/null
+++ b/kernel/hw_breakpoint.c
@@ -0,0 +1,492 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) IBM Corporation, 2009
+ * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Thanks to Ingo Molnar for his many suggestions.
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ *          K.Prasad <prasad@linux.vnet.ibm.com>
+ *          Frederic Weisbecker <fweisbec@gmail.com>
+ */
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ * This file contains the arch-independent routines.
+ */
+#include <linux/irqflags.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/hw_breakpoint.h>
+/*
+ * Constraints data
+ */
+/* Number of pinned cpu breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
+/* Number of pinned task breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]);
+/* Number of non-pinned cpu/task breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
+/* Gather the number of total pinned and un-pinned bp in a cpuset */
+struct bp_busy_slots {
+        unsigned int pinned;
+        unsigned int flexible;
+};
+/* Serialize accesses to the above constraints */
+static DEFINE_MUTEX(nr_bp_mutex);
+/*
+ * Report the maximum number of pinned breakpoints a task
+ * has on this cpu: one more than the highest index whose
+ * nr_task_bp_pinned counter is non-zero, or 0 if none is.
+ */
+static unsigned int max_task_bp_pinned(int cpu)
+{
+        int i;
+        unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
+        for (i = HBP_NUM -1; i >= 0; i--) {
+                if (tsk_pinned[i] > 0)
+                        return i + 1;
+        }
+        return 0;
+}
+static int task_bp_pinned(struct task_struct *tsk)
+{
+        struct perf_event_context *ctx = tsk->perf_event_ctxp;
+        struct list_head *list;
+        struct perf_event *bp;
+        unsigned long flags;
+        int count = 0;
+        if (WARN_ONCE(!ctx, "No perf context for this task"))
+                return 0;
+        list = &ctx->event_list;
+        raw_spin_lock_irqsave(&ctx->lock, flags);
+        /*
+         * The current breakpoint counter is not included in the list
+         * at the open() callback time
+         */
+        list_for_each_entry(bp, list, event_entry) {
+                if (bp->attr.type == PERF_TYPE_BREAKPOINT)
+                        count++;
+        }
+        raw_spin_unlock_irqrestore(&ctx->lock, flags);
+        return count;
+}
+/*
+ * Report the number of pinned/un-pinned breakpoints we have in
+ * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ *
+ * In the all-cpus case the result is the maximum found over the
+ * online cpus. On that path @slots is only ever raised, never
+ * reset, so the caller must pass it in zero-initialized.
+ */
+static void
+fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
+{
+        int cpu = bp->cpu;
+        struct task_struct *tsk = bp->ctx->task;
+        if (cpu >= 0) {
+                slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
+                if (!tsk)
+                        slots->pinned += max_task_bp_pinned(cpu);
+                else
+                        slots->pinned += task_bp_pinned(tsk);
+                slots->flexible = per_cpu(nr_bp_flexible, cpu);
+                return;
+        }
+        for_each_online_cpu(cpu) {
+                unsigned int nr;
+                nr = per_cpu(nr_cpu_bp_pinned, cpu);
+                if (!tsk)
+                        nr += max_task_bp_pinned(cpu);
+                else
+                        nr += task_bp_pinned(tsk);
+                if (nr > slots->pinned)
+                        slots->pinned = nr;
+                nr = per_cpu(nr_bp_flexible, cpu);
+                if (nr > slots->flexible)
+                        slots->flexible = nr;
+        }
+}
+/*
+ * Add or remove (depending on @enable) a pinned breakpoint for the
+ * given task in our constraint table.
+ *
+ * With count being what task_bp_pinned() reports for the task:
+ * on enable, bucket [count] is incremented and, when count > 0,
+ * bucket [count-1] is decremented; disable does the reverse.
+ */
+static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
+{
+        unsigned int *tsk_pinned;
+        int count = 0;
+        count = task_bp_pinned(tsk);
+        tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
+        if (enable) {
+                tsk_pinned[count]++;
+                if (count > 0)
+                        tsk_pinned[count-1]--;
+        } else {
+                tsk_pinned[count]--;
+                if (count > 0)
+                        tsk_pinned[count-1]++;
+        }
+}
+/*
+ * Add/remove the given breakpoint in our constraint table
+ */
+static void toggle_bp_slot(struct perf_event *bp, bool enable)
+{
+        int cpu = bp->cpu;
+        struct task_struct *tsk = bp->ctx->task;
+        /* Pinned counter task profiling */
+        if (tsk) {
+                if (cpu >= 0) {
+                        toggle_bp_task_slot(tsk, cpu, enable);
+                        return;
+                }
+                for_each_online_cpu(cpu)
+                        toggle_bp_task_slot(tsk, cpu, enable);
+                return;
+        }
+        /* Pinned counter cpu profiling */
+        if (enable)
+                per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
+        else
+                per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
+}
+/*
+ * Constraints to check before allowing this new breakpoint counter:
+ *
+ *  == Non-pinned counter == (Considered as pinned for now)
+ *
+ *   - If attached to a single cpu, check:
+ *
+ *       (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
+ *           + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
+ *
+ *       -> If there are already non-pinned counters in this cpu, it means
+ *          there is already a free slot for them.
+ *          Otherwise, we check that the maximum number of per task
+ *          breakpoints (for this cpu) plus the number of per cpu breakpoints
+ *          (for this cpu) doesn't cover every register.
+ *
+ *   - If attached to every cpu, check:
+ *
+ *       (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
+ *           + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
+ *
+ *       -> This is roughly the same, except we check the number of per cpu
+ *          bp for every cpu and we keep the max one. Same for the per task
+ *          breakpoints.
+ *
+ *
+ * == Pinned counter ==
+ *
+ *   - If attached to a single cpu, check:
+ *
+ *       ((per_cpu(nr_bp_flexible, cpu) > 0) + per_cpu(nr_cpu_bp_pinned, cpu)
+ *            + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
+ *
+ *       -> Same checks as before. But now the nr_bp_flexible, if any, must keep
+ *          one register at least (or they will never be fed).
+ *
+ *   - If attached to every cpu, check:
+ *
+ *       ((per_cpu(nr_bp_flexible, *) > 0) + max(per_cpu(nr_cpu_bp_pinned, *))
+ *            + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
+ *
+ * Returns 0 once the slot is accounted for, -ENOSPC when none is left.
+ */
+static int __reserve_bp_slot(struct perf_event *bp)
+{
+        struct bp_busy_slots slots = {0};
+        fetch_bp_busy_slots(&slots, bp);
+        /* Flexible counters need to keep at least one slot */
+        if (slots.pinned + (!!slots.flexible) == HBP_NUM)
+                return -ENOSPC;
+        toggle_bp_slot(bp, true);
+        return 0;
+}
+int reserve_bp_slot(struct perf_event *bp)
+{
+        int ret;
+        mutex_lock(&nr_bp_mutex);
+        ret = __reserve_bp_slot(bp);
+        mutex_unlock(&nr_bp_mutex);
+        return ret;
+}
+static void __release_bp_slot(struct perf_event *bp)
+{
+        toggle_bp_slot(bp, false);
+}
+void release_bp_slot(struct perf_event *bp)
+{
+        mutex_lock(&nr_bp_mutex);
+        __release_bp_slot(bp);
+        mutex_unlock(&nr_bp_mutex);
+}
+/*
+ * Allow the kernel debugger to reserve breakpoint slots without
+ * taking a lock, using the dbg_* variants of the reserve and
+ * release breakpoint slot functions. They return -1 without
+ * touching the constraint table if nr_bp_mutex is currently locked.
+ */
+int dbg_reserve_bp_slot(struct perf_event *bp)
+{
+        if (mutex_is_locked(&nr_bp_mutex))
+                return -1;
+        return __reserve_bp_slot(bp);
+}
+int dbg_release_bp_slot(struct perf_event *bp)
+{
+        if (mutex_is_locked(&nr_bp_mutex))
+                return -1;
+        __release_bp_slot(bp);
+        return 0;
+}
+int register_perf_hw_breakpoint(struct perf_event *bp)
+{
+        int ret;
+        ret = reserve_bp_slot(bp);
+        if (ret)
+                return ret;
+        /*
+         * Ptrace breakpoints can be temporary perf events only
+         * meant to reserve a slot. In this case, it is created disabled and
+         * we don't want to check the params right now (as we put a null addr)
+         * But perf tools create events as disabled and we want to check
+         * the params for them.
+         * This is a quick hack that will be removed soon, once we remove
+         * the tmp breakpoints from ptrace
+         */
+        if (!bp->attr.disabled || !bp->overflow_handler)
+                ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+        /* if arch_validate_hwbkpt_settings() fails then release bp slot */
+        if (ret)
+                release_bp_slot(bp);
+        return ret;
+}
+/**
+ * register_user_hw_breakpoint - register a hardware breakpoint for user space
+ * @attr: breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ */
+struct perf_event *
+register_user_hw_breakpoint(struct perf_event_attr *attr,
+                            perf_overflow_handler_t triggered,
+                            struct task_struct *tsk)
+{
+        return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
+}
+EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
+/**
+ * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
+ * @bp: the breakpoint structure to modify
+ * @attr: new breakpoint attributes
+ *
+ * The breakpoint is disabled while its address, type and length are
+ * replaced. If @attr->disabled is set it is left disabled and the new
+ * settings are not validated. Otherwise the settings are validated; on
+ * failure the previous address, type and length are restored and the
+ * validation error is returned.
+ *
+ * Returns 0 on success.
+ */
+int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
+{
+        u64 old_addr = bp->attr.bp_addr;
+        u64 old_len = bp->attr.bp_len;
+        int old_type = bp->attr.bp_type;
+        int err = 0;
+        perf_event_disable(bp);
+        bp->attr.bp_addr = attr->bp_addr;
+        bp->attr.bp_type = attr->bp_type;
+        bp->attr.bp_len = attr->bp_len;
+        if (attr->disabled)
+                goto end;
+        err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+        if (!err)
+                perf_event_enable(bp);
+        if (err) {
+                bp->attr.bp_addr = old_addr;
+                bp->attr.bp_type = old_type;
+                bp->attr.bp_len = old_len;
+                if (!bp->attr.disabled)
+                        perf_event_enable(bp);
+                return err;
+        }
+end:
+        bp->attr.disabled = attr->disabled;
+        return 0;
+}
+EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
+/**
+ * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
+ * @bp: the breakpoint structure to unregister
+ *
+ * A NULL @bp is ignored.
+ */
+void unregister_hw_breakpoint(struct perf_event *bp)
+{
+        if (!bp)
+                return;
+        perf_event_release_kernel(bp);
+}
+EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
+/**
+ * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
+ * @attr: breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ *
+ * Creates one breakpoint event for each online cpu. If a creation fails,
+ * the events created before it are unregistered and the per-cpu array
+ * is freed.
+ *
+ * @return a set of per_cpu pointers to perf events, or an ERR_PTR()
+ * encoded error on failure
+ */
+struct perf_event * __percpu *
+register_wide_hw_breakpoint(struct perf_event_attr *attr,
+                            perf_overflow_handler_t triggered)
+{
+        struct perf_event * __percpu *cpu_events, **pevent, *bp;
+        long err;
+        int cpu;
+        cpu_events = alloc_percpu(typeof(*cpu_events));
+        if (!cpu_events)
+                return (void __percpu __force *)ERR_PTR(-ENOMEM);
+        get_online_cpus();
+        for_each_online_cpu(cpu) {
+                pevent = per_cpu_ptr(cpu_events, cpu);
+                bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
+                *pevent = bp;
+                if (IS_ERR(bp)) {
+                        err = PTR_ERR(bp);
+                        goto fail;
+                }
+        }
+        put_online_cpus();
+        return cpu_events;
+fail:
+        for_each_online_cpu(cpu) {
+                pevent = per_cpu_ptr(cpu_events, cpu);
+                if (IS_ERR(*pevent))
+                        break;
+                unregister_hw_breakpoint(*pevent);
+        }
+        put_online_cpus();
+        free_percpu(cpu_events);
+        return (void __percpu __force *)ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
+/**
+ * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
+ * @cpu_events: the per cpu set of events to unregister
+ */
+void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
+{
+        int cpu;
+        struct perf_event **pevent;
+        for_each_possible_cpu(cpu) {
+                pevent = per_cpu_ptr(cpu_events, cpu);
+                unregister_hw_breakpoint(*pevent);
+        }
+        free_percpu(cpu_events);
+}
+EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
+static struct notifier_block hw_breakpoint_exceptions_nb = {
+        .notifier_call = hw_breakpoint_exceptions_notify,
+        /* we need to be notified first */
+        .priority = 0x7fffffff
+};
+static int __init init_hw_breakpoint(void)
+{
+        return register_die_notifier(&hw_breakpoint_exceptions_nb);
+}
+core_initcall(init_hw_breakpoint);
+struct pmu perf_ops_bp = {
+        .enable         = arch_install_hw_breakpoint,
+        .disable        = arch_uninstall_hw_breakpoint,
+        .read           = hw_breakpoint_pmu_read,
+};
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 1de9700f416e..2295a31ef110 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -45,7 +45,7 @@ unsigned long probe_irq_on(void)
         * flush such a longstanding irq before considering it as spurious.
         */
        for_each_irq_desc_reverse(i, desc) {
-                spin_lock_irq(&desc->lock);
+                raw_spin_lock_irq(&desc->lock);
                if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
                        /*
                         * An old-style architecture might still have
@@ -61,7 +61,7 @@ unsigned long probe_irq_on(void)
                                desc->chip->set_type(i, IRQ_TYPE_PROBE);
                        desc->chip->startup(i);
                }
-                spin_unlock_irq(&desc->lock);
+                raw_spin_unlock_irq(&desc->lock);
        }
        /* Wait for longstanding interrupts to trigger. */
@@ -73,13 +73,13 @@ unsigned long probe_irq_on(void)
         * happened in the previous stage, it may have masked itself)
         */
        for_each_irq_desc_reverse(i, desc) {
-                spin_lock_irq(&desc->lock);
+                raw_spin_lock_irq(&desc->lock);
                if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
                        desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
                        if (desc->chip->startup(i))
                                desc->status |= IRQ_PENDING;
                }
-                spin_unlock_irq(&desc->lock);
+                raw_spin_unlock_irq(&desc->lock);
        }
        /*
@@ -91,7 +91,7 @@ unsigned long probe_irq_on(void)
         * Now filter out any obviously spurious interrupts
         */
        for_each_irq_desc(i, desc) {
-                spin_lock_irq(&desc->lock);
+                raw_spin_lock_irq(&desc->lock);
                status = desc->status;
                if (status & IRQ_AUTODETECT) {
@@ -103,7 +103,7 @@ unsigned long probe_irq_on(void)
                                if (i < 32)
                                        mask |= 1 << i;
                }
-                spin_unlock_irq(&desc->lock);
+                raw_spin_unlock_irq(&desc->lock);
        }
        return mask;
@@ -129,7 +129,7 @@ unsigned int probe_irq_mask(unsigned long val)
        int i;
        for_each_irq_desc(i, desc) {
-                spin_lock_irq(&desc->lock);
+                raw_spin_lock_irq(&desc->lock);
                status = desc->status;
                if (status & IRQ_AUTODETECT) {
@@ -139,7 +139,7 @@ unsigned int probe_irq_mask(unsigned long val)
                        desc->status = status & ~IRQ_AUTODETECT;
                        desc->chip->shutdown(i);
                }
-                spin_unlock_irq(&desc->lock);
+                raw_spin_unlock_irq(&desc->lock);
        }
        mutex_unlock(&probing_active);
@@ -171,7 +171,7 @@ int probe_irq_off(unsigned long val)
        unsigned int status;
        for_each_irq_desc(i, desc) {
-                spin_lock_irq(&desc->lock);
+                raw_spin_lock_irq(&desc->lock);
                status = desc->status;
                if (status & IRQ_AUTODETECT) {
@@ -183,7 +183,7 @@ int probe_irq_off(unsigned long val)
                        desc->status = status & ~IRQ_AUTODETECT;
                        desc->chip->shutdown(i);
                }
-                spin_unlock_irq(&desc->lock);
+                raw_spin_unlock_irq(&desc->lock);
        }
        mutex_unlock(&probing_active);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c1660194d115..b7091d5ca2f8 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,11 +18,7 @@
 #include "internals.h"
-/**
+static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
- *      dynamic_irq_init - initialize a dynamically allocated irq
- *      @irq:   irq number to initialize
- */
-void dynamic_irq_init(unsigned int irq)
 {
        struct irq_desc *desc;
        unsigned long flags;
@@ -34,14 +30,15 @@ void dynamic_irq_init(unsigned int irq)
        }
        /* Ensure we don't have left over values from a previous use of this irq */
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        desc->status = IRQ_DISABLED;
        desc->chip = &no_irq_chip;
        desc->handle_irq = handle_bad_irq;
        desc->depth = 1;
        desc->msi_desc = NULL;
        desc->handler_data = NULL;
-        desc->chip_data = NULL;
+        if (!keep_chip_data)
+                desc->chip_data = NULL;
        desc->action = NULL;
        desc->irq_count = 0;
        desc->irqs_unhandled = 0;
@@ -51,14 +48,30 @@ void dynamic_irq_init(unsigned int irq)
        cpumask_clear(desc->pending_mask);
 #endif
 #endif
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
 /**
- *      dynamic_irq_cleanup - cleanup a dynamically allocated irq
+ *      dynamic_irq_init - initialize a dynamically allocated irq
 *      @irq:   irq number to initialize
 */
-void dynamic_irq_cleanup(unsigned int irq)
+void dynamic_irq_init(unsigned int irq)
+{
+        dynamic_irq_init_x(irq, false);
+}
+/**
+ *      dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
+ *      @irq:   irq number to initialize
+ *
+ *      does not set irq_to_desc(irq)->chip_data to NULL
+ */
+void dynamic_irq_init_keep_chip_data(unsigned int irq)
+{
+        dynamic_irq_init_x(irq, true);
+}
+static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        unsigned long flags;
@@ -68,21 +81,42 @@ void dynamic_irq_cleanup(unsigned int irq)
                return;
        }
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        if (desc->action) {
-                spin_unlock_irqrestore(&desc->lock, flags);
+                raw_spin_unlock_irqrestore(&desc->lock, flags);
                WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
                        irq);
                return;
        }
        desc->msi_desc = NULL;
        desc->handler_data = NULL;
-        desc->chip_data = NULL;
+        if (!keep_chip_data)
+                desc->chip_data = NULL;
        desc->handle_irq = handle_bad_irq;
        desc->chip = &no_irq_chip;
        desc->name = NULL;
        clear_kstat_irqs(desc);
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
+}
+/**
+ *      dynamic_irq_cleanup - cleanup a dynamically allocated irq
+ *      @irq:   irq number to clean up
+ */
+void dynamic_irq_cleanup(unsigned int irq)
+{
+        dynamic_irq_cleanup_x(irq, false);
+}
+/**
+ *      dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
+ *      @irq:   irq number to clean up
+ *
+ *      does not set irq_to_desc(irq)->chip_data to NULL
+ */
+void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
+{
+        dynamic_irq_cleanup_x(irq, true);
 }
@@ -104,10 +138,10 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
        if (!chip)
                chip = &no_irq_chip;
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        irq_chip_set_defaults(chip);
        desc->chip = chip;
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        return 0;
 }
@@ -133,9 +167,9 @@ int set_irq_type(unsigned int irq, unsigned int type)
        if (type == IRQ_TYPE_NONE)
                return 0;
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        ret = __irq_set_trigger(desc, irq, type);
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        return ret;
 }
 EXPORT_SYMBOL(set_irq_type);
@@ -158,19 +192,19 @@ int set_irq_data(unsigned int irq, void *data)
                return -EINVAL;
        }
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        desc->handler_data = data;
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        return 0;
 }
 EXPORT_SYMBOL(set_irq_data);
 /**
- *      set_irq_data - set irq type data for an irq
+ *      set_irq_msi - set MSI descriptor data for an irq
 *      @irq:   Interrupt number
 *      @entry: Pointer to MSI descriptor data
 *
- *      Set the hardware irq controller data for an irq
+ *      Set the MSI descriptor entry for an irq
 */
 int set_irq_msi(unsigned int irq, struct msi_desc *entry)
 {
@@ -183,11 +217,11 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
                return -EINVAL;
        }
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        desc->msi_desc = entry;
        if (entry)
                entry->irq = irq;
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        return 0;
 }
@@ -214,9 +248,9 @@ int set_irq_chip_data(unsigned int irq, void *data)
                return -EINVAL;
        }
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        desc->chip_data = data;
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        return 0;
 }
@@ -241,12 +275,12 @@ void set_irq_nested_thread(unsigned int irq, int nest)
        if (!desc)
                return;
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        if (nest)
                desc->status |= IRQ_NESTED_THREAD;
        else
                desc->status &= ~IRQ_NESTED_THREAD;
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
 EXPORT_SYMBOL_GPL(set_irq_nested_thread);
@@ -325,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
                if (desc->chip->ack)
                        desc->chip->ack(irq);
        }
+        desc->status |= IRQ_MASKED;
+}
+static inline void mask_irq(struct irq_desc *desc, int irq)
+{
+        if (desc->chip->mask) {
+                desc->chip->mask(irq);
+                desc->status |= IRQ_MASKED;
+        }
+}
+static inline void unmask_irq(struct irq_desc *desc, int irq)
+{
+        if (desc->chip->unmask) {
+                desc->chip->unmask(irq);
+                desc->status &= ~IRQ_MASKED;
+        }
 }
 /*
@@ -343,7 +394,7 @@ void handle_nested_irq(unsigned int irq)
        might_sleep();
-        spin_lock_irq(&desc->lock);
+        raw_spin_lock_irq(&desc->lock);
        kstat_incr_irqs_this_cpu(irq, desc);
@@ -352,17 +403,17 @@ void handle_nested_irq(unsigned int irq)
                goto out_unlock;
        desc->status |= IRQ_INPROGRESS;
-        spin_unlock_irq(&desc->lock);
+        raw_spin_unlock_irq(&desc->lock);
        action_ret = action->thread_fn(action->irq, action->dev_id);
        if (!noirqdebug)
                note_interrupt(irq, desc, action_ret);
-        spin_lock_irq(&desc->lock);
+        raw_spin_lock_irq(&desc->lock);
        desc->status &= ~IRQ_INPROGRESS;
 out_unlock:
-        spin_unlock_irq(&desc->lock);
+        raw_spin_unlock_irq(&desc->lock);
 }
 EXPORT_SYMBOL_GPL(handle_nested_irq);
@@ -384,7 +435,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
        struct irqaction *action;
        irqreturn_t action_ret;
-        spin_lock(&desc->lock);
+        raw_spin_lock(&desc->lock);
        if (unlikely(desc->status & IRQ_INPROGRESS))
                goto out_unlock;
@@ -396,16 +447,16 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
                goto out_unlock;
        desc->status |= IRQ_INPROGRESS;
-        spin_unlock(&desc->lock);
+        raw_spin_unlock(&desc->lock);
        action_ret = handle_IRQ_event(irq, action);
        if (!noirqdebug)
                note_interrupt(irq, desc, action_ret);
-        spin_lock(&desc->lock);
+        raw_spin_lock(&desc->lock);
        desc->status &= ~IRQ_INPROGRESS;
 out_unlock:
-        spin_unlock(&desc->lock);
+        raw_spin_unlock(&desc->lock);
 }
 /**
@@ -424,7 +475,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
        struct irqaction *action;
        irqreturn_t action_ret;
-        spin_lock(&desc->lock);
+        raw_spin_lock(&desc->lock);
        mask_ack_irq(desc, irq);
        if (unlikely(desc->status & IRQ_INPROGRESS))
@@ -441,21 +492,19 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
                goto out_unlock;
        desc->status |= IRQ_INPROGRESS;
-        spin_unlock(&desc->lock);
+        raw_spin_unlock(&desc->lock);
        action_ret = handle_IRQ_event(irq, action);
        if (!noirqdebug)
                note_interrupt(irq, desc, action_ret);
-        spin_lock(&desc->lock);
+        raw_spin_lock(&desc->lock);
        desc->status &= ~IRQ_INPROGRESS;
-        if (unlikely(desc->status & IRQ_ONESHOT))
+        if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
-                desc->status |= IRQ_MASKED;
+                unmask_irq(desc, irq);
-        else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
-                desc->chip->unmask(irq);
 out_unlock:
-        spin_unlock(&desc->lock);
+        raw_spin_unlock(&desc->lock);
 }
 EXPORT_SYMBOL_GPL(handle_level_irq);
@@ -475,7 +524,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
        struct irqaction *action;
        irqreturn_t action_ret;
-        spin_lock(&desc->lock);
+        raw_spin_lock(&desc->lock);
        if (unlikely(desc->status & IRQ_INPROGRESS))
                goto out;
@@ -490,25 +539,24 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
        action = desc->action;
        if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
                desc->status |= IRQ_PENDING;
-                if (desc->chip->mask)
+                mask_irq(desc, irq);
-                        desc->chip->mask(irq);
                goto out;
        }
        desc->status |= IRQ_INPROGRESS;
        desc->status &= ~IRQ_PENDING;
-        spin_unlock(&desc->lock);
+        raw_spin_unlock(&desc->lock);
        action_ret = handle_IRQ_event(irq, action);
        if (!noirqdebug)
                note_interrupt(irq, desc, action_ret);
-        spin_lock(&desc->lock);
+        raw_spin_lock(&desc->lock);
        desc->status &= ~IRQ_INPROGRESS;
 out:
        desc->chip->eoi(irq);
-        spin_unlock(&desc->lock);
+        raw_spin_unlock(&desc->lock);
 }
 /**
@@ -520,7 +568,7 @@ out:
 *      signal. The occurence is latched into the irq controller hardware
 *      and must be acked in order to be reenabled. After the ack another
 *      interrupt can happen on the same source even before the first one
- *      is handled by the assosiacted event handler. If this happens it
+ *      is handled by the associated event handler. If this happens it
 *      might be necessary to disable (mask) the interrupt depending on the
 *      controller hardware. This requires to reenable the interrupt inside
 *      of the loop which handles the interrupts which have arrived while
@@ -530,7 +578,7 @@ out:
 void
 handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 {
-        spin_lock(&desc->lock);
+        raw_spin_lock(&desc->lock);
        desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
@@ -559,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
                irqreturn_t action_ret;
                if (unlikely(!action)) {
-                        desc->chip->mask(irq);
+                        mask_irq(desc, irq);
                        goto out_unlock;
                }
@@ -571,26 +619,25 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
                if (unlikely((desc->status &
                               (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
                              (IRQ_PENDING | IRQ_MASKED))) {
-                        desc->chip->unmask(irq);
+                        unmask_irq(desc, irq);
-                        desc->status &= ~IRQ_MASKED;
                }
                desc->status &= ~IRQ_PENDING;
-                spin_unlock(&desc->lock);
+                raw_spin_unlock(&desc->lock);
                action_ret = handle_IRQ_event(irq, action);
                if (!noirqdebug)
                        note_interrupt(irq, desc, action_ret);
-                spin_lock(&desc->lock);
+                raw_spin_lock(&desc->lock);
        } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
        desc->status &= ~IRQ_INPROGRESS;
 out_unlock:
-        spin_unlock(&desc->lock);
+        raw_spin_unlock(&desc->lock);
 }
 /**
- *      handle_percpu_IRQ - Per CPU local irq handler
+ *      handle_percpu_irq - Per CPU local irq handler
 *      @irq:   the interrupt number
 *      @desc:  the interrupt description structure for this irq
 *
@@ -643,7 +690,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
        }
        chip_bus_lock(irq, desc);
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        /* Uninstall? */
        if (handle == handle_bad_irq) {
@@ -661,7 +708,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
                desc->depth = 0;
                desc->chip->startup(irq);
        }
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        chip_bus_sync_unlock(irq, desc);
 }
 EXPORT_SYMBOL_GPL(__set_irq_handler);
@@ -682,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
        __set_irq_handler(irq, handle, 0, name);
 }
-void __init set_irq_noprobe(unsigned int irq)
+void set_irq_noprobe(unsigned int irq)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        unsigned long flags;
@@ -692,12 +739,12 @@ void __init set_irq_noprobe(unsigned int irq)
                return;
        }
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        desc->status |= IRQ_NOPROBE;
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
-void __init set_irq_probe(unsigned int irq)
+void set_irq_probe(unsigned int irq)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        unsigned long flags;
@@ -707,7 +754,7 @@ void __init set_irq_probe(unsigned int irq)
                return;
        }
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        desc->status &= ~IRQ_NOPROBE;
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d06df9c41cba..1ef4ffcdfa55 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
 *      automatically freed on driver detach.
 *
 *      If an IRQ allocated with this function needs to be freed
- *      separately, dev_free_irq() must be used.
+ *      separately, devm_free_irq() must be used.
 */
 int devm_request_threaded_irq(struct device *dev, unsigned int irq,
                              irq_handler_t handler, irq_handler_t thread_fn,
@@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
 *      Except for the extra @dev argument, this function takes the
 *      same arguments and performs the same function as free_irq().
 *      This function instead of free_irq() should be used to manually
- *      free IRQs allocated with dev_request_irq().
+ *      free IRQs allocated with devm_request_irq().
 */
 void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
 {
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 17c71bb565c6..76d5a671bfe1 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -19,7 +19,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/rculist.h>
 #include <linux/hash.h>
-#include <linux/bootmem.h>
+#include <linux/radix-tree.h>
 #include <trace/events/irq.h>
 #include "internals.h"
@@ -80,19 +80,15 @@ static struct irq_desc irq_desc_init = {
        .chip       = &no_irq_chip,
        .handle_irq = handle_bad_irq,
        .depth      = 1,
-        .lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+        .lock       = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
 };
 void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
 {
        void *ptr;
-        if (slab_is_available())
+        ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
-                ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
+                           GFP_ATOMIC, node);
-                                   GFP_ATOMIC, node);
-        else
-                ptr = alloc_bootmem_node(NODE_DATA(node),
-                                nr * sizeof(*desc->kstat_irqs));
        /*
         * don't overwite if can not get new one
@@ -108,7 +104,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
 {
        memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
-        spin_lock_init(&desc->lock);
+        raw_spin_lock_init(&desc->lock);
        desc->irq = irq;
 #ifdef CONFIG_SMP
        desc->node = node;
@@ -130,9 +126,28 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
 /*
 * Protect the sparse_irqs:
 */
-DEFINE_SPINLOCK(sparse_irq_lock);
+DEFINE_RAW_SPINLOCK(sparse_irq_lock);
-struct irq_desc **irq_desc_ptrs __read_mostly;
+static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
+/*
+ * Store @desc in irq_desc_tree under index @irq.
+ * The return value of radix_tree_insert() is not checked here.
+ */
+static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
+{
+        struct radix_tree_root *tree = &irq_desc_tree;
+        radix_tree_insert(tree, irq, desc);
+}
+/*
+ * Look up the descriptor stored for @irq in irq_desc_tree. Whatever
+ * radix_tree_lookup() yields is returned unchanged; callers test the
+ * result for NULL before using it.
+ */
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+        struct irq_desc *desc = radix_tree_lookup(&irq_desc_tree, irq);
+        return desc;
+}
+/*
+ * Point the existing irq_desc_tree slot for @irq at @desc.
+ * Does nothing when no slot is found for @irq.
+ */
+void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
+{
+        void **slot = radix_tree_lookup_slot(&irq_desc_tree, irq);
+        if (!slot)
+                return;
+        radix_tree_replace_slot(slot, desc);
+}
 static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
        [0 ... NR_IRQS_LEGACY-1] = {
@@ -141,7 +156,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
                .chip       = &no_irq_chip,
                .handle_irq = handle_bad_irq,
                .depth      = 1,
-                .lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+                .lock       = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
        }
 };
@@ -164,9 +179,6 @@ int __init early_irq_init(void)
        legacy_count = ARRAY_SIZE(irq_desc_legacy);
        node = first_online_node;
-        /* allocate irq_desc_ptrs array based on nr_irqs */
-        irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
        /* allocate based on nr_cpu_ids */
        kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
                                          sizeof(int), GFP_NOWAIT, node);
@@ -180,23 +192,12 @@ int __init early_irq_init(void)
                lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
                alloc_desc_masks(&desc[i], node, true);
                init_desc_masks(&desc[i]);
-                irq_desc_ptrs[i] = desc + i;
+                set_irq_desc(i, &desc[i]);
        }
-        for (i = legacy_count; i < nr_irqs; i++)
-                irq_desc_ptrs[i] = NULL;
        return arch_early_irq_init();
 }
-struct irq_desc *irq_to_desc(unsigned int irq)
-{
-        if (irq_desc_ptrs && irq < nr_irqs)
-                return irq_desc_ptrs[irq];
-        return NULL;
-}
 struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
 {
        struct irq_desc *desc;
@@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
                return NULL;
        }
-        desc = irq_desc_ptrs[irq];
+        desc = irq_to_desc(irq);
        if (desc)
                return desc;
-        spin_lock_irqsave(&sparse_irq_lock, flags);
+        raw_spin_lock_irqsave(&sparse_irq_lock, flags);
        /* We have to check it to avoid races with another CPU */
-        desc = irq_desc_ptrs[irq];
+        desc = irq_to_desc(irq);
        if (desc)
                goto out_unlock;
-        if (slab_is_available())
+        desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
-                desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
-        else
-                desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
        printk(KERN_DEBUG "  alloc irq_desc for %d on node %d\n", irq, node);
        if (!desc) {
@@ -231,10 +229,10 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
        }
        init_one_irq_desc(irq, desc, node);
-        irq_desc_ptrs[irq] = desc;
+        set_irq_desc(irq, desc);
 out_unlock:
-        spin_unlock_irqrestore(&sparse_irq_lock, flags);
+        raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
        return desc;
 }
@@ -247,7 +245,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
                .chip = &no_irq_chip,
                .handle_irq = handle_bad_irq,
                .depth = 1,
-                .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
+                .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
        }
 };
@@ -473,7 +471,7 @@ unsigned int __do_IRQ(unsigned int irq)
                return 1;
        }
-        spin_lock(&desc->lock);
+        raw_spin_lock(&desc->lock);
        if (desc->chip->ack)
                desc->chip->ack(irq);
        /*
@@ -517,13 +515,13 @@ unsigned int __do_IRQ(unsigned int irq)
        for (;;) {
                irqreturn_t action_ret;
-                spin_unlock(&desc->lock);
+                raw_spin_unlock(&desc->lock);
                action_ret = handle_IRQ_event(irq, action);
                if (!noirqdebug)
                        note_interrupt(irq, desc, action_ret);
-                spin_lock(&desc->lock);
+                raw_spin_lock(&desc->lock);
                if (likely(!(desc->status & IRQ_PENDING)))
                        break;
                desc->status &= ~IRQ_PENDING;
@@ -536,7 +534,7 @@ out:
         * disabled while the handler was running.
         */
        desc->chip->end(irq);
-        spin_unlock(&desc->lock);
+        raw_spin_unlock(&desc->lock);
        return 1;
 }
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 1b5d742c6a77..c63f3bc88f0b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -18,14 +18,10 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
 extern struct lock_class_key irq_desc_lock_class;
 extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
 extern void clear_kstat_irqs(struct irq_desc *desc);
-extern spinlock_t sparse_irq_lock;
+extern raw_spinlock_t sparse_irq_lock;
 #ifdef CONFIG_SPARSE_IRQ
-/* irq_desc_ptrs allocated at boot time */
+void replace_irq_desc(unsigned int irq, struct irq_desc *desc);
-extern struct irq_desc **irq_desc_ptrs;
-#else
-/* irq_desc_ptrs is a fixed size array */
-extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
 #endif
 #ifdef CONFIG_PROC_FS
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index bde4c667d24d..704e488730a5 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -46,9 +46,9 @@ void synchronize_irq(unsigned int irq)
                        cpu_relax();
                /* Ok, that indicated we're done: double-check carefully. */
-                spin_lock_irqsave(&desc->lock, flags);
+                raw_spin_lock_irqsave(&desc->lock, flags);
                status = desc->status;
-                spin_unlock_irqrestore(&desc->lock, flags);
+                raw_spin_unlock_irqrestore(&desc->lock, flags);
                /* Oops, that failed? */
        } while (status & IRQ_INPROGRESS);
@@ -114,7 +114,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
        if (!desc->chip->set_affinity)
                return -EINVAL;
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
        if (desc->status & IRQ_MOVE_PCNTXT) {
@@ -134,7 +134,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
        }
 #endif
        desc->status |= IRQ_AFFINITY_SET;
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        return 0;
 }
@@ -181,11 +181,11 @@ int irq_select_affinity_usr(unsigned int irq)
        unsigned long flags;
        int ret;
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        ret = setup_affinity(irq, desc);
        if (!ret)
                irq_set_thread_affinity(desc);
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        return ret;
 }
@@ -231,9 +231,9 @@ void disable_irq_nosync(unsigned int irq)
                return;
        chip_bus_lock(irq, desc);
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        __disable_irq(desc, irq, false);
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        chip_bus_sync_unlock(irq, desc);
 }
 EXPORT_SYMBOL(disable_irq_nosync);
@@ -308,9 +308,9 @@ void enable_irq(unsigned int irq)
                return;
        chip_bus_lock(irq, desc);
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        __enable_irq(desc, irq, false);
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        chip_bus_sync_unlock(irq, desc);
 }
 EXPORT_SYMBOL(enable_irq);
@@ -347,7 +347,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
        /* wakeup-capable irqs can be shared between drivers that
         * don't need to have the same sleep mode behaviors.
         */
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        if (on) {
                if (desc->wake_depth++ == 0) {
                        ret = set_irq_wake_real(irq, on);
@@ -368,7 +368,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
                }
        }
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        return ret;
 }
 EXPORT_SYMBOL(set_irq_wake);
@@ -382,6 +382,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        struct irqaction *action;
+        unsigned long flags;
        if (!desc)
                return 0;
@@ -389,11 +390,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
        if (desc->status & IRQ_NOREQUEST)
                return 0;
+        raw_spin_lock_irqsave(&desc->lock, flags);
        action = desc->action;
        if (action)
                if (irqflags & action->flags & IRQF_SHARED)
                        action = NULL;
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        return !action;
 }
@@ -483,13 +487,31 @@ static int irq_wait_for_interrupt(struct irqaction *action)
 */
 static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
 {
+again:
        chip_bus_lock(irq, desc);
-        spin_lock_irq(&desc->lock);
+        raw_spin_lock_irq(&desc->lock);
+        /*
+         * Implausible though it may be, we need to protect ourselves
+         * against the following scenario:
+         *
+         * The thread finishes before the hard interrupt handler on
+         * the other CPU does. If we unmasked the irq line now, the
+         * interrupt could come in again, mask the line and bail out
+         * because IRQ_INPROGRESS is still set, leaving the irq line
+         * masked forever.
+         *
+         * So drop both locks, relax and retry for as long as
+         * IRQ_INPROGRESS is set.
+         */
+        if (unlikely(desc->status & IRQ_INPROGRESS)) {
+                raw_spin_unlock_irq(&desc->lock);
+                chip_bus_sync_unlock(irq, desc);
+                cpu_relax();
+                goto again;
+        }
        if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
                desc->status &= ~IRQ_MASKED;
                desc->chip->unmask(irq);
        }
-        spin_unlock_irq(&desc->lock);
+        raw_spin_unlock_irq(&desc->lock);
        chip_bus_sync_unlock(irq, desc);
 }
@@ -514,9 +536,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
                return;
        }
-        spin_lock_irq(&desc->lock);
+        raw_spin_lock_irq(&desc->lock);
        cpumask_copy(mask, desc->affinity);
-        spin_unlock_irq(&desc->lock);
+        raw_spin_unlock_irq(&desc->lock);
        set_cpus_allowed_ptr(current, mask);
        free_cpumask_var(mask);
@@ -545,7 +567,7 @@ static int irq_thread(void *data)
                atomic_inc(&desc->threads_active);
-                spin_lock_irq(&desc->lock);
+                raw_spin_lock_irq(&desc->lock);
                if (unlikely(desc->status & IRQ_DISABLED)) {
                        /*
                         * CHECKME: We might need a dedicated
@@ -555,9 +577,9 @@ static int irq_thread(void *data)
                         * retriggers the interrupt itself --- tglx
                         */
                        desc->status |= IRQ_PENDING;
-                        spin_unlock_irq(&desc->lock);
+                        raw_spin_unlock_irq(&desc->lock);
                } else {
-                        spin_unlock_irq(&desc->lock);
+                        raw_spin_unlock_irq(&desc->lock);
                        action->thread_fn(action->irq, action->dev_id);
@@ -679,7 +701,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
        /*
         * The following block of code has to be executed atomically
         */
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        old_ptr = &desc->action;
        old = *old_ptr;
        if (old) {
@@ -735,6 +757,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
                if (new->flags & IRQF_ONESHOT)
                        desc->status |= IRQ_ONESHOT;
+                /*
+                 * Force MSI interrupts to run with interrupts
+                 * disabled. The multi vector cards can cause stack
+                 * overflows due to nested interrupts when enough of
+                 * them are directed to a core and fire at the same
+                 * time.
+                 */
+                if (desc->msi_desc)
+                        new->flags |= IRQF_DISABLED;
                if (!(desc->status & IRQ_NOAUTOEN)) {
                        desc->depth = 0;
                        desc->status &= ~IRQ_DISABLED;
@@ -775,7 +807,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
                __enable_irq(desc, irq, false);
        }
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        /*
         * Strictly no need to wake it up, but hung_task complains
@@ -802,7 +834,7 @@ mismatch:
        ret = -EBUSY;
 out_thread:
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        if (new->thread) {
                struct task_struct *t = new->thread;
@@ -844,7 +876,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
        if (!desc)
                return NULL;
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        /*
         * There can be multiple actions per IRQ descriptor, find the right
@@ -856,7 +888,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
                if (!action) {
                        WARN(1, "Trying to free already-free IRQ %d\n", irq);
-                        spin_unlock_irqrestore(&desc->lock, flags);
+                        raw_spin_unlock_irqrestore(&desc->lock, flags);
                        return NULL;
                }
@@ -884,7 +916,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
                        desc->chip->disable(irq);
        }
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        unregister_handler_proc(irq, action);
@@ -1067,7 +1099,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
                kfree(action);
 #ifdef CONFIG_DEBUG_SHIRQ
-        if (irqflags & IRQF_SHARED) {
+        if (!retval && (irqflags & IRQF_SHARED)) {
                /*
                 * It's a shared IRQ -- the driver ought to be prepared for it
                 * to happen immediately, so let's make sure....
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index fcb6c96f2627..241962280836 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -27,7 +27,7 @@ void move_masked_irq(int irq)
        if (!desc->chip->set_affinity)
                return;
-        assert_spin_locked(&desc->lock);
+        assert_raw_spin_locked(&desc->lock);
        /*
         * If there was a valid mask to work with, please
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 3fd30197da2e..65d3845665ac 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -6,6 +6,7 @@
 */
 #include <linux/irq.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/interrupt.h>
@@ -42,7 +43,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
                                "for migration.\n", irq);
                return false;
        }
-        spin_lock_init(&desc->lock);
+        raw_spin_lock_init(&desc->lock);
        desc->node = node;
        lockdep_set_class(&desc->lock, &irq_desc_lock_class);
        init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
@@ -67,10 +68,10 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
        irq = old_desc->irq;
-        spin_lock_irqsave(&sparse_irq_lock, flags);
+        raw_spin_lock_irqsave(&sparse_irq_lock, flags);
        /* We have to check it to avoid races with another CPU */
-        desc = irq_desc_ptrs[irq];
+        desc = irq_to_desc(irq);
        if (desc && old_desc != desc)
                goto out_unlock;
@@ -90,8 +91,8 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
                goto out_unlock;
        }
-        irq_desc_ptrs[irq] = desc;
+        replace_irq_desc(irq, desc);
-        spin_unlock_irqrestore(&sparse_irq_lock, flags);
+        raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
        /* free the old one */
        free_one_irq_desc(old_desc, desc);
@@ -100,7 +101,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
        return desc;
 out_unlock:
-        spin_unlock_irqrestore(&sparse_irq_lock, flags);
+        raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
        return desc;
 }
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index a0bb09e79867..0d4005d85b03 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -28,9 +28,9 @@ void suspend_device_irqs(void)
        for_each_irq_desc(irq, desc) {
                unsigned long flags;
-                spin_lock_irqsave(&desc->lock, flags);
+                raw_spin_lock_irqsave(&desc->lock, flags);
                __disable_irq(desc, irq, true);
-                spin_unlock_irqrestore(&desc->lock, flags);
+                raw_spin_unlock_irqrestore(&desc->lock, flags);
        }
        for_each_irq_desc(irq, desc)
@@ -56,9 +56,9 @@ void resume_device_irqs(void)
                if (!(desc->status & IRQ_SUSPENDED))
                        continue;
-                spin_lock_irqsave(&desc->lock, flags);
+                raw_spin_lock_irqsave(&desc->lock, flags);
                __enable_irq(desc, irq, true);
-                spin_unlock_irqrestore(&desc->lock, flags);
+                raw_spin_unlock_irqrestore(&desc->lock, flags);
        }
 }
 EXPORT_SYMBOL_GPL(resume_device_irqs);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 692363dd591f..7a6eb04ef6b5 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -7,6 +7,7 @@
 */
 #include <linux/irq.h>
+#include <linux/gfp.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/interrupt.h>
@@ -136,7 +137,7 @@ out:
 static int default_affinity_open(struct inode *inode, struct file *file)
 {
-        return single_open(file, default_affinity_show, NULL);
+        return single_open(file, default_affinity_show, PDE(inode)->data); /* NOTE(review): confirm default_affinity_show reads m->private */
 }
 static const struct file_operations default_affinity_proc_fops = {
@@ -148,18 +149,28 @@ static const struct file_operations default_affinity_proc_fops = {
 };
 #endif
-static int irq_spurious_read(char *page, char **start, off_t off,
-                                  int count, int *eof, void *data)
+/*
+ * seq_file show routine for the per-irq "spurious" proc entry.
+ * m->private is interpreted as the irq number; the descriptor's
+ * irq_count, irqs_unhandled and last_unhandled (converted to
+ * milliseconds) are printed, one per line. Always returns 0.
+ */
+static int irq_spurious_proc_show(struct seq_file *m, void *v)
 {
-        struct irq_desc *desc = irq_to_desc((long) data);
-        return sprintf(page, "count %u\n"
-                             "unhandled %u\n"
-                             "last_unhandled %u ms\n",
-                        desc->irq_count,
-                        desc->irqs_unhandled,
-                        jiffies_to_msecs(desc->last_unhandled));
+        unsigned int irq = (long) m->private;
+        struct irq_desc *desc = irq_to_desc(irq);
+        seq_printf(m, "count %u\n", desc->irq_count);
+        seq_printf(m, "unhandled %u\n", desc->irqs_unhandled);
+        seq_printf(m, "last_unhandled %u ms\n",
+                   jiffies_to_msecs(desc->last_unhandled));
+        return 0;
+}
+/*
+ * Open handler for the per-irq "spurious" proc entry.
+ *
+ * The entry is created with proc_create_data() and carries the irq
+ * number in its data pointer. That pointer has to be handed on to
+ * single_open() so it shows up as m->private in
+ * irq_spurious_proc_show(); with NULL here every file would look up
+ * irq 0 instead of its own irq.
+ */
+static int irq_spurious_proc_open(struct inode *inode, struct file *file)
+{
+        return single_open(file, irq_spurious_proc_show, PDE(inode)->data);
 }
+/*
+ * File operations for the per-irq "spurious" proc entry: a
+ * single_open()-style seq_file that is read-only from this table's
+ * point of view (no write handler is provided).
+ */
+static const struct file_operations irq_spurious_proc_fops = {
+        .open           = irq_spurious_proc_open,
+        .release        = single_release,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+};
 #define MAX_NAMELEN 128
 static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -169,7 +180,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
        unsigned long flags;
        int ret = 1;
-        spin_lock_irqsave(&desc->lock, flags);
+        raw_spin_lock_irqsave(&desc->lock, flags);
        for (action = desc->action ; action; action = action->next) {
                if ((action != new_action) && action->name &&
                                !strcmp(new_action->name, action->name)) {
@@ -177,7 +188,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
                        break;
                }
        }
-        spin_unlock_irqrestore(&desc->lock, flags);
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        return ret;
 }
@@ -204,7 +215,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
 void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 {
        char name [MAX_NAMELEN];
-        struct proc_dir_entry *entry;
        if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
                return;
@@ -214,6 +224,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
        /* create /proc/irq/1234 */
        desc->dir = proc_mkdir(name, root_irq_dir);
+        if (!desc->dir)
+                return;
 #ifdef CONFIG_SMP
        /* create /proc/irq/<irq>/smp_affinity */
@@ -221,11 +233,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
                         &irq_affinity_proc_fops, (void *)(long)irq);
 #endif
-        entry = create_proc_entry("spurious", 0444, desc->dir);
+        proc_create_data("spurious", 0444, desc->dir,
-        if (entry) {
+                         &irq_spurious_proc_fops, (void *)(long)irq);
-                entry->data = (void *)(long)irq;
-                entry->read_proc = irq_spurious_read;
-        }
 }
 #undef MAX_NAMELEN
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index bd7273e6282e..89fb90ae534f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -28,7 +28,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
        struct irqaction *action;
        int ok = 0, work = 0;
-        spin_lock(&desc->lock);
+        raw_spin_lock(&desc->lock);
        /* Already running on another processor */
        if (desc->status & IRQ_INPROGRESS) {
                /*
@@ -37,13 +37,13 @@ static int try_one_irq(int irq, struct irq_desc *desc)
                 */
                if (desc->action && (desc->action->flags & IRQF_SHARED))
                        desc->status |= IRQ_PENDING;
-                spin_unlock(&desc->lock);
+                raw_spin_unlock(&desc->lock);
                return ok;
        }
        /* Honour the normal IRQ locking */
        desc->status |= IRQ_INPROGRESS;
        action = desc->action;
-        spin_unlock(&desc->lock);
+        raw_spin_unlock(&desc->lock);
        while (action) {
                /* Only shared IRQ handlers are safe to call */
@@ -56,7 +56,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
        }
        local_irq_disable();
        /* Now clean up the flags */
-        spin_lock(&desc->lock);
+        raw_spin_lock(&desc->lock);
        action = desc->action;
        /*
@@ -68,9 +68,9 @@ static int try_one_irq(int irq, struct irq_desc *desc)
                 * Perform real IRQ processing for the IRQ we deferred
                 */
                work = 1;
-                spin_unlock(&desc->lock);
+                raw_spin_unlock(&desc->lock);
                handle_IRQ_event(irq, action);
-                spin_lock(&desc->lock);
+                raw_spin_lock(&desc->lock);
                desc->status &= ~IRQ_PENDING;
        }
        desc->status &= ~IRQ_INPROGRESS;
@@ -80,7 +80,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
         */
        if (work && desc->chip && desc->chip->end)
                desc->chip->end(irq);
-        spin_unlock(&desc->lock);
+        raw_spin_unlock(&desc->lock);
        return ok;
 }
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
        return ok;
 }
-static void poll_all_shared_irqs(void)
+static void poll_spurious_irqs(unsigned long dummy)
 {
        struct irq_desc *desc;
        int i;
@@ -125,23 +125,11 @@ static void poll_all_shared_irqs(void)
                try_one_irq(i, desc);
                local_irq_enable();
        }
-}
-static void poll_spurious_irqs(unsigned long dummy)
-{
-        poll_all_shared_irqs();
        mod_timer(&poll_spurious_irq_timer,
                  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
 }
-#ifdef CONFIG_DEBUG_SHIRQ
-void debug_poll_all_shared_irqs(void)
-{
-        poll_all_shared_irqs();
-}
-#endif
 /*
 * If 99,900 of the previous 100,000 interrupts have not been handled
 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -232,7 +220,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
                /*
                 * If we are seeing only the odd spurious IRQ caused by
                 * bus asynchronicity then don't eventually trigger an error,
-                 * otherwise the couter becomes a doomsday timer for otherwise
+                 * otherwise the counter becomes a doomsday timer for otherwise
                 * working systems
                 */
                if (time_after(jiffies, desc->last_unhandled + HZ/10))
diff --git a/kernel/itimer.c b/kernel/itimer.c
index b03451ede528..d802883153da 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -146,6 +146,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
 {
        cputime_t cval, nval, cinterval, ninterval;
        s64 ns_ninterval, ns_nval;
+        u32 error, incr_error;
        struct cpu_itimer *it = &tsk->signal->it[clock_id];
        nval = timeval_to_cputime(&value->it_value);
@@ -153,8 +154,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
        ninterval = timeval_to_cputime(&value->it_interval);
        ns_ninterval = timeval_to_ns(&value->it_interval);
-        it->incr_error = cputime_sub_ns(ninterval, ns_ninterval);
+        error = cputime_sub_ns(nval, ns_nval);
-        it->error = cputime_sub_ns(nval, ns_nval);
+        incr_error = cputime_sub_ns(ninterval, ns_ninterval);
        spin_lock_irq(&tsk->sighand->siglock);
@@ -168,6 +169,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
        }
        it->expires = nval;
        it->incr = ninterval;
+        it->error = error;
+        it->incr_error = incr_error;
        trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
                           ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8b6b8b697c68..13aff293f4de 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -21,6 +21,7 @@
 #include <linux/sched.h>        /* for cond_resched */
 #include <linux/mm.h>
 #include <linux/ctype.h>
+#include <linux/slab.h>
 #include <asm/sections.h>
@@ -181,6 +182,7 @@ unsigned long kallsyms_lookup_name(const char *name)
        }
        return module_kallsyms_lookup_name(name);
 }
+EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
 int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
                                      unsigned long),
diff --git a/kernel/kexec.c b/kernel/kexec.c
index f336e2107f98..474a84715eac 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -21,7 +21,7 @@
 #include <linux/hardirq.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
-#include <linux/utsrelease.h>
+#include <generated/utsrelease.h>
 #include <linux/utsname.h>
 #include <linux/numa.h>
 #include <linux/suspend.h>
@@ -31,6 +31,8 @@
 #include <linux/cpu.h>
 #include <linux/console.h>
 #include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/kmsg_dump.h>
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -39,7 +41,7 @@
 #include <asm/sections.h>
 /* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t* crash_notes;
+note_buf_t __percpu *crash_notes;
 /* vmcoreinfo stuff */
 static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
@@ -1073,6 +1075,9 @@ void crash_kexec(struct pt_regs *regs)
        if (mutex_trylock(&kexec_mutex)) {
                if (kexec_crash_image) {
                        struct pt_regs fixed_regs;
+                        kmsg_dump(KMSG_DUMP_KEXEC);
                        crash_setup_regs(&fixed_regs, regs);
                        crash_save_vmcoreinfo();
                        machine_crash_shutdown(&fixed_regs);
@@ -1082,6 +1087,62 @@ void crash_kexec(struct pt_regs *regs)
        }
 }
+/*
+ * crash_get_memory_size - size in bytes of the reserved crash kernel region
+ *
+ * crashk_res.end is inclusive, hence the "+ 1".  A resource whose start and
+ * end are equal (e.g. one that was never set up) is reported as 0 bytes
+ * rather than as a bogus one-byte reservation.
+ *
+ * Takes kexec_mutex while reading crashk_res.
+ */
+size_t crash_get_memory_size(void)
+{
+        size_t size = 0;
+        mutex_lock(&kexec_mutex);
+        if (crashk_res.end != crashk_res.start)
+                size = crashk_res.end - crashk_res.start + 1;
+        mutex_unlock(&kexec_mutex);
+        return size;
+}
+/*
+ * Walk the physical range [begin, end) in PAGE_SIZE steps; for each page
+ * clear its reserved flag, re-initialise its count, free it and bump
+ * totalram_pages.
+ */
+static void free_reserved_phys_range(unsigned long begin, unsigned long end)
+{
+        unsigned long paddr = begin;
+        while (paddr < end) {
+                struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
+                ClearPageReserved(page);
+                init_page_count(page);
+                free_page((unsigned long)__va(paddr));
+                totalram_pages++;
+                paddr += PAGE_SIZE;
+        }
+}
+/*
+ * crash_shrink_memory - give back part of the reserved crash kernel region
+ * @new_size: requested size of the region in bytes
+ *
+ * Frees the pages between the (page rounded) new end and the current end of
+ * crashk_res and shortens the resource accordingly.  When the region shrinks
+ * to nothing the resource is released as well.
+ *
+ * Returns 0 on success or when @new_size equals the current size, -ENOENT
+ * if a crash image is loaded, -EINVAL if @new_size exceeds the current size.
+ */
+int crash_shrink_memory(unsigned long new_size)
+{
+        int ret = 0;
+        unsigned long start, end;
+        mutex_lock(&kexec_mutex);
+        if (kexec_crash_image) {
+                ret = -ENOENT;
+                goto unlock;
+        }
+        start = crashk_res.start;
+        end = crashk_res.end;
+        if (new_size >= end - start + 1) {
+                ret = -EINVAL;
+                if (new_size == end - start + 1)
+                        ret = 0;
+                goto unlock;
+        }
+        start = roundup(start, PAGE_SIZE);
+        end = roundup(start + new_size, PAGE_SIZE);
+        free_reserved_phys_range(end, crashk_res.end);
+        /*
+         * Only release a resource that is linked into the resource tree;
+         * release_resource() walks ->parent and must not see NULL there.
+         */
+        if ((start == end) && (crashk_res.parent != NULL))
+                release_resource(&crashk_res);
+        crashk_res.end = end - 1;
+unlock:
+        mutex_unlock(&kexec_mutex);
+        return ret;
+}
 static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
                            size_t data_len)
 {
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 3765ff3c1bbe..35edbe22e9a9 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -1,6 +1,7 @@
 /*
- * A simple kernel FIFO implementation.
+ * A generic kernel FIFO implementation.
 *
+ * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net>
 * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
 *
 * This program is free software; you can redistribute it and/or modify
@@ -25,50 +26,48 @@
 #include <linux/err.h>
 #include <linux/kfifo.h>
 #include <linux/log2.h>
+#include <linux/uaccess.h>
+/* Attach @buffer of @size bytes to @fifo, then reset it with kfifo_reset(). */
+static void _kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size)
+{
+        fifo->size = size;
+        fifo->buffer = buffer;
+        kfifo_reset(fifo);
+}
 /**
- * kfifo_init - allocates a new FIFO using a preallocated buffer
+ * kfifo_init - initialize a FIFO using a preallocated buffer
+ * @fifo: the fifo to assign the buffer
 * @buffer: the preallocated buffer to be used.
- * @size: the size of the internal buffer, this have to be a power of 2.
+ * @size: the size of the internal buffer, this has to be a power of 2.
- * @gfp_mask: get_free_pages mask, passed to kmalloc()
- * @lock: the lock to be used to protect the fifo buffer
 *
- * Do NOT pass the kfifo to kfifo_free() after use! Simply free the
- * &struct kfifo with kfree().
 */
-struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
+void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size)
-                         gfp_t gfp_mask, spinlock_t *lock)
 {
-        struct kfifo *fifo;
        /* size must be a power of 2 */
        BUG_ON(!is_power_of_2(size));
-        fifo = kmalloc(sizeof(struct kfifo), gfp_mask);
+        _kfifo_init(fifo, buffer, size);
-        if (!fifo)
-                return ERR_PTR(-ENOMEM);
-        fifo->buffer = buffer;
-        fifo->size = size;
-        fifo->in = fifo->out = 0;
-        fifo->lock = lock;
-        return fifo;
 }
 EXPORT_SYMBOL(kfifo_init);
 /**
- * kfifo_alloc - allocates a new FIFO and its internal buffer
+ * kfifo_alloc - allocates a new FIFO internal buffer
- * @size: the size of the internal buffer to be allocated.
+ * @fifo: the fifo to assign the new buffer to
+ * @size: the size of the buffer to be allocated; it need not be a power of 2.
 * @gfp_mask: get_free_pages mask, passed to kmalloc()
- * @lock: the lock to be used to protect the fifo buffer
+ *
+ * This function dynamically allocates a new fifo internal buffer
 *
 * The size will be rounded-up to a power of 2.
+ * The buffer will be released with kfifo_free().
+ * Return 0 if no error, otherwise an error code.
 */
-struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
+int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
 {
        unsigned char *buffer;
-        struct kfifo *ret;
        /*
         * round up to the next power of 2, since our 'let the indices
@@ -80,48 +79,93 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
        }
        buffer = kmalloc(size, gfp_mask);
-        if (!buffer)
+        if (!buffer) {
-                return ERR_PTR(-ENOMEM);
+                _kfifo_init(fifo, NULL, 0);
+                return -ENOMEM;
-        ret = kfifo_init(buffer, size, gfp_mask, lock);
+        }
-        if (IS_ERR(ret))
+        _kfifo_init(fifo, buffer, size);
-                kfree(buffer);
-        return ret;
+        return 0;
 }
 EXPORT_SYMBOL(kfifo_alloc);
 /**
- * kfifo_free - frees the FIFO
+ * kfifo_free - frees the FIFO internal buffer
 * @fifo: the fifo to be freed.
 */
 void kfifo_free(struct kfifo *fifo)
 {
        kfree(fifo->buffer);
-        kfree(fifo);
+        _kfifo_init(fifo, NULL, 0);
 }
 EXPORT_SYMBOL(kfifo_free);
 /**
- * __kfifo_put - puts some data into the FIFO, no locking version
+ * kfifo_skip - skip output data
 * @fifo: the fifo to be used.
- * @buffer: the data to be added.
+ * @len: number of bytes to skip
- * @len: the length of the data to be added.
- *
- * This function copies at most @len bytes from the @buffer into
- * the FIFO depending on the free space, and returns the number of
- * bytes copied.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these functions.
 */
-unsigned int __kfifo_put(struct kfifo *fifo,
+void kfifo_skip(struct kfifo *fifo, unsigned int len)
-                        const unsigned char *buffer, unsigned int len)
+{
+        /* nothing would remain after the skip: use kfifo_reset_out() */
+        if (len >= kfifo_len(fifo)) {
+                kfifo_reset_out(fifo);
+                return;
+        }
+        /* otherwise just step the out index past @len bytes */
+        __kfifo_add_out(fifo, len);
+}
+EXPORT_SYMBOL(kfifo_skip);
+/*
+ * Copy @len bytes from @from into the fifo buffer, starting @off bytes past
+ * fifo->in and wrapping to the start of the buffer where needed.
+ * fifo->in is not advanced here; the callers in this file check or clamp
+ * @len against kfifo_avail() before calling.
+ */
+static inline void __kfifo_in_data(struct kfifo *fifo,
+                const void *from, unsigned int len, unsigned int off)
 {
        unsigned int l;
-        len = min(len, fifo->size - fifo->in + fifo->out);
+        /*
+         * Ensure that we sample the fifo->out index -before- we
+         * start putting bytes into the kfifo.
+         */
+        smp_mb();
+        off = __kfifo_off(fifo, fifo->in + off);
+        /* first put the data starting from fifo->in to buffer end */
+        l = min(len, fifo->size - off);
+        memcpy(fifo->buffer + off, from, l);
+        /* then put the rest (if any) at the beginning of the buffer */
+        memcpy(fifo->buffer, from + l, len - l);
+}
+/*
+ * Copy @len bytes out of the fifo buffer into @to, reading from @off bytes
+ * past fifo->out and wrapping to the start of the buffer where needed.
+ * fifo->out is not advanced here.
+ */
+static inline void __kfifo_out_data(struct kfifo *fifo,
+                void *to, unsigned int len, unsigned int off)
+{
+        unsigned int pos, head;
+        /*
+         * Ensure that we sample the fifo->in index -before- we
+         * start removing bytes from the kfifo.
+         */
+        smp_rmb();
+        pos = __kfifo_off(fifo, fifo->out + off);
+        /* part one: from the read position up to the end of the buffer */
+        head = min(len, fifo->size - pos);
+        memcpy(to, fifo->buffer + pos, head);
+        /* part two: whatever is left comes from the start of the buffer */
+        memcpy(to + head, fifo->buffer, len - head);
+}
+static inline int __kfifo_from_user_data(struct kfifo *fifo,
+         const void __user *from, unsigned int len, unsigned int off,
+         unsigned *lenout)
+{
+        unsigned int l;
+        int ret;
        /*
         * Ensure that we sample the fifo->out index -before- we
@@ -130,68 +174,272 @@ unsigned int __kfifo_put(struct kfifo *fifo,
        smp_mb();
+        off = __kfifo_off(fifo, fifo->in + off);
        /* first put the data starting from fifo->in to buffer end */
-        l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
+        l = min(len, fifo->size - off);
-        memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);
+        ret = copy_from_user(fifo->buffer + off, from, l);
+        if (unlikely(ret)) {
+                *lenout = ret;
+                return -EFAULT;
+        }
+        *lenout = l;
        /* then put the rest (if any) at the beginning of the buffer */
-        memcpy(fifo->buffer, buffer + l, len - l);
+        ret = copy_from_user(fifo->buffer, from + l, len - l);
+        *lenout += ret ? ret : len - l;
+        return ret ? -EFAULT : 0;
+}
+/*
+ * Copy @len bytes from the fifo buffer to the user buffer @to, reading from
+ * @off bytes past fifo->out and wrapping to the buffer start where needed.
+ * fifo->out is not advanced here.
+ *
+ * *@lenout is set to the number of bytes that were copied, also when a
+ * copy_to_user() call stops short.  Returns 0 on success, -EFAULT otherwise.
+ */
+static inline int __kfifo_to_user_data(struct kfifo *fifo,
+                void __user *to, unsigned int len, unsigned int off, unsigned *lenout)
+{
+        unsigned int l;
+        int ret;
        /*
-         * Ensure that we add the bytes to the kfifo -before-
+         * Ensure that we sample the fifo->in index -before- we
-         * we update the fifo->in index.
+         * start removing bytes from the kfifo.
         */
-        smp_wmb();
+        smp_rmb();
+        off = __kfifo_off(fifo, fifo->out + off);
+        /* first get the data from fifo->out until the end of the buffer */
+        l = min(len, fifo->size - off);
+        ret = copy_to_user(to, fifo->buffer + off, l);
+        *lenout = l;
+        if (unlikely(ret)) {
+                /* copy_to_user() returns the count it could NOT copy */
+                *lenout -= ret;
+                return -EFAULT;
+        }
+        /* then get the rest (if any) from the beginning of the buffer */
+        len -= l;
+        ret = copy_to_user(to + l, fifo->buffer, len);
+        if (unlikely(ret)) {
+                *lenout += len - ret;
+                return -EFAULT;
+        }
+        *lenout += len;
+        return 0;
+}
+/*
+ * Copy @len bytes from @from into the fifo at @recsize bytes past fifo->in.
+ * fifo->in is not advanced here.
+ *
+ * Returns 0 on success, or @len + 1 when the fifo has less than
+ * @len + @recsize bytes of free space (nothing is copied in that case).
+ */
+unsigned int __kfifo_in_n(struct kfifo *fifo,
+        const void *from, unsigned int len, unsigned int recsize)
+{
+        if (len + recsize <= kfifo_avail(fifo)) {
+                __kfifo_in_data(fifo, from, len, recsize);
+                return 0;
+        }
+        return len + 1;
+}
+EXPORT_SYMBOL(__kfifo_in_n);
-        fifo->in += len;
+/**
+ * kfifo_in - puts some data into the FIFO
+ * @fifo: the fifo to be used.
+ * @from: the data to be added.
+ * @len: the length of the data to be added.
+ *
+ * This function copies at most @len bytes from the @from buffer into
+ * the FIFO depending on the free space, and returns the number of
+ * bytes copied.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these functions.
+ */
+unsigned int kfifo_in(struct kfifo *fifo, const void *from,
+                                unsigned int len)
+{
+        len = min(kfifo_avail(fifo), len);
+        __kfifo_in_data(fifo, from, len, 0);
+        __kfifo_add_in(fifo, len);
        return len;
 }
-EXPORT_SYMBOL(__kfifo_put);
+EXPORT_SYMBOL(kfifo_in);
+/*
+ * Exported wrapper: forwards its arguments unchanged to __kfifo_in_rec()
+ * and returns that function's result.
+ */
+unsigned int __kfifo_in_generic(struct kfifo *fifo,
+        const void *from, unsigned int len, unsigned int recsize)
+{
+        return __kfifo_in_rec(fifo, from, len, recsize);
+}
+EXPORT_SYMBOL(__kfifo_in_generic);
+/*
+ * Copy @len bytes from the fifo into @to, reading from @recsize bytes past
+ * fifo->out, then advance fifo->out by @len + @recsize.
+ *
+ * Returns 0 on success, or @len (with nothing consumed) when the fifo
+ * holds fewer than @len + @recsize bytes.
+ */
+unsigned int __kfifo_out_n(struct kfifo *fifo,
+        void *to, unsigned int len, unsigned int recsize)
+{
+        unsigned int needed = len + recsize;
+        if (needed > kfifo_len(fifo))
+                return len;
+        __kfifo_out_data(fifo, to, len, recsize);
+        __kfifo_add_out(fifo, needed);
+        return 0;
+}
+EXPORT_SYMBOL(__kfifo_out_n);
 /**
- * __kfifo_get - gets some data from the FIFO, no locking version
+ * kfifo_out - gets some data from the FIFO
 * @fifo: the fifo to be used.
- * @buffer: where the data must be copied.
+ * @to: where the data must be copied.
 * @len: the size of the destination buffer.
 *
 * This function copies at most @len bytes from the FIFO into the
- * @buffer and returns the number of copied bytes.
+ * @to buffer and returns the number of copied bytes.
 *
 * Note that with only one concurrent reader and one concurrent
 * writer, you don't need extra locking to use these functions.
 */
-unsigned int __kfifo_get(struct kfifo *fifo,
+unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len)
-                         unsigned char *buffer, unsigned int len)
 {
-        unsigned int l;
+        len = min(kfifo_len(fifo), len);
-        len = min(len, fifo->in - fifo->out);
+        __kfifo_out_data(fifo, to, len, 0);
+        __kfifo_add_out(fifo, len);
-        /*
+        return len;
-         * Ensure that we sample the fifo->in index -before- we
+}
-         * start removing bytes from the kfifo.
+EXPORT_SYMBOL(kfifo_out);
-         */
-        smp_rmb();
+/**
+ * kfifo_out_peek - copy some data from the FIFO, but do not remove it
+ * @fifo: the fifo to be used.
+ * @to: where the data must be copied.
+ * @len: the size of the destination buffer.
+ * @offset: offset into the fifo
+ *
+ * This function copies at most @len bytes at @offset from the FIFO
+ * into the @to buffer and returns the number of copied bytes.
+ * The data is not removed from the FIFO.
+ */
+unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len,
+                            unsigned offset)
+{
+        unsigned int queued = kfifo_len(fifo);
+        /* nothing to peek at when @offset lies at or beyond the queued data */
+        if (offset >= queued)
+                return 0;
+        /*
+         * Clamp to the data queued past @offset.  Clamping against
+         * len + offset instead could copy more than @len bytes into @to
+         * and read beyond the end of the queued data.
+         */
+        len = min(queued - offset, len);
-        /* first get the data from fifo->out until the end of the buffer */
+        __kfifo_out_data(fifo, to, len, offset);
-        l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
+        return len;
-        memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l);
+}
+EXPORT_SYMBOL(kfifo_out_peek);
-        /* then get the rest (if any) from the beginning of the buffer */
+unsigned int __kfifo_out_generic(struct kfifo *fifo,
-        memcpy(buffer + l, fifo->buffer, len - l);
+        void *to, unsigned int len, unsigned int recsize,
+        unsigned int *total)
+{
+        return __kfifo_out_rec(fifo, to, len, recsize, total);
+}
+EXPORT_SYMBOL(__kfifo_out_generic);
-        /*
+unsigned int __kfifo_from_user_n(struct kfifo *fifo,
-         * Ensure that we remove the bytes from the kfifo -before-
+        const void __user *from, unsigned int len, unsigned int recsize)
-         * we update the fifo->out index.
+{
-         */
+        unsigned total;
-        smp_mb();
+        if (kfifo_avail(fifo) < len + recsize)
+                return len + 1;
-        fifo->out += len;
+        __kfifo_from_user_data(fifo, from, len, recsize, &total);
+        return total;
+}
+EXPORT_SYMBOL(__kfifo_from_user_n);
-        return len;
+/**
+ * kfifo_from_user - puts some data from user space into the FIFO
+ * @fifo: the fifo to be used.
+ * @from: pointer to the data to be added.
+ * @len: the length of the data to be added.
+ * @total: the actual returned data length.
+ *
+ * This function copies at most @len bytes from @from into the
+ * FIFO, depending on the free space, and returns -EFAULT or 0.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these functions.
+ */
+int kfifo_from_user(struct kfifo *fifo,
+        const void __user *from, unsigned int len, unsigned *total)
+{
+        int ret;
+        len = min(kfifo_avail(fifo), len);
+        ret = __kfifo_from_user_data(fifo, from, len, 0, total);
+        if (ret)
+                return ret;
+        __kfifo_add_in(fifo, len);
+        return 0;
 }
-EXPORT_SYMBOL(__kfifo_get);
+EXPORT_SYMBOL(kfifo_from_user);
+/*
+ * Exported wrapper: forwards its arguments unchanged to
+ * __kfifo_from_user_rec() and returns that function's result.
+ */
+unsigned int __kfifo_from_user_generic(struct kfifo *fifo,
+        const void __user *from, unsigned int len, unsigned int recsize)
+{
+        return __kfifo_from_user_rec(fifo, from, len, recsize);
+}
+EXPORT_SYMBOL(__kfifo_from_user_generic);
+/*
+ * Copy @reclen bytes, found @recsize bytes past fifo->out, to the user
+ * buffer @to.  fifo->out is advanced by @reclen + @recsize only when the
+ * whole copy succeeded.
+ *
+ * Returns @len when the fifo holds fewer than @reclen + @recsize bytes,
+ * otherwise the byte count reported by __kfifo_to_user_data().
+ */
+unsigned int __kfifo_to_user_n(struct kfifo *fifo,
+        void __user *to, unsigned int len, unsigned int reclen,
+        unsigned int recsize)
+{
+        unsigned int consumed = reclen + recsize;
+        unsigned int total;
+        if (consumed > kfifo_len(fifo))
+                return len;
+        if (likely(__kfifo_to_user_data(fifo, to, reclen, recsize, &total) == 0))
+                __kfifo_add_out(fifo, consumed);
+        return total;
+}
+EXPORT_SYMBOL(__kfifo_to_user_n);
+/**
+ * kfifo_to_user - gets data from the FIFO and writes it to user space
+ * @fifo: the fifo to be used.
+ * @to: where the data must be copied.
+ * @len: the size of the destination buffer.
+ * @lenout: pointer to output variable receiving the number of copied bytes
+ *
+ * This function copies at most @len bytes from the FIFO into the
+ * @to buffer and returns 0 or -EFAULT.  The bytes counted in @lenout are
+ * removed from the FIFO, also when the copy stopped short.
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these functions.
+ */
+int kfifo_to_user(struct kfifo *fifo,
+        void __user *to, unsigned int len, unsigned *lenout)
+{
+        unsigned int chunk = min(kfifo_len(fifo), len);
+        int err = __kfifo_to_user_data(fifo, to, chunk, 0, lenout);
+        /* consume exactly what was copied out */
+        __kfifo_add_out(fifo, *lenout);
+        return err;
+}
+EXPORT_SYMBOL(kfifo_to_user);
+/*
+ * Exported wrapper: forwards its arguments unchanged to
+ * __kfifo_to_user_rec() and returns that function's result.
+ */
+unsigned int __kfifo_to_user_generic(struct kfifo *fifo,
+        void __user *to, unsigned int len, unsigned int recsize,
+        unsigned int *total)
+{
+        return __kfifo_to_user_rec(fifo, to, len, recsize, total);
+}
+EXPORT_SYMBOL(__kfifo_to_user_generic);
+/*
+ * Exported record peek: for @recsize == 0 it returns kfifo_avail(),
+ * otherwise it defers to __kfifo_peek_n().
+ *
+ * NOTE(review): the read-side helpers in this file (__kfifo_out_n(),
+ * kfifo_out(), __kfifo_to_user_n()) size against kfifo_len(), while
+ * kfifo_avail() is what the write side checks.  Returning kfifo_avail()
+ * from a peek looks suspicious - confirm against kfifo_peek_rec() in
+ * <linux/kfifo.h>.
+ */
+unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize)
+{
+        if (recsize == 0)
+                return kfifo_avail(fifo);
+        return __kfifo_peek_n(fifo, recsize);
+}
+EXPORT_SYMBOL(__kfifo_peek_generic);
+/* Exported wrapper: forwards its arguments unchanged to __kfifo_skip_rec(). */
+void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize)
+{
+        __kfifo_skip_rec(fifo, recsize);
+}
+EXPORT_SYMBOL(__kfifo_skip_generic);
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 9147a3190c9d..11f3515ca83f 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -69,9 +69,16 @@ struct kgdb_state {
        struct pt_regs          *linux_regs;
 };
+/* Exception state values */
+#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
+#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
+#define DCPU_IS_SLAVE    0x4 /* Slave cpu enter exception */
+#define DCPU_SSTEP       0x8 /* CPU is single stepping */
 static struct debuggerinfo_struct {
        void                    *debuggerinfo;
        struct task_struct      *task;
+        int                     exception_state;
 } kgdb_info[NR_CPUS];
 /**
@@ -129,6 +136,7 @@ struct task_struct		*kgdb_usethread;
 struct task_struct              *kgdb_contthread;
 int                             kgdb_single_step;
+pid_t                           kgdb_sstep_pid;
 /* Our I/O buffers. */
 static char                     remcom_in_buffer[BUFMAX];
@@ -390,27 +398,22 @@ int kgdb_mem2hex(char *mem, char *buf, int count)
 /*
 * Copy the binary array pointed to by buf into mem.  Fix $, #, and
- * 0x7d escaped with 0x7d.  Return a pointer to the character after
+ * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
- * the last byte written.
+ * The input buf is overwritten with the result to write to mem.
 */
 static int kgdb_ebin2mem(char *buf, char *mem, int count)
 {
-        int err = 0;
+        int size = 0;
-        char c;
+        char *c = buf;
        while (count-- > 0) {
-                c = *buf++;
+                c[size] = *buf++;
-                if (c == 0x7d)
+                if (c[size] == 0x7d)
-                        c = *buf++ ^ 0x20;
+                        c[size] = *buf++ ^ 0x20;
+                size++;
-                err = probe_kernel_write(mem, &c, 1);
-                if (err)
-                        break;
-                mem++;
        }
-        return err;
+        return probe_kernel_write(mem, c, size);
 }
 /*
@@ -541,12 +544,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
         */
        if (tid == 0 || tid == -1)
                tid = -atomic_read(&kgdb_active) - 2;
-        if (tid < 0) {
+        if (tid < -1 && tid > -NR_CPUS - 2) {
                if (kgdb_info[-tid - 2].task)
                        return kgdb_info[-tid - 2].task;
                else
                        return idle_task(-tid - 2);
        }
+        if (tid <= 0) {
+                printk(KERN_ERR "KGDB: Internal thread select error\n");
+                dump_stack();
+                return NULL;
+        }
        /*
         * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -557,46 +565,6 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
 }
 /*
- * CPU debug state control:
- */
-#ifdef CONFIG_SMP
-static void kgdb_wait(struct pt_regs *regs)
-{
-        unsigned long flags;
-        int cpu;
-        local_irq_save(flags);
-        cpu = raw_smp_processor_id();
-        kgdb_info[cpu].debuggerinfo = regs;
-        kgdb_info[cpu].task = current;
-        /*
-         * Make sure the above info reaches the primary CPU before
-         * our cpu_in_kgdb[] flag setting does:
-         */
-        smp_wmb();
-        atomic_set(&cpu_in_kgdb[cpu], 1);
-        /* Wait till primary CPU is done with debugging */
-        while (atomic_read(&passive_cpu_wait[cpu]))
-                cpu_relax();
-        kgdb_info[cpu].debuggerinfo = NULL;
-        kgdb_info[cpu].task = NULL;
-        /* fix up hardware debug registers on local cpu */
-        if (arch_kgdb_ops.correct_hw_break)
-                arch_kgdb_ops.correct_hw_break();
-        /* Signal the primary CPU that we are done: */
-        atomic_set(&cpu_in_kgdb[cpu], 0);
-        touch_softlockup_watchdog();
-        clocksource_touch_watchdog();
-        local_irq_restore(flags);
-}
-#endif
-/*
 * Some architectures need cache flushes when we set/clear a
 * breakpoint:
 */
@@ -619,7 +587,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
 static int kgdb_activate_sw_breakpoints(void)
 {
        unsigned long addr;
-        int error = 0;
+        int error;
+        int ret = 0;
        int i;
        for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -629,13 +598,16 @@ static int kgdb_activate_sw_breakpoints(void)
                addr = kgdb_break[i].bpt_addr;
                error = kgdb_arch_set_breakpoint(addr,
                                kgdb_break[i].saved_instr);
-                if (error)
+                if (error) {
-                        return error;
+                        ret = error;
+                        printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
+                        continue;
+                }
                kgdb_flush_swbreak_addr(addr);
                kgdb_break[i].state = BP_ACTIVE;
        }
-        return 0;
+        return ret;
 }
 static int kgdb_set_sw_break(unsigned long addr)
@@ -682,7 +654,8 @@ static int kgdb_set_sw_break(unsigned long addr)
 static int kgdb_deactivate_sw_breakpoints(void)
 {
        unsigned long addr;
-        int error = 0;
+        int error;
+        int ret = 0;
        int i;
        for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -691,13 +664,15 @@ static int kgdb_deactivate_sw_breakpoints(void)
                addr = kgdb_break[i].bpt_addr;
                error = kgdb_arch_remove_breakpoint(addr,
                                        kgdb_break[i].saved_instr);
-                if (error)
+                if (error) {
-                        return error;
+                        printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
+                        ret = error;
+                }
                kgdb_flush_swbreak_addr(addr);
                kgdb_break[i].state = BP_SET;
        }
-        return 0;
+        return ret;
 }
 static int kgdb_remove_sw_break(unsigned long addr)
@@ -870,7 +845,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
        /*
         * All threads that don't have debuggerinfo should be
-         * in __schedule() sleeping, since all other CPUs
+         * in schedule() sleeping, since all other CPUs
         * are in kgdb_wait, and thus have debuggerinfo.
         */
        if (local_debuggerinfo) {
@@ -1204,8 +1179,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
                return 1;
        } else {
-                error_packet(remcom_out_buffer, -EINVAL);
+                kgdb_msg_write("KGDB only knows signal 9 (pass)"
-                return 0;
+                        " and 15 (pass and disconnect)\n"
+                        "Executing a continue without signal passing\n", 0);
+                remcom_in_buffer[0] = 'c';
        }
        /* Indicate fall through */
@@ -1382,33 +1359,13 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
        return 1;
 }
-/*
+static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
- * kgdb_handle_exception() - main entry point from a kernel exception
- *
- * Locking hierarchy:
- *      interface locks, if any (begin_session)
- *      kgdb lock (kgdb_active)
- */
-int
-kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
 {
-        struct kgdb_state kgdb_var;
-        struct kgdb_state *ks = &kgdb_var;
        unsigned long flags;
+        int sstep_tries = 100;
        int error = 0;
        int i, cpu;
+        int trace_on = 0;
-        ks->cpu                 = raw_smp_processor_id();
-        ks->ex_vector           = evector;
-        ks->signo               = signo;
-        ks->ex_vector           = evector;
-        ks->err_code            = ecode;
-        ks->kgdb_usethreadid    = 0;
-        ks->linux_regs          = regs;
-        if (kgdb_reenter_check(ks))
-                return 0; /* Ouch, double exception ! */
 acquirelock:
        /*
         * Interrupts will be restored by the 'trap return' code, except when
@@ -1416,24 +1373,55 @@ acquirelock:
         */
        local_irq_save(flags);
-        cpu = raw_smp_processor_id();
+        cpu = ks->cpu;
+        kgdb_info[cpu].debuggerinfo = regs;
+        kgdb_info[cpu].task = current;
+        /*
+         * Make sure the above info reaches the primary CPU before
+         * our cpu_in_kgdb[] flag setting does:
+         */
+        atomic_inc(&cpu_in_kgdb[cpu]);
        /*
-         * Acquire the kgdb_active lock:
+         * CPU will loop if it is a slave or request to become a kgdb
+         * master cpu and acquire the kgdb_active lock:
         */
-        while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1)
+        while (1) {
+                if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
+                        if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
+                                break;
+                } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
+                        if (!atomic_read(&passive_cpu_wait[cpu]))
+                                goto return_normal;
+                } else {
+return_normal:
+                        /* Return to normal operation by executing any
+                         * hw breakpoint fixup.
+                         */
+                        if (arch_kgdb_ops.correct_hw_break)
+                                arch_kgdb_ops.correct_hw_break();
+                        if (trace_on)
+                                tracing_on();
+                        atomic_dec(&cpu_in_kgdb[cpu]);
+                        touch_softlockup_watchdog_sync();
+                        clocksource_touch_watchdog();
+                        local_irq_restore(flags);
+                        return 0;
+                }
                cpu_relax();
+        }
        /*
-         * Do not start the debugger connection on this CPU if the last
+         * For single stepping, try to only enter on the processor
-         * instance of the exception handler wanted to come into the
+         * that was single stepping.  To guard against a deadlock, the
-         * debugger on a different CPU via a single step
+         * kernel will only try for the value of sstep_tries before
+         * giving up and continuing on.
         */
        if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
-            atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
+            (kgdb_info[cpu].task &&
+             kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
                atomic_set(&kgdb_active, -1);
-                touch_softlockup_watchdog();
+                touch_softlockup_watchdog_sync();
                clocksource_touch_watchdog();
                local_irq_restore(flags);
@@ -1455,9 +1443,6 @@ acquirelock:
        if (kgdb_io_ops->pre_exception)
                kgdb_io_ops->pre_exception();
-        kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs;
-        kgdb_info[ks->cpu].task = current;
        kgdb_disable_hw_debug(ks->linux_regs);
        /*
@@ -1466,15 +1451,9 @@ acquirelock:
         */
        if (!kgdb_single_step) {
                for (i = 0; i < NR_CPUS; i++)
-                        atomic_set(&passive_cpu_wait[i], 1);
+                        atomic_inc(&passive_cpu_wait[i]);
        }
-        /*
-         * spin_lock code is good enough as a barrier so we don't
-         * need one here:
-         */
-        atomic_set(&cpu_in_kgdb[ks->cpu], 1);
 #ifdef CONFIG_SMP
        /* Signal the other CPUs to enter kgdb_wait() */
        if ((!kgdb_single_step) && kgdb_do_roundup)
@@ -1498,6 +1477,9 @@ acquirelock:
        kgdb_single_step = 0;
        kgdb_contthread = current;
        exception_level = 0;
+        trace_on = tracing_is_on();
+        if (trace_on)
+                tracing_off();
        /* Talk to debugger with gdbserial protocol */
        error = gdb_serial_stub(ks);
@@ -1506,13 +1488,11 @@ acquirelock:
        if (kgdb_io_ops->post_exception)
                kgdb_io_ops->post_exception();
-        kgdb_info[ks->cpu].debuggerinfo = NULL;
+        atomic_dec(&cpu_in_kgdb[ks->cpu]);
-        kgdb_info[ks->cpu].task = NULL;
-        atomic_set(&cpu_in_kgdb[ks->cpu], 0);
        if (!kgdb_single_step) {
                for (i = NR_CPUS-1; i >= 0; i--)
-                        atomic_set(&passive_cpu_wait[i], 0);
+                        atomic_dec(&passive_cpu_wait[i]);
                /*
                 * Wait till all the CPUs have quit
                 * from the debugger.
@@ -1524,22 +1504,70 @@ acquirelock:
        }
 kgdb_restore:
+        if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
+                int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
+                if (kgdb_info[sstep_cpu].task)
+                        kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
+                else
+                        kgdb_sstep_pid = 0;
+        }
+        if (trace_on)
+                tracing_on();
        /* Free kgdb_active */
        atomic_set(&kgdb_active, -1);
-        touch_softlockup_watchdog();
+        touch_softlockup_watchdog_sync();
        clocksource_touch_watchdog();
        local_irq_restore(flags);
        return error;
 }
+/*
+ * kgdb_handle_exception() - main entry point from a kernel exception
+ *
+ * Locking hierarchy:
+ *      interface locks, if any (begin_session)
+ *      kgdb lock (kgdb_active)
+ */
+int
+kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
+{
+        struct kgdb_state kgdb_var;
+        struct kgdb_state *ks = &kgdb_var;
+        int ret;
+        ks->cpu                 = raw_smp_processor_id();
+        ks->ex_vector           = evector;
+        ks->signo               = signo;
+        ks->ex_vector           = evector;
+        ks->err_code            = ecode;
+        ks->kgdb_usethreadid    = 0;
+        ks->linux_regs          = regs;
+        if (kgdb_reenter_check(ks))
+                return 0; /* Ouch, double exception ! */
+        kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
+        ret = kgdb_cpu_enter(ks, regs);
+        kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER;
+        return ret;
+}
 int kgdb_nmicallback(int cpu, void *regs)
 {
 #ifdef CONFIG_SMP
+        struct kgdb_state kgdb_var;
+        struct kgdb_state *ks = &kgdb_var;
+        memset(ks, 0, sizeof(struct kgdb_state));
+        ks->cpu                 = cpu;
+        ks->linux_regs          = regs;
        if (!atomic_read(&cpu_in_kgdb[cpu]) &&
-                        atomic_read(&kgdb_active) != cpu &&
+            atomic_read(&kgdb_active) != -1 &&
-                        atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) {
+            atomic_read(&kgdb_active) != cpu) {
-                kgdb_wait((struct pt_regs *)regs);
+                kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
+                kgdb_cpu_enter(ks, regs);
+                kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
                return 0;
        }
 #endif
@@ -1715,11 +1743,11 @@ EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
 */
 void kgdb_breakpoint(void)
 {
-        atomic_set(&kgdb_setting_breakpoint, 1);
+        atomic_inc(&kgdb_setting_breakpoint);
        wmb(); /* Sync point before breakpoint */
        arch_kgdb_breakpoint();
        wmb(); /* Sync point after breakpoint */
-        atomic_set(&kgdb_setting_breakpoint, 0);
+        atomic_dec(&kgdb_setting_breakpoint);
 }
 EXPORT_SYMBOL_GPL(kgdb_breakpoint);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9fcb53a11f87..bf0e231d9702 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -80,16 +80,16 @@ int __request_module(bool wait, const char *fmt, ...)
 #define MAX_KMOD_CONCURRENT 50  /* Completely arbitrary value - KAO */
        static int kmod_loop_msg;
-        ret = security_kernel_module_request();
-        if (ret)
-                return ret;
        va_start(args, fmt);
        ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
        va_end(args);
        if (ret >= MODULE_NAME_LEN)
                return -ENAMETOOLONG;
+        ret = security_kernel_module_request(module_name);
+        if (ret)
+                return ret;
        /* If modprobe needs a service that is in a module, we get a recursive
         * loop.  Limit the number of running kmod threads to max_threads/2 or
         * MAX_KMOD_CONCURRENT, whichever is the smaller.  A cleaner method
@@ -520,13 +520,15 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
                return -ENOMEM;
        ret = call_usermodehelper_stdinpipe(sub_info, filp);
-        if (ret < 0)
+        if (ret < 0) {
-                goto out;
+                call_usermodehelper_freeinfo(sub_info);
+                return ret;
+        }
-        return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
+        ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
+        if (ret < 0)    /* Failed to execute helper, close pipe */
+                filp_close(*filp, NULL);
-  out:
-        call_usermodehelper_freeinfo(sub_info);
        return ret;
 }
 EXPORT_SYMBOL(call_usermodehelper_pipe);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5240d75f4c60..0ed46f3e51e9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -42,8 +42,11 @@
 #include <linux/freezer.h>
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
+#include <linux/sysctl.h>
 #include <linux/kdebug.h>
 #include <linux/memory.h>
+#include <linux/ftrace.h>
+#include <linux/cpu.h>
 #include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
@@ -90,6 +93,10 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
 */
 static struct kprobe_blackpoint kprobe_blacklist[] = {
        {"preempt_schedule",},
+        {"native_get_debugreg",},
+        {"irq_entries_start",},
+        {"common_interrupt",},
+        {"mcount",},    /* mcount can be called from everywhere */
        {NULL}    /* Terminator */
 };
@@ -100,81 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
 * stepping on the instruction on a vmalloced/kmalloced/data page
 * is a recipe for disaster
 */
-#define INSNS_PER_PAGE  (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
 struct kprobe_insn_page {
        struct list_head list;
        kprobe_opcode_t *insns;         /* Page of instruction slots */
-        char slot_used[INSNS_PER_PAGE];
        int nused;
        int ngarbage;
+        char slot_used[];
+};
+#define KPROBE_INSN_PAGE_SIZE(slots)                    \
+        (offsetof(struct kprobe_insn_page, slot_used) + \
+         (sizeof(char) * (slots)))
+struct kprobe_insn_cache {
+        struct list_head pages; /* list of kprobe_insn_page */
+        size_t insn_size;       /* size of instruction slot */
+        int nr_garbage;
 };
+static int slots_per_page(struct kprobe_insn_cache *c)
+{
+        return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
+}
 enum kprobe_slot_state {
        SLOT_CLEAN = 0,
        SLOT_DIRTY = 1,
        SLOT_USED = 2,
 };
-static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
+static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */
-static LIST_HEAD(kprobe_insn_pages);
+static struct kprobe_insn_cache kprobe_insn_slots = {
-static int kprobe_garbage_slots;
+        .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
-static int collect_garbage_slots(void);
+        .insn_size = MAX_INSN_SIZE,
+        .nr_garbage = 0,
-static int __kprobes check_safety(void)
+};
-{
+static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
-        int ret = 0;
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
-        ret = freeze_processes();
-        if (ret == 0) {
-                struct task_struct *p, *q;
-                do_each_thread(p, q) {
-                        if (p != current && p->state == TASK_RUNNING &&
-                            p->pid != 0) {
-                                printk("Check failed: %s is running\n",p->comm);
-                                ret = -1;
-                                goto loop_end;
-                        }
-                } while_each_thread(p, q);
-        }
-loop_end:
-        thaw_processes();
-#else
-        synchronize_sched();
-#endif
-        return ret;
-}
 /**
 * __get_insn_slot() - Find a slot on an executable page for an instruction.
 * We allocate an executable page if there's no room on existing ones.
 */
-static kprobe_opcode_t __kprobes *__get_insn_slot(void)
+static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
 {
        struct kprobe_insn_page *kip;
 retry:
-        list_for_each_entry(kip, &kprobe_insn_pages, list) {
+        list_for_each_entry(kip, &c->pages, list) {
-                if (kip->nused < INSNS_PER_PAGE) {
+                if (kip->nused < slots_per_page(c)) {
                        int i;
-                        for (i = 0; i < INSNS_PER_PAGE; i++) {
+                        for (i = 0; i < slots_per_page(c); i++) {
                                if (kip->slot_used[i] == SLOT_CLEAN) {
                                        kip->slot_used[i] = SLOT_USED;
                                        kip->nused++;
-                                        return kip->insns + (i * MAX_INSN_SIZE);
+                                        return kip->insns + (i * c->insn_size);
                                }
                        }
-                        /* Surprise!  No unused slots.  Fix kip->nused. */
+                        /* kip->nused is broken. Fix it. */
-                        kip->nused = INSNS_PER_PAGE;
+                        kip->nused = slots_per_page(c);
+                        WARN_ON(1);
                }
        }
        /* If there are any garbage slots, collect it and try again. */
-        if (kprobe_garbage_slots && collect_garbage_slots() == 0) {
+        if (c->nr_garbage && collect_garbage_slots(c) == 0)
                goto retry;
-        }
-        /* All out of space.  Need to allocate a new page. Use slot 0. */
+        /* All out of space.  Need to allocate a new page. */
-        kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
+        kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
        if (!kip)
                return NULL;
@@ -189,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
                return NULL;
        }
        INIT_LIST_HEAD(&kip->list);
-        list_add(&kip->list, &kprobe_insn_pages);
+        memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
-        memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
        kip->slot_used[0] = SLOT_USED;
        kip->nused = 1;
        kip->ngarbage = 0;
+        list_add(&kip->list, &c->pages);
        return kip->insns;
 }
 kprobe_opcode_t __kprobes *get_insn_slot(void)
 {
-        kprobe_opcode_t *ret;
+        kprobe_opcode_t *ret = NULL;
        mutex_lock(&kprobe_insn_mutex);
-        ret = __get_insn_slot();
+        ret = __get_insn_slot(&kprobe_insn_slots);
        mutex_unlock(&kprobe_insn_mutex);
        return ret;
 }
@@ -218,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
                 * so as not to have to set it up again the
                 * next time somebody inserts a probe.
                 */
-                if (!list_is_singular(&kprobe_insn_pages)) {
+                if (!list_is_singular(&kip->list)) {
                        list_del(&kip->list);
                        module_free(NULL, kip->insns);
                        kfree(kip);
@@ -228,52 +231,85 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
        return 0;
 }
-static int __kprobes collect_garbage_slots(void)
+static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
 {
        struct kprobe_insn_page *kip, *next;
-        /* Ensure no-one is preepmted on the garbages */
+        /* Ensure no-one is interrupted on the garbages */
-        if (check_safety())
+        synchronize_sched();
-                return -EAGAIN;
-        list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
+        list_for_each_entry_safe(kip, next, &c->pages, list) {
                int i;
                if (kip->ngarbage == 0)
                        continue;
                kip->ngarbage = 0;      /* we will collect all garbages */
-                for (i = 0; i < INSNS_PER_PAGE; i++) {
+                for (i = 0; i < slots_per_page(c); i++) {
                        if (kip->slot_used[i] == SLOT_DIRTY &&
                            collect_one_slot(kip, i))
                                break;
                }
        }
-        kprobe_garbage_slots = 0;
+        c->nr_garbage = 0;
        return 0;
 }
-void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
+static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
+                                       kprobe_opcode_t *slot, int dirty)
 {
        struct kprobe_insn_page *kip;
-        mutex_lock(&kprobe_insn_mutex);
+        list_for_each_entry(kip, &c->pages, list) {
-        list_for_each_entry(kip, &kprobe_insn_pages, list) {
+                long idx = ((long)slot - (long)kip->insns) /
-                if (kip->insns <= slot &&
+                                (c->insn_size * sizeof(kprobe_opcode_t));
-                    slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
+                if (idx >= 0 && idx < slots_per_page(c)) {
-                        int i = (slot - kip->insns) / MAX_INSN_SIZE;
+                        WARN_ON(kip->slot_used[idx] != SLOT_USED);
                        if (dirty) {
-                                kip->slot_used[i] = SLOT_DIRTY;
+                                kip->slot_used[idx] = SLOT_DIRTY;
                                kip->ngarbage++;
+                                if (++c->nr_garbage > slots_per_page(c))
+                                        collect_garbage_slots(c);
                        } else
-                                collect_one_slot(kip, i);
+                                collect_one_slot(kip, idx);
-                        break;
+                        return;
                }
        }
+        /* Could not free this slot. */
+        WARN_ON(1);
+}
-        if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE)
+void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
-                collect_garbage_slots();
+{
+        mutex_lock(&kprobe_insn_mutex);
+        __free_insn_slot(&kprobe_insn_slots, slot, dirty);
        mutex_unlock(&kprobe_insn_mutex);
 }
+#ifdef CONFIG_OPTPROBES
+/* For optimized_kprobe buffer */
+static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */
+static struct kprobe_insn_cache kprobe_optinsn_slots = {
+        .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
+        /* .insn_size is initialized later */
+        .nr_garbage = 0,
+};
+/* Get a slot for optimized_kprobe buffer */
+kprobe_opcode_t __kprobes *get_optinsn_slot(void)
+{
+        kprobe_opcode_t *ret = NULL;
+        mutex_lock(&kprobe_optinsn_mutex);
+        ret = __get_insn_slot(&kprobe_optinsn_slots);
+        mutex_unlock(&kprobe_optinsn_mutex);
+        return ret;
+}
+void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
+{
+        mutex_lock(&kprobe_optinsn_mutex);
+        __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
+        mutex_unlock(&kprobe_optinsn_mutex);
+}
+#endif
 #endif
 /* We have preemption disabled.. so it is safe to use __ versions */
@@ -304,23 +340,401 @@ struct kprobe __kprobes *get_kprobe(void *addr)
                if (p->addr == addr)
                        return p;
        }
        return NULL;
 }
+static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
+/* Return true if the kprobe is an aggregator */
+static inline int kprobe_aggrprobe(struct kprobe *p)
+{
+        return p->pre_handler == aggr_pre_handler;
+}
+/*
+ * Keep all fields in the kprobe consistent
+ */
+static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
+{
+        memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
+        memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
+}
+#ifdef CONFIG_OPTPROBES
+/* NOTE: change this value only with kprobe_mutex held */
+static bool kprobes_allow_optimization;
+/*
+ * Call all pre_handlers on the list, but ignore their return values.
+ * This must be called from arch-dep optimized caller.
+ */
+void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+        struct kprobe *kp;
+        list_for_each_entry_rcu(kp, &p->list, list) {
+                if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
+                        set_kprobe_instance(kp);
+                        kp->pre_handler(kp, regs);
+                }
+                reset_kprobe_instance();
+        }
+}
+/* Return true(!0) if the kprobe is ready for optimization. */
+static inline int kprobe_optready(struct kprobe *p)
+{
+        struct optimized_kprobe *op;
+        if (kprobe_aggrprobe(p)) {
+                op = container_of(p, struct optimized_kprobe, kp);
+                return arch_prepared_optinsn(&op->optinsn);
+        }
+        return 0;
+}
+/*
+ * Return an optimized kprobe whose optimizing code replaces
+ * instructions including addr (exclude breakpoint).
+ */
+struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
+{
+        int i;
+        struct kprobe *p = NULL;
+        struct optimized_kprobe *op;
+        /* Don't check i == 0, since that is a breakpoint case. */
+        for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
+                p = get_kprobe((void *)(addr - i));
+        if (p && kprobe_optready(p)) {
+                op = container_of(p, struct optimized_kprobe, kp);
+                if (arch_within_optimized_kprobe(op, addr))
+                        return p;
+        }
+        return NULL;
+}
+/* Optimization staging list, protected by kprobe_mutex */
+static LIST_HEAD(optimizing_list);
+static void kprobe_optimizer(struct work_struct *work);
+static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
+#define OPTIMIZE_DELAY 5
+/* Kprobe jump optimizer */
+static __kprobes void kprobe_optimizer(struct work_struct *work)
+{
+        struct optimized_kprobe *op, *tmp;
+        /* Lock modules while optimizing kprobes */
+        mutex_lock(&module_mutex);
+        mutex_lock(&kprobe_mutex);
+        if (kprobes_all_disarmed || !kprobes_allow_optimization)
+                goto end;
+        /*
+         * Wait for a quiescence period to ensure all running interrupts
+         * are done. Because an optprobe may modify multiple instructions,
+         * there is a chance that the Nth instruction is interrupted. In that
+         * case, the running interrupt can return to the 2nd-Nth byte of the
+         * jump instruction. This wait avoids that.
+         */
+        synchronize_sched();
+        /*
+         * The optimization/unoptimization refers to online_cpus via
+         * stop_machine(), and cpu-hotplug modifies online_cpus.
+         * At the same time, text_mutex is held both in cpu-hotplug and here.
+         * This combination can cause a deadlock (cpu-hotplug tries to lock
+         * text_mutex, but stop_machine() cannot complete because online_cpus
+         * has been changed).
+         * To avoid this deadlock, we need to call get_online_cpus()
+         * to prevent cpu-hotplug outside of the text_mutex locking.
+         */
+        get_online_cpus();
+        mutex_lock(&text_mutex);
+        list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
+                WARN_ON(kprobe_disabled(&op->kp));
+                if (arch_optimize_kprobe(op) < 0)
+                        op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+                list_del_init(&op->list);
+        }
+        mutex_unlock(&text_mutex);
+        put_online_cpus();
+end:
+        mutex_unlock(&kprobe_mutex);
+        mutex_unlock(&module_mutex);
+}
+/* Optimize kprobe if p is ready to be optimized */
+static __kprobes void optimize_kprobe(struct kprobe *p)
+{
+        struct optimized_kprobe *op;
+        /* Check if the kprobe is disabled or not ready for optimization. */
+        if (!kprobe_optready(p) || !kprobes_allow_optimization ||
+            (kprobe_disabled(p) || kprobes_all_disarmed))
+                return;
+        /* Neither break_handler nor post_handler is supported. */
+        if (p->break_handler || p->post_handler)
+                return;
+        op = container_of(p, struct optimized_kprobe, kp);
+        /* Check that there are no other kprobes at the optimized instructions */
+        if (arch_check_optimized_kprobe(op) < 0)
+                return;
+        /* Check if it is already optimized. */
+        if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
+                return;
+        op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
+        list_add(&op->list, &optimizing_list);
+        if (!delayed_work_pending(&optimizing_work))
+                schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+}
+/* Unoptimize a kprobe if p is optimized */
+static __kprobes void unoptimize_kprobe(struct kprobe *p)
+{
+        struct optimized_kprobe *op;
+        if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
+                op = container_of(p, struct optimized_kprobe, kp);
+                if (!list_empty(&op->list))
+                        /* Dequeue from the optimization queue */
+                        list_del_init(&op->list);
+                else
+                        /* Replace jump with break */
+                        arch_unoptimize_kprobe(op);
+                op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+        }
+}
+/* Remove optimized instructions */
+static void __kprobes kill_optimized_kprobe(struct kprobe *p)
+{
+        struct optimized_kprobe *op;
+        op = container_of(p, struct optimized_kprobe, kp);
+        if (!list_empty(&op->list)) {
+                /* Dequeue from the optimization queue */
+                list_del_init(&op->list);
+                op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+        }
+        /* Don't unoptimize, because the target code will be freed. */
+        arch_remove_optimized_kprobe(op);
+}
+/* Try to prepare optimized instructions */
+static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
+{
+        struct optimized_kprobe *op;
+        op = container_of(p, struct optimized_kprobe, kp);
+        arch_prepare_optimized_kprobe(op);
+}
+/* Free optimized instructions and optimized_kprobe */
+static __kprobes void free_aggr_kprobe(struct kprobe *p)
+{
+        struct optimized_kprobe *op;
+        op = container_of(p, struct optimized_kprobe, kp);
+        arch_remove_optimized_kprobe(op);
+        kfree(op);
+}
+/* Allocate new optimized_kprobe and try to prepare optimized instructions */
+static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
+{
+        struct optimized_kprobe *op;
+        op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
+        if (!op)
+                return NULL;
+        INIT_LIST_HEAD(&op->list);
+        op->kp.addr = p->addr;
+        arch_prepare_optimized_kprobe(op);
+        return &op->kp;
+}
+static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
+/*
+ * Prepare an optimized_kprobe and optimize it
+ * NOTE: p must be a normal registered kprobe
+ */
+static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
+{
+        struct kprobe *ap;
+        struct optimized_kprobe *op;
+        ap = alloc_aggr_kprobe(p);
+        if (!ap)
+                return;
+        op = container_of(ap, struct optimized_kprobe, kp);
+        if (!arch_prepared_optinsn(&op->optinsn)) {
+                /* If failed to setup optimizing, fallback to kprobe */
+                free_aggr_kprobe(ap);
+                return;
+        }
+        init_aggr_kprobe(ap, p);
+        optimize_kprobe(ap);
+}
+#ifdef CONFIG_SYSCTL
+static void __kprobes optimize_all_kprobes(void)
+{
+        struct hlist_head *head;
+        struct hlist_node *node;
+        struct kprobe *p;
+        unsigned int i;
+        /* If optimization is already allowed, just return */
+        if (kprobes_allow_optimization)
+                return;
+        kprobes_allow_optimization = true;
+        mutex_lock(&text_mutex);
+        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+                head = &kprobe_table[i];
+                hlist_for_each_entry_rcu(p, node, head, hlist)
+                        if (!kprobe_disabled(p))
+                                optimize_kprobe(p);
+        }
+        mutex_unlock(&text_mutex);
+        printk(KERN_INFO "Kprobes globally optimized\n");
+}
+static void __kprobes unoptimize_all_kprobes(void)
+{
+        struct hlist_head *head;
+        struct hlist_node *node;
+        struct kprobe *p;
+        unsigned int i;
+        /* If optimization is already prohibited, just return */
+        if (!kprobes_allow_optimization)
+                return;
+        kprobes_allow_optimization = false;
+        printk(KERN_INFO "Kprobes globally unoptimized\n");
+        get_online_cpus();      /* For avoiding text_mutex deadlock */
+        mutex_lock(&text_mutex);
+        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+                head = &kprobe_table[i];
+                hlist_for_each_entry_rcu(p, node, head, hlist) {
+                        if (!kprobe_disabled(p))
+                                unoptimize_kprobe(p);
+                }
+        }
+        mutex_unlock(&text_mutex);
+        put_online_cpus();
+        /* Allow all currently running kprobes to complete */
+        synchronize_sched();
+}
+int sysctl_kprobes_optimization;
+int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
+                                      void __user *buffer, size_t *length,
+                                      loff_t *ppos)
+{
+        int ret;
+        mutex_lock(&kprobe_mutex);
+        sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
+        ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+        if (sysctl_kprobes_optimization)
+                optimize_all_kprobes();
+        else
+                unoptimize_all_kprobes();
+        mutex_unlock(&kprobe_mutex);
+        return ret;
+}
+#endif /* CONFIG_SYSCTL */
+static void __kprobes __arm_kprobe(struct kprobe *p)
+{
+        struct kprobe *old_p;
+        /* Check collision with other optimized kprobes */
+        old_p = get_optimized_kprobe((unsigned long)p->addr);
+        if (unlikely(old_p))
+                unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
+        arch_arm_kprobe(p);
+        optimize_kprobe(p);     /* Try to optimize (add kprobe to a list) */
+}
+static void __kprobes __disarm_kprobe(struct kprobe *p)
+{
+        struct kprobe *old_p;
+        unoptimize_kprobe(p);   /* Try to unoptimize */
+        arch_disarm_kprobe(p);
+        /* If another kprobe was blocked, optimize it. */
+        old_p = get_optimized_kprobe((unsigned long)p->addr);
+        if (unlikely(old_p))
+                optimize_kprobe(old_p);
+}
+#else /* !CONFIG_OPTPROBES */
+#define optimize_kprobe(p)                      do {} while (0)
+#define unoptimize_kprobe(p)                    do {} while (0)
+#define kill_optimized_kprobe(p)                do {} while (0)
+#define prepare_optimized_kprobe(p)             do {} while (0)
+#define try_to_optimize_kprobe(p)               do {} while (0)
+#define __arm_kprobe(p)                         arch_arm_kprobe(p)
+#define __disarm_kprobe(p)                      arch_disarm_kprobe(p)
+static __kprobes void free_aggr_kprobe(struct kprobe *p)
+{
+        kfree(p);
+}
+static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
+{
+        return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
+}
+#endif /* CONFIG_OPTPROBES */
 /* Arm a kprobe with text_mutex */
 static void __kprobes arm_kprobe(struct kprobe *kp)
 {
+        /*
+         * Here, since __arm_kprobe() doesn't use stop_machine(),
+         * this doesn't cause deadlock on text_mutex. So, we don't
+         * need get_online_cpus().
+         */
        mutex_lock(&text_mutex);
-        arch_arm_kprobe(kp);
+        __arm_kprobe(kp);
        mutex_unlock(&text_mutex);
 }
 /* Disarm a kprobe with text_mutex */
 static void __kprobes disarm_kprobe(struct kprobe *kp)
 {
+        get_online_cpus();      /* For avoiding text_mutex deadlock */
        mutex_lock(&text_mutex);
-        arch_disarm_kprobe(kp);
+        __disarm_kprobe(kp);
        mutex_unlock(&text_mutex);
+        put_online_cpus();
 }
 /*
@@ -389,7 +803,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
 void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
 {
        struct kprobe *kp;
-        if (p->pre_handler != aggr_pre_handler) {
+        if (!kprobe_aggrprobe(p)) {
                p->nmissed++;
        } else {
                list_for_each_entry_rcu(kp, &p->list, list)
@@ -513,21 +927,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
 }
 /*
- * Keep all fields in the kprobe consistent
- */
-static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
-{
-        memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
-        memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
-}
-/*
 * Add the new probe to ap->list. Fail if this is the
 * second jprobe at the address - two jprobes can't coexist
 */
 static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
 {
        BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
+        if (p->break_handler || p->post_handler)
+                unoptimize_kprobe(ap);  /* Fall back to normal kprobe */
        if (p->break_handler) {
                if (ap->break_handler)
                        return -EEXIST;
@@ -542,7 +951,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
                ap->flags &= ~KPROBE_FLAG_DISABLED;
                if (!kprobes_all_disarmed)
                        /* Arm the breakpoint again. */
-                        arm_kprobe(ap);
+                        __arm_kprobe(ap);
        }
        return 0;
 }
@@ -551,12 +960,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
 * Fill in the required fields of the "manager kprobe". Replace the
 * earlier kprobe in the hlist with the manager kprobe
 */
-static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
+static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
 {
+        /* Copy p's insn slot to ap */
        copy_kprobe(p, ap);
        flush_insn_slot(ap);
        ap->addr = p->addr;
-        ap->flags = p->flags;
+        ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
        ap->pre_handler = aggr_pre_handler;
        ap->fault_handler = aggr_fault_handler;
        /* We don't care the kprobe which has gone. */
@@ -566,8 +976,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
                ap->break_handler = aggr_break_handler;
        INIT_LIST_HEAD(&ap->list);
-        list_add_rcu(&p->list, &ap->list);
+        INIT_HLIST_NODE(&ap->hlist);
+        list_add_rcu(&p->list, &ap->list);
        hlist_replace_rcu(&p->hlist, &ap->hlist);
 }
@@ -581,12 +992,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
        int ret = 0;
        struct kprobe *ap = old_p;
-        if (old_p->pre_handler != aggr_pre_handler) {
+        if (!kprobe_aggrprobe(old_p)) {
-                /* If old_p is not an aggr_probe, create new aggr_kprobe. */
+                /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
-                ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
+                ap = alloc_aggr_kprobe(old_p);
                if (!ap)
                        return -ENOMEM;
-                add_aggr_kprobe(ap, old_p);
+                init_aggr_kprobe(ap, old_p);
        }
        if (kprobe_gone(ap)) {
@@ -605,6 +1016,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
                         */
                        return ret;
+                /* Prepare optimized instructions if possible. */
+                prepare_optimized_kprobe(ap);
                /*
                 * Clear gone flag to prevent allocating new slot again, and
                 * set disabled flag because it is not armed yet.
@@ -613,6 +1027,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
                            | KPROBE_FLAG_DISABLED;
        }
+        /* Copy ap's insn slot to p */
        copy_kprobe(ap, p);
        return add_new_kprobe(ap, p);
 }
@@ -673,6 +1088,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
        return (kprobe_opcode_t *)(((char *)addr) + p->offset);
 }
+/*
+ * Check that the passed kprobe is valid and return the kprobe found in
+ * kprobe_table for its address.
+ *
+ * Looks up p->addr with get_kprobe().  Returns that entry when it is @p
+ * itself or when @p is found on the entry's ->list; returns NULL when no
+ * probe exists at p->addr or @p is not among them.
+ */
+static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
+{
+        struct kprobe *old_p, *list_p;
+        old_p = get_kprobe(p->addr);
+        if (unlikely(!old_p))
+                return NULL;
+        if (p != old_p) {
+                /* @p is not the table entry: search the entry's list. */
+                list_for_each_entry_rcu(list_p, &old_p->list, list)
+                        if (list_p == p)
+                        /* kprobe p is a valid probe */
+                                goto valid;
+                return NULL;
+        }
+valid:
+        return old_p;
+}
+/*
+ * Return error if the kprobe is being re-registered.
+ *
+ * Takes kprobe_mutex around the lookup.  Returns -EINVAL when
+ * __get_valid_kprobe() finds @p, and 0 otherwise.
+ */
+static inline int check_kprobe_rereg(struct kprobe *p)
+{
+        int ret = 0;
+        struct kprobe *old_p;
+        mutex_lock(&kprobe_mutex);
+        old_p = __get_valid_kprobe(p);
+        if (old_p)
+                ret = -EINVAL;
+        mutex_unlock(&kprobe_mutex);
+        return ret;
+}
 int __kprobes register_kprobe(struct kprobe *p)
 {
        int ret = 0;
@@ -685,9 +1134,14 @@ int __kprobes register_kprobe(struct kprobe *p)
                return -EINVAL;
        p->addr = addr;
+        ret = check_kprobe_rereg(p);
+        if (ret)
+                return ret;
        preempt_disable();
        if (!kernel_text_address((unsigned long) p->addr) ||
-            in_kprobes_functions((unsigned long) p->addr)) {
+            in_kprobes_functions((unsigned long) p->addr) ||
+            ftrace_text_reserved(p->addr, p->addr)) {
                preempt_enable();
                return -EINVAL;
        }
@@ -724,27 +1178,34 @@ int __kprobes register_kprobe(struct kprobe *p)
        p->nmissed = 0;
        INIT_LIST_HEAD(&p->list);
        mutex_lock(&kprobe_mutex);
+        get_online_cpus();      /* For avoiding text_mutex deadlock. */
+        mutex_lock(&text_mutex);
        old_p = get_kprobe(p->addr);
        if (old_p) {
+                /* text_mutex is held because this may unoptimize old_p. */
                ret = register_aggr_kprobe(old_p, p);
                goto out;
        }
-        mutex_lock(&text_mutex);
        ret = arch_prepare_kprobe(p);
        if (ret)
-                goto out_unlock_text;
+                goto out;
        INIT_HLIST_NODE(&p->hlist);
        hlist_add_head_rcu(&p->hlist,
                       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
        if (!kprobes_all_disarmed && !kprobe_disabled(p))
-                arch_arm_kprobe(p);
+                __arm_kprobe(p);
+        /* Try to optimize kprobe */
+        try_to_optimize_kprobe(p);
-out_unlock_text:
-        mutex_unlock(&text_mutex);
 out:
+        mutex_unlock(&text_mutex);
+        put_online_cpus();
        mutex_unlock(&kprobe_mutex);
        if (probed_mod)
@@ -754,26 +1215,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(register_kprobe);
-/* Check passed kprobe is valid and return kprobe in kprobe_table. */
-static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
-{
-        struct kprobe *old_p, *list_p;
-        old_p = get_kprobe(p->addr);
-        if (unlikely(!old_p))
-                return NULL;
-        if (p != old_p) {
-                list_for_each_entry_rcu(list_p, &old_p->list, list)
-                        if (list_p == p)
-                        /* kprobe p is a valid probe */
-                                goto valid;
-                return NULL;
-        }
-valid:
-        return old_p;
-}
 /*
 * Unregister a kprobe without a scheduler synchronization.
 */
@@ -786,7 +1227,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
                return -EINVAL;
        if (old_p == p ||
-            (old_p->pre_handler == aggr_pre_handler &&
+            (kprobe_aggrprobe(old_p) &&
             list_is_singular(&old_p->list))) {
                /*
                 * Only probe on the hash list. Disarm only if kprobes are
@@ -794,7 +1235,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
                 * already have been removed. We save on flushing icache.
                 */
                if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
-                        disarm_kprobe(p);
+                        disarm_kprobe(old_p);
                hlist_del_rcu(&old_p->hlist);
        } else {
                if (p->break_handler && !kprobe_gone(p))
@@ -810,8 +1251,13 @@ noclean:
                list_del_rcu(&p->list);
                if (!kprobe_disabled(old_p)) {
                        try_to_disable_aggr_kprobe(old_p);
-                        if (!kprobes_all_disarmed && kprobe_disabled(old_p))
+                        if (!kprobes_all_disarmed) {
-                                disarm_kprobe(old_p);
+                                if (kprobe_disabled(old_p))
+                                        disarm_kprobe(old_p);
+                                else
+                                        /* Try to optimize this probe again */
+                                        optimize_kprobe(old_p);
+                        }
                }
        }
        return 0;
@@ -828,7 +1274,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
                old_p = list_entry(p->list.next, struct kprobe, list);
                list_del(&p->list);
                arch_remove_kprobe(old_p);
-                kfree(old_p);
+                free_aggr_kprobe(old_p);
        }
 }
@@ -1014,9 +1460,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
        /* Pre-allocate memory for max kretprobe instances */
        if (rp->maxactive <= 0) {
 #ifdef CONFIG_PREEMPT
-                rp->maxactive = max(10, 2 * NR_CPUS);
+                rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
 #else
-                rp->maxactive = NR_CPUS;
+                rp->maxactive = num_possible_cpus();
 #endif
        }
        spin_lock_init(&rp->lock);
@@ -1124,7 +1570,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
        struct kprobe *kp;
        p->flags |= KPROBE_FLAG_GONE;
-        if (p->pre_handler == aggr_pre_handler) {
+        if (kprobe_aggrprobe(p)) {
                /*
                 * If this is an aggr_kprobe, we have to list all the
                 * chained probes and mark them GONE.
@@ -1133,6 +1579,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
                        kp->flags |= KPROBE_FLAG_GONE;
                p->post_handler = NULL;
                p->break_handler = NULL;
+                kill_optimized_kprobe(p);
        }
        /*
         * Here, we can remove insn_slot safely, because no thread calls
@@ -1141,6 +1588,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
        arch_remove_kprobe(p);
 }
+/*
+ * Print a short description of @kp at KERN_WARNING level: its
+ * symbol_name, addr and offset fields.
+ */
+void __kprobes dump_kprobe(struct kprobe *kp)
+{
+        printk(KERN_WARNING "Dumping kprobe:\n");
+        printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
+               kp->symbol_name, kp->addr, kp->offset);
+}
 /* Module notifier call back, checking kprobes on the module */
 static int __kprobes kprobes_module_callback(struct notifier_block *nb,
                                             unsigned long val, void *data)
@@ -1235,6 +1689,15 @@ static int __init init_kprobes(void)
                }
        }
+#if defined(CONFIG_OPTPROBES)
+#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
+        /* Init kprobe_optinsn_slots */
+        kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
+#endif
+        /* By default, kprobes can be optimized */
+        kprobes_allow_optimization = true;
+#endif
        /* By default, kprobes are armed */
        kprobes_all_disarmed = false;
@@ -1253,7 +1716,7 @@ static int __init init_kprobes(void)
 #ifdef CONFIG_DEBUG_FS
 static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
-                const char *sym, int offset,char *modname)
+                const char *sym, int offset, char *modname, struct kprobe *pp)
 {
        char *kprobe_type;
@@ -1263,19 +1726,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
                kprobe_type = "j";
        else
                kprobe_type = "k";
        if (sym)
-                seq_printf(pi, "%p  %s  %s+0x%x  %s %s%s\n",
+                seq_printf(pi, "%p  %s  %s+0x%x  %s ",
                        p->addr, kprobe_type, sym, offset,
-                        (modname ? modname : " "),
+                        (modname ? modname : " "));
-                        (kprobe_gone(p) ? "[GONE]" : ""),
-                        ((kprobe_disabled(p) && !kprobe_gone(p)) ?
-                         "[DISABLED]" : ""));
        else
-                seq_printf(pi, "%p  %s  %p %s%s\n",
+                seq_printf(pi, "%p  %s  %p ",
-                        p->addr, kprobe_type, p->addr,
+                        p->addr, kprobe_type, p->addr);
-                        (kprobe_gone(p) ? "[GONE]" : ""),
-                        ((kprobe_disabled(p) && !kprobe_gone(p)) ?
+        if (!pp)
-                         "[DISABLED]" : ""));
+                pp = p;
+        seq_printf(pi, "%s%s%s\n",
+                (kprobe_gone(p) ? "[GONE]" : ""),
+                ((kprobe_disabled(p) && !kprobe_gone(p)) ?  "[DISABLED]" : ""),
+                (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
 }
 static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1311,11 +1776,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
        hlist_for_each_entry_rcu(p, node, head, hlist) {
                sym = kallsyms_lookup((unsigned long)p->addr, NULL,
                                        &offset, &modname, namebuf);
-                if (p->pre_handler == aggr_pre_handler) {
+                if (kprobe_aggrprobe(p)) {
                        list_for_each_entry_rcu(kp, &p->list, list)
-                                report_probe(pi, kp, sym, offset, modname);
+                                report_probe(pi, kp, sym, offset, modname, p);
                } else
-                        report_probe(pi, p, sym, offset, modname);
+                        report_probe(pi, p, sym, offset, modname, NULL);
        }
        preempt_enable();
        return 0;
@@ -1393,12 +1858,13 @@ int __kprobes enable_kprobe(struct kprobe *kp)
                goto out;
        }
-        if (!kprobes_all_disarmed && kprobe_disabled(p))
-                arm_kprobe(p);
-        p->flags &= ~KPROBE_FLAG_DISABLED;
        if (p != kp)
                kp->flags &= ~KPROBE_FLAG_DISABLED;
+        if (!kprobes_all_disarmed && kprobe_disabled(p)) {
+                p->flags &= ~KPROBE_FLAG_DISABLED;
+                arm_kprobe(p);
+        }
 out:
        mutex_unlock(&kprobe_mutex);
        return ret;
@@ -1418,12 +1884,13 @@ static void __kprobes arm_all_kprobes(void)
        if (!kprobes_all_disarmed)
                goto already_enabled;
+        /* Arming kprobes doesn't optimize kprobe itself */
        mutex_lock(&text_mutex);
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                hlist_for_each_entry_rcu(p, node, head, hlist)
                        if (!kprobe_disabled(p))
-                                arch_arm_kprobe(p);
+                                __arm_kprobe(p);
        }
        mutex_unlock(&text_mutex);
@@ -1450,16 +1917,23 @@ static void __kprobes disarm_all_kprobes(void)
        kprobes_all_disarmed = true;
        printk(KERN_INFO "Kprobes globally disabled\n");
+        /*
+         * Here we call get_online_cpus() for avoiding text_mutex deadlock,
+         * because disarming may also unoptimize kprobes.
+         */
+        get_online_cpus();
        mutex_lock(&text_mutex);
        for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                head = &kprobe_table[i];
                hlist_for_each_entry_rcu(p, node, head, hlist) {
                        if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
-                                arch_disarm_kprobe(p);
+                                __disarm_kprobe(p);
                }
        }
        mutex_unlock(&text_mutex);
+        put_online_cpus();
        mutex_unlock(&kprobe_mutex);
        /* Allow all currently running kprobes to complete */
        synchronize_sched();
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 528dd78e7e7e..21fe3c426948 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
 }
 KERNEL_ATTR_RO(uevent_seqnum);
-/* uevent helper program, used during early boo */
+/* uevent helper program, used during early boot */
 static ssize_t uevent_helper_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf)
 {
@@ -100,6 +100,26 @@ static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
 }
 KERNEL_ATTR_RO(kexec_crash_loaded);
+/*
+ * sysfs show handler for kexec_crash_size: formats the value returned by
+ * crash_get_memory_size() into @buf as "%zu\n" and returns the sprintf()
+ * result.  @kobj and @attr are unused.
+ */
+static ssize_t kexec_crash_size_show(struct kobject *kobj,
+                                       struct kobj_attribute *attr, char *buf)
+{
+        return sprintf(buf, "%zu\n", crash_get_memory_size());
+}
+/*
+ * sysfs store handler for kexec_crash_size.
+ *
+ * Parses @buf as an unsigned long with strict_strtoul() (base argument 0)
+ * and passes the value to crash_shrink_memory().  Returns -EINVAL when
+ * parsing fails, the negative result of crash_shrink_memory() when that
+ * fails, and @count otherwise.  @kobj and @attr are unused.
+ */
+static ssize_t kexec_crash_size_store(struct kobject *kobj,
+                                   struct kobj_attribute *attr,
+                                   const char *buf, size_t count)
+{
+        unsigned long cnt;
+        int ret;
+        if (strict_strtoul(buf, 0, &cnt))
+                return -EINVAL;
+        ret = crash_shrink_memory(cnt);
+        return ret < 0 ? ret : count;
+}
+KERNEL_ATTR_RW(kexec_crash_size);
 static ssize_t vmcoreinfo_show(struct kobject *kobj,
                               struct kobj_attribute *attr, char *buf)
 {
@@ -147,6 +167,7 @@ static struct attribute * kernel_attrs[] = {
 #ifdef CONFIG_KEXEC
        &kexec_loaded_attr.attr,
        &kexec_crash_loaded_attr.attr,
+        &kexec_crash_size_attr.attr,
        &vmcoreinfo_attr.attr,
 #endif
        NULL
@@ -176,16 +197,8 @@ static int __init ksysfs_init(void)
                        goto group_exit;
        }
-        /* create the /sys/kernel/uids/ directory */
-        error = uids_sysfs_init();
-        if (error)
-                goto notes_exit;
        return 0;
-notes_exit:
-        if (notes_size > 0)
-                sysfs_remove_bin_file(kernel_kobj, &notes_attr);
 group_exit:
        sysfs_remove_group(kernel_kobj, &kernel_attr_group);
 kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ab7ae57773e1..83911c780175 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
 *
 * Description: This helper function creates and names a kernel
 * thread.  The thread will be stopped: use wake_up_process() to start
- * it.  See also kthread_run(), kthread_create_on_cpu().
+ * it.  See also kthread_run().
 *
 * When woken, the thread will run @threadfn() with @data as its
 * argument. @threadfn() can either call do_exit() directly if it is a
@@ -150,6 +150,29 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 EXPORT_SYMBOL(kthread_create);
 /**
+ * kthread_bind - bind a just-created kthread to a cpu.
+ * @p: thread created by kthread_create().
+ * @cpu: cpu (might not be online, must be possible) for @p to run on.
+ *
+ * Description: This function is equivalent to set_cpus_allowed(),
+ * except that @cpu doesn't need to be online, and the thread must be
+ * stopped (i.e., just returned from kthread_create()).
+ */
+void kthread_bind(struct task_struct *p, unsigned int cpu)
+{
+        /* Must have done schedule() in kthread() before we set_task_cpu */
+        if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
+                /* A zero return is treated as failure: warn, leave @p unbound. */
+                WARN_ON(1);
+                return;
+        }
+        /* Restrict @p to @cpu only and flag the task as bound. */
+        p->cpus_allowed = cpumask_of_cpu(cpu);
+        p->rt.nr_cpus_allowed = 1;
+        p->flags |= PF_THREAD_BOUND;
+}
+EXPORT_SYMBOL(kthread_bind);
+/**
 * kthread_stop - stop a thread created by kthread_create().
 * @k: thread created by kthread_create().
 *
@@ -196,7 +219,7 @@ int kthreadd(void *unused)
        set_task_comm(tsk, "kthreadd");
        ignore_signals(tsk);
        set_cpus_allowed_ptr(tsk, cpu_all_mask);
-        set_mems_allowed(node_possible_map);
+        set_mems_allowed(node_states[N_HIGH_MEMORY]);
        current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index ca07c5c0c914..877fb306d415 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -56,7 +56,6 @@
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/list.h>
-#include <linux/slab.h>
 #include <linux/stacktrace.h>
 static DEFINE_SPINLOCK(latency_lock);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9af56723c096..2594e1ce41cb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -43,13 +43,14 @@
 #include <linux/ftrace.h>
 #include <linux/stringify.h>
 #include <linux/bitops.h>
+#include <linux/gfp.h>
 #include <asm/sections.h>
 #include "lockdep_internals.h"
 #define CREATE_TRACE_POINTS
-#include <trace/events/lockdep.h>
+#include <trace/events/lock.h>
 #ifdef CONFIG_PROVE_LOCKING
 int prove_locking = 1;
@@ -73,11 +74,11 @@ module_param(lock_stat, int, 0644);
 * to use a raw spinlock - we really dont want the spinlock
 * code to recurse back into the lockdep code...
 */
-static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
 static int graph_lock(void)
 {
-        __raw_spin_lock(&lockdep_lock);
+        arch_spin_lock(&lockdep_lock);
        /*
         * Make sure that if another CPU detected a bug while
         * walking the graph we dont change it (while the other
@@ -85,7 +86,7 @@ static int graph_lock(void)
         * dropped already)
         */
        if (!debug_locks) {
-                __raw_spin_unlock(&lockdep_lock);
+                arch_spin_unlock(&lockdep_lock);
                return 0;
        }
        /* prevent any recursions within lockdep from causing deadlocks */
@@ -95,11 +96,11 @@ static int graph_lock(void)
 static inline int graph_unlock(void)
 {
-        if (debug_locks && !__raw_spin_is_locked(&lockdep_lock))
+        if (debug_locks && !arch_spin_is_locked(&lockdep_lock))
                return DEBUG_LOCKS_WARN_ON(1);
        current->lockdep_recursion--;
-        __raw_spin_unlock(&lockdep_lock);
+        arch_spin_unlock(&lockdep_lock);
        return 0;
 }
@@ -111,7 +112,7 @@ static inline int debug_locks_off_graph_unlock(void)
 {
        int ret = debug_locks_off();
-        __raw_spin_unlock(&lockdep_lock);
+        arch_spin_unlock(&lockdep_lock);
        return ret;
 }
@@ -140,7 +141,8 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
 }
 #ifdef CONFIG_LOCK_STAT
-static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
+static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
+                      cpu_lock_stats);
 static inline u64 lockstat_clock(void)
 {
@@ -168,7 +170,7 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
        if (time > lt->max)
                lt->max = time;
-        if (time < lt->min || !lt->min)
+        if (time < lt->min || !lt->nr)
                lt->min = time;
        lt->total += time;
@@ -177,8 +179,15 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
 static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
 {
-        dst->min += src->min;
+        if (!src->nr)
-        dst->max += src->max;
+                return;
+        if (src->max > dst->max)
+                dst->max = src->max;
+        if (src->min < dst->min || !dst->nr)
+                dst->min = src->min;
        dst->total += src->total;
        dst->nr += src->nr;
 }
@@ -191,7 +200,7 @@ struct lock_class_stats lock_stats(struct lock_class *class)
        memset(&stats, 0, sizeof(struct lock_class_stats));
        for_each_possible_cpu(cpu) {
                struct lock_class_stats *pcs =
-                        &per_cpu(lock_stats, cpu)[class - lock_classes];
+                        &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
                for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
                        stats.contention_point[i] += pcs->contention_point[i];
@@ -218,7 +227,7 @@ void clear_lock_stats(struct lock_class *class)
        for_each_possible_cpu(cpu) {
                struct lock_class_stats *cpu_stats =
-                        &per_cpu(lock_stats, cpu)[class - lock_classes];
+                        &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
                memset(cpu_stats, 0, sizeof(struct lock_class_stats));
        }
@@ -228,12 +237,12 @@ void clear_lock_stats(struct lock_class *class)
 static struct lock_class_stats *get_lock_stats(struct lock_class *class)
 {
-        return &get_cpu_var(lock_stats)[class - lock_classes];
+        return &get_cpu_var(cpu_lock_stats)[class - lock_classes];
 }
 static void put_lock_stats(struct lock_class_stats *stats)
 {
-        put_cpu_var(lock_stats);
+        put_cpu_var(cpu_lock_stats);
 }
 static void lock_release_holdtime(struct held_lock *hlock)
@@ -379,7 +388,8 @@ static int save_trace(struct stack_trace *trace)
         * complete trace that maxes out the entries provided will be reported
         * as incomplete, friggin useless </rant>
         */
-        if (trace->entries[trace->nr_entries-1] == ULONG_MAX)
+        if (trace->nr_entries != 0 &&
+            trace->entries[trace->nr_entries-1] == ULONG_MAX)
                trace->nr_entries--;
        trace->max_entries = trace->nr_entries;
@@ -573,9 +583,6 @@ static int static_obj(void *obj)
        unsigned long start = (unsigned long) &_stext,
                      end   = (unsigned long) &_end,
                      addr  = (unsigned long) obj;
-#ifdef CONFIG_SMP
-        int i;
-#endif
        /*
         * static variable?
@@ -586,24 +593,16 @@ static int static_obj(void *obj)
        if (arch_is_kernel_data(addr))
                return 1;
-#ifdef CONFIG_SMP
        /*
-         * percpu var?
+         * in-kernel percpu var?
         */
-        for_each_possible_cpu(i) {
+        if (is_kernel_percpu_address(addr))
-                start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
+                return 1;
-                end   = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
-                                        + per_cpu_offset(i);
-                if ((addr >= start) && (addr < end))
-                        return 1;
-        }
-#endif
        /*
-         * module var?
+         * module static or percpu var?
         */
-        return is_module_address(addr);
+        return is_module_address(addr) || is_module_percpu_address(addr);
 }
 /*
@@ -1161,9 +1160,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
        this.class = class;
        local_irq_save(flags);
-        __raw_spin_lock(&lockdep_lock);
+        arch_spin_lock(&lockdep_lock);
        ret = __lockdep_count_forward_deps(&this);
-        __raw_spin_unlock(&lockdep_lock);
+        arch_spin_unlock(&lockdep_lock);
        local_irq_restore(flags);
        return ret;
@@ -1188,9 +1187,9 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
        this.class = class;
        local_irq_save(flags);
-        __raw_spin_lock(&lockdep_lock);
+        arch_spin_lock(&lockdep_lock);
        ret = __lockdep_count_backward_deps(&this);
-        __raw_spin_unlock(&lockdep_lock);
+        arch_spin_unlock(&lockdep_lock);
        local_irq_restore(flags);
        return ret;
@@ -2138,7 +2137,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
                return ret;
        return print_irq_inversion_bug(curr, &root, target_entry,
-                                        this, 1, irqclass);
+                                        this, 0, irqclass);
 }
 void print_irqtrace_events(struct task_struct *curr)
@@ -3202,8 +3201,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 {
        unsigned long flags;
-        trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
        if (unlikely(current->lockdep_recursion))
                return;
@@ -3211,6 +3208,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
        check_flags(flags);
        current->lockdep_recursion = 1;
+        trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
        __lock_acquire(lock, subclass, trylock, read, check,
                       irqs_disabled_flags(flags), nest_lock, ip, 0);
        current->lockdep_recursion = 0;
@@ -3223,14 +3221,13 @@ void lock_release(struct lockdep_map *lock, int nested,
 {
        unsigned long flags;
-        trace_lock_release(lock, nested, ip);
        if (unlikely(current->lockdep_recursion))
                return;
        raw_local_irq_save(flags);
        check_flags(flags);
        current->lockdep_recursion = 1;
+        trace_lock_release(lock, nested, ip);
        __lock_release(lock, nested, ip);
        current->lockdep_recursion = 0;
        raw_local_irq_restore(flags);
@@ -3404,8 +3401,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
 {
        unsigned long flags;
-        trace_lock_contended(lock, ip);
        if (unlikely(!lock_stat))
                return;
@@ -3415,6 +3410,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
        raw_local_irq_save(flags);
        check_flags(flags);
        current->lockdep_recursion = 1;
+        trace_lock_contended(lock, ip);
        __lock_contended(lock, ip);
        current->lockdep_recursion = 0;
        raw_local_irq_restore(flags);
@@ -3800,3 +3796,22 @@ void lockdep_sys_exit(void)
                lockdep_print_held_locks(curr);
        }
 }
+/*
+ * Report a suspicious rcu_dereference_check() use.
+ * @file: source file name printed in the report.
+ * @line: source line number printed in the report.
+ *
+ * Returns without printing when debug_locks_off() returns 0.  Otherwise
+ * prints a banner, the @file:@line location, the values of
+ * rcu_scheduler_active and debug_locks, the locks held by current, and a
+ * stack backtrace.
+ *
+ * NOTE(review): presumably debug_locks_off() returns nonzero only for the
+ * caller that actually switches lock debugging off, which would limit
+ * this report to a single occurrence -- confirm against its definition.
+ */
+void lockdep_rcu_dereference(const char *file, const int line)
+{
+        struct task_struct *curr = current;
+        if (!debug_locks_off())
+                return;
+        printk("\n===================================================\n");
+        printk(  "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
+        printk(  "---------------------------------------------------\n");
+        printk("%s:%d invoked rcu_dereference_check() without protection!\n",
+                        file, line);
+        printk("\nother info that might help us debug this:\n\n");
+        printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
+        lockdep_print_held_locks(curr);
+        printk("\nstack backtrace:\n");
+        dump_stack();
+}
+EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
diff --git a/kernel/module.c b/kernel/module.c
index 5842a71cf052..1016b75b026a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -370,204 +370,98 @@ EXPORT_SYMBOL_GPL(find_module);
 #ifdef CONFIG_SMP
-#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
+static inline void __percpu *mod_percpu(struct module *mod)
-static void *percpu_modalloc(unsigned long size, unsigned long align,
-                             const char *name)
 {
-        void *ptr;
+        return mod->percpu;
+}
+static int percpu_modalloc(struct module *mod,
+                           unsigned long size, unsigned long align)
+{
        if (align > PAGE_SIZE) {
                printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
-                       name, align, PAGE_SIZE);
+                       mod->name, align, PAGE_SIZE);
                align = PAGE_SIZE;
        }
-        ptr = __alloc_reserved_percpu(size, align);
+        mod->percpu = __alloc_reserved_percpu(size, align);
-        if (!ptr)
+        if (!mod->percpu) {
                printk(KERN_WARNING
                       "Could not allocate %lu bytes percpu data\n", size);
-        return ptr;
+                return -ENOMEM;
-}
-static void percpu_modfree(void *freeme)
-{
-        free_percpu(freeme);
-}
-#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
-/* Number of blocks used and allocated. */
-static unsigned int pcpu_num_used, pcpu_num_allocated;
-/* Size of each block.  -ve means used. */
-static int *pcpu_size;
-static int split_block(unsigned int i, unsigned short size)
-{
-        /* Reallocation required? */
-        if (pcpu_num_used + 1 > pcpu_num_allocated) {
-                int *new;
-                new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2,
-                               GFP_KERNEL);
-                if (!new)
-                        return 0;
-                pcpu_num_allocated *= 2;
-                pcpu_size = new;
        }
+        mod->percpu_size = size;
-        /* Insert a new subblock */
+        return 0;
-        memmove(&pcpu_size[i+1], &pcpu_size[i],
-                sizeof(pcpu_size[0]) * (pcpu_num_used - i));
-        pcpu_num_used++;
-        pcpu_size[i+1] -= size;
-        pcpu_size[i] = size;
-        return 1;
 }
-static inline unsigned int block_size(int val)
+static void percpu_modfree(struct module *mod)
 {
-        if (val < 0)
+        free_percpu(mod->percpu);
-                return -val;
-        return val;
 }
-static void *percpu_modalloc(unsigned long size, unsigned long align,
+static unsigned int find_pcpusec(Elf_Ehdr *hdr,
-                             const char *name)
+                                 Elf_Shdr *sechdrs,
+                                 const char *secstrings)
 {
-        unsigned long extra;
+        return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
-        unsigned int i;
-        void *ptr;
-        int cpu;
-        if (align > PAGE_SIZE) {
-                printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
-                       name, align, PAGE_SIZE);
-                align = PAGE_SIZE;
-        }
-        ptr = __per_cpu_start;
-        for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
-                /* Extra for alignment requirement. */
-                extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
-                BUG_ON(i == 0 && extra != 0);
-                if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size)
-                        continue;
-                /* Transfer extra to previous block. */
-                if (pcpu_size[i-1] < 0)
-                        pcpu_size[i-1] -= extra;
-                else
-                        pcpu_size[i-1] += extra;
-                pcpu_size[i] -= extra;
-                ptr += extra;
-                /* Split block if warranted */
-                if (pcpu_size[i] - size > sizeof(unsigned long))
-                        if (!split_block(i, size))
-                                return NULL;
-                /* add the per-cpu scanning areas */
-                for_each_possible_cpu(cpu)
-                        kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
-                                       GFP_KERNEL);
-                /* Mark allocated */
-                pcpu_size[i] = -pcpu_size[i];
-                return ptr;
-        }
-        printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n",
-               size);
-        return NULL;
 }
-static void percpu_modfree(void *freeme)
+static void percpu_modcopy(struct module *mod,
+                           const void *from, unsigned long size)
 {
-        unsigned int i;
-        void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
        int cpu;
-        /* First entry is core kernel percpu data. */
-        for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
-                if (ptr == freeme) {
-                        pcpu_size[i] = -pcpu_size[i];
-                        goto free;
-                }
-        }
-        BUG();
- free:
-        /* remove the per-cpu scanning areas */
        for_each_possible_cpu(cpu)
-                kmemleak_free(freeme + per_cpu_offset(cpu));
+                memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
-        /* Merge with previous? */
-        if (pcpu_size[i-1] >= 0) {
-                pcpu_size[i-1] += pcpu_size[i];
-                pcpu_num_used--;
-                memmove(&pcpu_size[i], &pcpu_size[i+1],
-                        (pcpu_num_used - i) * sizeof(pcpu_size[0]));
-                i--;
-        }
-        /* Merge with next? */
-        if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) {
-                pcpu_size[i] += pcpu_size[i+1];
-                pcpu_num_used--;
-                memmove(&pcpu_size[i+1], &pcpu_size[i+2],
-                        (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0]));
-        }
 }
-static int percpu_modinit(void)
+/**
+ * is_module_percpu_address - test whether address is from module static percpu
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to module static percpu area.
+ *
+ * RETURNS:
+ * %true if @addr is from module static percpu area
+ */
+bool is_module_percpu_address(unsigned long addr)
 {
-        pcpu_num_used = 2;
+        struct module *mod;
-        pcpu_num_allocated = 2;
+        unsigned int cpu;
-        pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
-                            GFP_KERNEL);
-        /* Static in-kernel percpu data (used). */
-        pcpu_size[0] = -(__per_cpu_end-__per_cpu_start);
-        /* Free room. */
-        pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
-        if (pcpu_size[1] < 0) {
-                printk(KERN_ERR "No per-cpu room for modules.\n");
-                pcpu_num_used = 1;
-        }
-        return 0;
-}
-__initcall(percpu_modinit);
-#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
+        preempt_disable();
-static unsigned int find_pcpusec(Elf_Ehdr *hdr,
+        list_for_each_entry_rcu(mod, &modules, list) {
-                                 Elf_Shdr *sechdrs,
+                if (!mod->percpu_size)
-                                 const char *secstrings)
+                        continue;
-{
+                for_each_possible_cpu(cpu) {
-        return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
+                        void *start = per_cpu_ptr(mod->percpu, cpu);
-}
-static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
+                        if ((void *)addr >= start &&
-{
+                            (void *)addr < start + mod->percpu_size) {
-        int cpu;
+                                preempt_enable();
+                                return true;
+                        }
+                }
+        }
-        for_each_possible_cpu(cpu)
+        preempt_enable();
-                memcpy(pcpudest + per_cpu_offset(cpu), from, size);
+        return false;
 }
 #else /* ... !CONFIG_SMP */
-static inline void *percpu_modalloc(unsigned long size, unsigned long align,
+static inline void __percpu *mod_percpu(struct module *mod)
-                                    const char *name)
 {
        return NULL;
 }
-static inline void percpu_modfree(void *pcpuptr)
+static inline int percpu_modalloc(struct module *mod,
+                                  unsigned long size, unsigned long align)
+{
+        return -ENOMEM;
+}
+static inline void percpu_modfree(struct module *mod)
 {
-        BUG();
 }
 static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
                                        Elf_Shdr *sechdrs,
@@ -575,12 +469,16 @@ static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
 {
        return 0;
 }
-static inline void percpu_modcopy(void *pcpudst, const void *src,
+static inline void percpu_modcopy(struct module *mod,
-                                  unsigned long size)
+                                  const void *from, unsigned long size)
 {
        /* pcpusec should be 0, and size of that section should be 0. */
        BUG_ON(size != 0);
 }
+/* !CONFIG_SMP variant: never reports @addr as module static percpu. */
+bool is_module_percpu_address(unsigned long addr)
+{
+        return false;
+}
 #endif /* CONFIG_SMP */
@@ -623,10 +521,13 @@ static void module_unload_init(struct module *mod)
        int cpu;
        INIT_LIST_HEAD(&mod->modules_which_use_me);
-        for_each_possible_cpu(cpu)
+        for_each_possible_cpu(cpu) {
-                local_set(__module_ref_addr(mod, cpu), 0);
+                per_cpu_ptr(mod->refptr, cpu)->incs = 0;
+                per_cpu_ptr(mod->refptr, cpu)->decs = 0;
+        }
        /* Hold reference count during initialization. */
-        local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
+        __this_cpu_write(mod->refptr->incs, 1);
        /* Backwards compatibility macros put refcount during init. */
        mod->waiter = current;
 }
@@ -765,12 +666,28 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
+/*
+ * module_refcount - current reference count of @mod
+ *
+ * Sums the per-cpu decs counters first, then (after a read barrier) the
+ * per-cpu incs counters over all possible CPUs, and returns incs - decs.
+ */
 unsigned int module_refcount(struct module *mod)
 {
-        unsigned int total = 0;
+        unsigned int incs = 0, decs = 0;
         int cpu;
         for_each_possible_cpu(cpu)
-                total += local_read(__module_ref_addr(mod, cpu));
+                decs += per_cpu_ptr(mod->refptr, cpu)->decs;
-        return total;
+        /*
+         * ensure the incs are added up after the decs.
+         * module_put ensures incs are visible before decs with smp_wmb.
+         *
+         * This 2-count scheme avoids the situation where the refcount
+         * for CPU0 is read, then CPU0 increments the module refcount,
+         * then CPU1 drops that refcount, then the refcount for CPU1 is
+         * read. We would record a decrement but not its corresponding
+         * increment so we would see a low count (disaster).
+         *
+         * Rare situation? But module_refcount can be preempted, and we
+         * might be tallying up 4096+ CPUs. So it is not impossible.
+         */
+        smp_rmb();
+        for_each_possible_cpu(cpu)
+                incs += per_cpu_ptr(mod->refptr, cpu)->incs;
+        return incs - decs;
 }
 EXPORT_SYMBOL(module_refcount);
@@ -946,14 +863,16 @@ static struct module_attribute refcnt = {
+/*
+ * module_put - drop a reference on @module (no-op when @module is NULL)
+ *
+ * With preemption disabled, bumps this CPU's decs counter, emits the
+ * module_put tracepoint, and wakes module->waiter if the module is no
+ * longer live.
+ */
 void module_put(struct module *module)
 {
         if (module) {
-                unsigned int cpu = get_cpu();
+                preempt_disable();
-                local_dec(__module_ref_addr(module, cpu));
+                smp_wmb(); /* see comment in module_refcount */
+                __this_cpu_inc(module->refptr->decs);
+                /*
+                 * NOTE(review): the value traced below is this CPU's decs
+                 * counter, not a summed reference count - confirm that is
+                 * what consumers of the tracepoint expect.
+                 */
                 trace_module_put(module, _RET_IP_,
-                                 local_read(__module_ref_addr(module, cpu)));
+                                 __this_cpu_read(module->refptr->decs));
                 /* Maybe they're waiting for us to drop reference? */
                 if (unlikely(!module_is_live(module)))
                         wake_up_process(module->waiter);
-                put_cpu();
+                preempt_enable();
         }
 }
 EXPORT_SYMBOL(module_put);
@@ -1030,11 +949,23 @@ static int try_to_force_load(struct module *mod, const char *reason)
 }
 #ifdef CONFIG_MODVERSIONS
+/*
+ * Undo any (non-zero) relocation the arch applied to the kernel kcrctab.
+ *
+ * The adjustment is made only when ARCH_RELOCATES_KCRCTAB is defined and
+ * @crc_owner is NULL; in every other case @crc is returned unchanged.
+ */
+static unsigned long maybe_relocated(unsigned long crc,
+                                     const struct module *crc_owner)
+{
+        unsigned long adjusted = crc;
+#ifdef ARCH_RELOCATES_KCRCTAB
+        if (!crc_owner)
+                adjusted -= (unsigned long)reloc_start;
+#endif
+        return adjusted;
+}
 static int check_version(Elf_Shdr *sechdrs,
                         unsigned int versindex,
                         const char *symname,
                         struct module *mod, 
-                         const unsigned long *crc)
+                         const unsigned long *crc,
+                         const struct module *crc_owner)
 {
        unsigned int i, num_versions;
        struct modversion_info *versions;
@@ -1055,10 +986,10 @@ static int check_version(Elf_Shdr *sechdrs,
                if (strcmp(versions[i].name, symname) != 0)
                        continue;
-                if (versions[i].crc == *crc)
+                if (versions[i].crc == maybe_relocated(*crc, crc_owner))
                        return 1;
                DEBUGP("Found checksum %lX vs module %lX\n",
-                       *crc, versions[i].crc);
+                       maybe_relocated(*crc, crc_owner), versions[i].crc);
                goto bad_version;
        }
@@ -1081,7 +1012,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
        if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
                         &crc, true, false))
                BUG();
-        return check_version(sechdrs, versindex, "module_layout", mod, crc);
+        return check_version(sechdrs, versindex, "module_layout", mod, crc,
+                             NULL);
 }
 /* First part is kernel version, which we ignore if module has crcs. */
@@ -1099,7 +1031,8 @@ static inline int check_version(Elf_Shdr *sechdrs,
                                unsigned int versindex,
                                const char *symname,
                                struct module *mod, 
-                                const unsigned long *crc)
+                                const unsigned long *crc,
+                                const struct module *crc_owner)
 {
        return 1;
 }
@@ -1134,8 +1067,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
        /* use_module can fail due to OOM,
           or module initialization or unloading */
        if (sym) {
-                if (!check_version(sechdrs, versindex, name, mod, crc) ||
+                if (!check_version(sechdrs, versindex, name, mod, crc, owner)
-                    !use_module(mod, owner))
+                    || !use_module(mod, owner))
                        sym = NULL;
        }
        return sym;
@@ -1146,6 +1079,12 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
 * J. Corbet <corbet@lwn.net>
 */
 #if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
+/* True when @sect is not flagged SHF_ALLOC or has a zero sh_size. */
+static inline bool sect_empty(const Elf_Shdr *sect)
+{
+        if (!(sect->sh_flags & SHF_ALLOC))
+                return true;
+        return sect->sh_size == 0;
+}
 struct module_sect_attr
 {
        struct module_attribute mattr;
@@ -1187,8 +1126,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
        /* Count loaded sections and allocate structures */
        for (i = 0; i < nsect; i++)
-                if (sechdrs[i].sh_flags & SHF_ALLOC
+                if (!sect_empty(&sechdrs[i]))
-                    && sechdrs[i].sh_size)
                        nloaded++;
        size[0] = ALIGN(sizeof(*sect_attrs)
                        + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1206,9 +1144,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
        sattr = &sect_attrs->attrs[0];
        gattr = &sect_attrs->grp.attrs[0];
        for (i = 0; i < nsect; i++) {
-                if (! (sechdrs[i].sh_flags & SHF_ALLOC))
+                if (sect_empty(&sechdrs[i]))
-                        continue;
-                if (!sechdrs[i].sh_size)
                        continue;
                sattr->address = sechdrs[i].sh_addr;
                sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
@@ -1216,6 +1152,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
                if (sattr->name == NULL)
                        goto out;
                sect_attrs->nsections++;
+                sysfs_attr_init(&sattr->mattr.attr);
                sattr->mattr.show = module_sect_show;
                sattr->mattr.store = NULL;
                sattr->mattr.attr.name = sattr->name;
@@ -1292,7 +1229,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
        /* Count notes sections and allocate structures.  */
        notes = 0;
        for (i = 0; i < nsect; i++)
-                if ((sechdrs[i].sh_flags & SHF_ALLOC) &&
+                if (!sect_empty(&sechdrs[i]) &&
                    (sechdrs[i].sh_type == SHT_NOTE))
                        ++notes;
@@ -1308,9 +1245,10 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
        notes_attrs->notes = notes;
        nattr = &notes_attrs->attrs[0];
        for (loaded = i = 0; i < nsect; ++i) {
-                if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+                if (sect_empty(&sechdrs[i]))
                        continue;
                if (sechdrs[i].sh_type == SHT_NOTE) {
+                        sysfs_bin_attr_init(nattr);
                        nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
                        nattr->attr.mode = S_IRUGO;
                        nattr->size = sechdrs[i].sh_size;
@@ -1383,6 +1321,7 @@ int module_add_modinfo_attrs(struct module *mod)
                if (!attr->test ||
                    (attr->test && attr->test(mod))) {
                        memcpy(temp_attr, attr, sizeof(*temp_attr));
+                        sysfs_attr_init(&temp_attr->attr);
                        error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
                        ++temp_attr;
                }
@@ -1528,11 +1467,10 @@ static void free_module(struct module *mod)
        /* This may be NULL, but that's OK */
        module_free(mod, mod->module_init);
        kfree(mod->args);
-        if (mod->percpu)
+        percpu_modfree(mod);
-                percpu_modfree(mod->percpu);
+#if defined(CONFIG_MODULE_UNLOAD)
-#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
        if (mod->refptr)
-                percpu_modfree(mod->refptr);
+                free_percpu(mod->refptr);
 #endif
        /* Free lock-classes: */
        lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -1648,7 +1586,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
                default:
                        /* Divert to percpu allocation if a percpu var. */
                        if (sym[i].st_shndx == pcpuindex)
-                                secbase = (unsigned long)mod->percpu;
+                                secbase = (unsigned long)mod_percpu(mod);
                        else
                                secbase = sechdrs[sym[i].st_shndx].sh_addr;
                        sym[i].st_value += secbase;
@@ -2046,9 +1984,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
        unsigned int i;
        /* only scan the sections containing data */
-        kmemleak_scan_area(mod->module_core, (unsigned long)mod -
+        kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
-                           (unsigned long)mod->module_core,
-                           sizeof(struct module), GFP_KERNEL);
        for (i = 1; i < hdr->e_shnum; i++) {
                if (!(sechdrs[i].sh_flags & SHF_ALLOC))
@@ -2057,8 +1993,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
                    && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
                        continue;
-                kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr -
+                kmemleak_scan_area((void *)sechdrs[i].sh_addr,
-                                   (unsigned long)mod->module_core,
                                   sechdrs[i].sh_size, GFP_KERNEL);
        }
 }
@@ -2085,7 +2020,7 @@ static noinline struct module *load_module(void __user *umod,
        unsigned int modindex, versindex, infoindex, pcpuindex;
        struct module *mod;
        long err = 0;
-        void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
+        void *ptr = NULL; /* Stops spurious gcc warning */
        unsigned long symoffs, stroffs, *strmap;
        mm_segment_t old_fs;
@@ -2225,15 +2160,11 @@ static noinline struct module *load_module(void __user *umod,
        if (pcpuindex) {
                /* We have a special allocation for this section. */
-                percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
+                err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size,
-                                         sechdrs[pcpuindex].sh_addralign,
+                                      sechdrs[pcpuindex].sh_addralign);
-                                         mod->name);
+                if (err)
-                if (!percpu) {
-                        err = -ENOMEM;
                        goto free_mod;
-                }
                sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
-                mod->percpu = percpu;
        }
        /* Determine total sizes, and put offsets in sh_entsize.  For now
@@ -2298,9 +2229,8 @@ static noinline struct module *load_module(void __user *umod,
        mod = (void *)sechdrs[modindex].sh_addr;
        kmemleak_load_module(mod, hdr, sechdrs, secstrings);
-#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+#if defined(CONFIG_MODULE_UNLOAD)
-        mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
+        mod->refptr = alloc_percpu(struct module_ref);
-                                      mod->name);
        if (!mod->refptr) {
                err = -ENOMEM;
                goto free_init;
@@ -2386,6 +2316,12 @@ static noinline struct module *load_module(void __user *umod,
                                         "_ftrace_events",
                                         sizeof(*mod->trace_events),
                                         &mod->num_trace_events);
+        /*
+         * This section contains pointers to allocated objects in the trace
+         * code and not scanning it leads to false positives.
+         */
+        kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
+                           mod->num_trace_events, GFP_KERNEL);
 #endif
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
        /* sechdrs[0].sh_size is always zero */
@@ -2443,7 +2379,7 @@ static noinline struct module *load_module(void __user *umod,
        sort_extable(mod->extable, mod->extable + mod->num_exentries);
        /* Finally, copy percpu area over. */
-        percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
+        percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
                       sechdrs[pcpuindex].sh_size);
        add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
@@ -2526,8 +2462,8 @@ static noinline struct module *load_module(void __user *umod,
        kobject_put(&mod->mkobj.kobj);
 free_unload:
        module_unload_free(mod);
-#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+#if defined(CONFIG_MODULE_UNLOAD)
-        percpu_modfree(mod->refptr);
+        free_percpu(mod->refptr);
 free_init:
 #endif
        module_free(mod, mod->module_init);
@@ -2535,8 +2471,7 @@ static noinline struct module *load_module(void __user *umod,
        module_free(mod, mod->module_core);
        /* mod will be freed with core. Don't access it beyond this line! */
 free_percpu:
-        if (percpu)
+        percpu_modfree(mod);
-                percpu_modfree(percpu);
 free_mod:
        kfree(args);
        kfree(strmap);
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 6b2d735846a5..57d527a16f9d 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -43,13 +43,13 @@ static inline void mutex_clear_owner(struct mutex *lock)
                                                        \
                DEBUG_LOCKS_WARN_ON(in_interrupt());    \
                local_irq_save(flags);                  \
-                __raw_spin_lock(&(lock)->raw_lock);     \
+                arch_spin_lock(&(lock)->rlock.raw_lock);\
                DEBUG_LOCKS_WARN_ON(l->magic != l);     \
        } while (0)
-#define spin_unlock_mutex(lock, flags)                  \
+#define spin_unlock_mutex(lock, flags)                          \
-        do {                                            \
+        do {                                                    \
-                __raw_spin_unlock(&(lock)->raw_lock);   \
+                arch_spin_unlock(&(lock)->rlock.raw_lock);      \
-                local_irq_restore(flags);               \
+                local_irq_restore(flags);                       \
-                preempt_check_resched();                \
+                preempt_check_resched();                        \
        } while (0)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 947b3ad551f8..632f04c57d82 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -148,8 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
        preempt_disable();
        mutex_acquire(&lock->dep_map, subclass, 0, ip);
-#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \
-    !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES)
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
        /*
         * Optimistic spinning.
         *
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 61d5aa5eced3..2488ba7eb568 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -78,10 +78,10 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
        int ret = NOTIFY_DONE;
        struct notifier_block *nb, *next_nb;
-        nb = rcu_dereference(*nl);
+        nb = rcu_dereference_raw(*nl);
        while (nb && nr_to_call) {
-                next_nb = rcu_dereference(nb->next);
+                next_nb = rcu_dereference_raw(nb->next);
 #ifdef CONFIG_DEBUG_NOTIFIERS
                if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
         * racy then it does not matter what the result of the test
         * is, we re-check the list after having taken the lock anyway:
         */
-        if (rcu_dereference(nh->head)) {
+        if (rcu_dereference_raw(nh->head)) {
                down_read(&nh->rwsem);
                ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
                                        nr_calls);
@@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
 static ATOMIC_NOTIFIER_HEAD(die_chain);
-int notrace notify_die(enum die_val val, const char *str,
+int notrace __kprobes notify_die(enum die_val val, const char *str,
               struct pt_regs *regs, long err, int trap, int sig)
 {
        struct die_args args = {
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 09b4ff9711b2..f74e6c00e26d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -13,6 +13,7 @@
 *             Pavel Emelianov <xemul@openvz.org>
 */
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/nsproxy.h>
 #include <linux/init_task.h>
@@ -24,7 +25,18 @@
 static struct kmem_cache *nsproxy_cachep;
-struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
+/*
+ * Statically initialized nsproxy: count starts at 1 and the uts, pid and
+ * (when configured) ipc and net members point at the init_* namespace
+ * objects. mnt_ns is NULL in this initializer.
+ * NOTE(review): presumably mnt_ns is assigned elsewhere later - confirm.
+ */
+struct nsproxy init_nsproxy = {
+        .count  = ATOMIC_INIT(1),
+        .uts_ns = &init_uts_ns,
+#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
+        .ipc_ns = &init_ipc_ns,
+#endif
+        .mnt_ns = NULL,
+        .pid_ns = &init_pid_ns,
+#ifdef CONFIG_NET
+        .net_ns = &init_net,
+#endif
+};
 static inline struct nsproxy *create_nsproxy(void)
 {
diff --git a/kernel/padata.c b/kernel/padata.c
new file mode 100644
index 000000000000..fd03513c7327
--- /dev/null
+++ b/kernel/padata.c
@@ -0,0 +1,697 @@
+/*
+ * padata.c - generic interface to process data streams in parallel
+ *
+ * Copyright (C) 2008, 2009 secunet Security Networks AG
+ * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include <linux/module.h>
+#include <linux/cpumask.h>
+#include <linux/err.h>
+#include <linux/cpu.h>
+#include <linux/padata.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/rcupdate.h>
+/*
+ * Upper bounds for sequence numbers and for objects in flight.
+ *
+ * Both expansions are fully parenthesized so that they behave as a single
+ * value inside larger expressions such as "MAX_SEQ_NR / num_cpus".
+ */
+#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
+#define MAX_OBJ_NUM (10000 * NR_CPUS)
+/*
+ * Translate a position within pd->cpumask into a cpu number by starting
+ * at the first cpu of the mask and advancing cpu_index times.
+ */
+static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
+{
+        int target_cpu = cpumask_first(pd->cpumask);
+        int steps = cpu_index;
+        while (steps-- > 0)
+                target_cpu = cpumask_next(target_cpu, pd->cpumask);
+        return target_cpu;
+}
+/*
+ * Pick the cpu an object is processed on: the object's sequence number
+ * modulo the number of cpus in pd->cpumask selects a position in the
+ * mask, which is then mapped to the cpu number.
+ */
+static int padata_cpu_hash(struct padata_priv *padata)
+{
+        struct parallel_data *pd = padata->pd;
+        int slot = padata->seq_nr % cpumask_weight(pd->cpumask);
+        return padata_index_to_cpu(pd, slot);
+}
+/*
+ * padata_parallel_worker - work function of a queue's parallel work item
+ *
+ * @work: the pwork member embedded in a struct padata_queue
+ *
+ * Takes over everything currently queued on the queue's parallel list and
+ * calls the parallel() callback of each object with BHs disabled.
+ */
+static void padata_parallel_worker(struct work_struct *work)
+{
+        struct padata_queue *queue;
+        struct parallel_data *pd;
+        struct padata_instance *pinst;
+        LIST_HEAD(local_list);
+        local_bh_disable();
+        queue = container_of(work, struct padata_queue, pwork);
+        pd = queue->pd;
+        /* NOTE(review): pinst is assigned here but not read in this function. */
+        pinst = pd->pinst;
+        /* Detach the pending objects under the lock; process them unlocked. */
+        spin_lock(&queue->parallel.lock);
+        list_replace_init(&queue->parallel.list, &local_list);
+        spin_unlock(&queue->parallel.lock);
+        while (!list_empty(&local_list)) {
+                struct padata_priv *padata;
+                padata = list_entry(local_list.next,
+                                    struct padata_priv, list);
+                /* Unlink before the callback runs. */
+                list_del_init(&padata->list);
+                padata->parallel(padata);
+        }
+        local_bh_enable();
+}
+/*
+ * padata_do_parallel - padata parallelization function
+ *
+ * @pinst: padata instance
+ * @padata: object to be parallelized
+ * @cb_cpu: cpu the serialization callback function will run on,
+ *          must be in the cpumask of padata.
+ *
+ * The parallelization callback function will run with BHs off.
+ * Note: Every object which is parallelized by padata_do_parallel
+ * must be seen by padata_do_serial.
+ *
+ * Return values, as set in the function body:
+ *   0            - PADATA_INIT is not set; the object was not queued
+ *   -EBUSY       - PADATA_RESET is set, or MAX_OBJ_NUM or more objects
+ *                  are already accounted in pd->refcnt
+ *   -EINVAL      - cb_cpu is not part of pd->cpumask
+ *   -EINPROGRESS - the object was queued for parallel processing
+ */
+int padata_do_parallel(struct padata_instance *pinst,
+                       struct padata_priv *padata, int cb_cpu)
+{
+        int target_cpu, err;
+        struct padata_queue *queue;
+        struct parallel_data *pd;
+        rcu_read_lock_bh();
+        pd = rcu_dereference(pinst->pd);
+        err = 0;
+        if (!(pinst->flags & PADATA_INIT))
+                goto out;
+        err =  -EBUSY;
+        if ((pinst->flags & PADATA_RESET))
+                goto out;
+        if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
+                goto out;
+        err = -EINVAL;
+        if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
+                goto out;
+        err = -EINPROGRESS;
+        /* Account the object; padata_serial_worker() drops this count. */
+        atomic_inc(&pd->refcnt);
+        padata->pd = pd;
+        padata->cb_cpu = cb_cpu;
+        /* Wrap the sequence counter so the next number handed out is 0. */
+        if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
+                atomic_set(&pd->seq_nr, -1);
+        padata->seq_nr = atomic_inc_return(&pd->seq_nr);
+        /* The sequence number decides which cpu does the parallel work. */
+        target_cpu = padata_cpu_hash(padata);
+        queue = per_cpu_ptr(pd->queue, target_cpu);
+        spin_lock(&queue->parallel.lock);
+        list_add_tail(&padata->list, &queue->parallel.list);
+        spin_unlock(&queue->parallel.lock);
+        queue_work_on(target_cpu, pinst->wq, &queue->pwork);
+out:
+        rcu_read_unlock_bh();
+        return err;
+}
+EXPORT_SYMBOL(padata_do_parallel);
+/*
+ * padata_get_next - find the object that has to be serialized next
+ *
+ * @pd: parallel_data whose per-cpu reorder lists are inspected
+ *
+ * Returns:
+ *   NULL                  - every reorder list in pd->cpumask is empty
+ *   an object             - removed from the reorder list of the queue that
+ *                           holds the lowest expected sequence number
+ *   ERR_PTR(-ENODATA)     - the selected queue is empty and next_nr maps
+ *                           to that queue's own cpu_index
+ *   ERR_PTR(-EINPROGRESS) - the selected queue is empty otherwise
+ */
+static struct padata_priv *padata_get_next(struct parallel_data *pd)
+{
+        int cpu, num_cpus, empty, calc_seq_nr;
+        int seq_nr, next_nr, overrun, next_overrun;
+        struct padata_queue *queue, *next_queue;
+        struct padata_priv *padata;
+        struct padata_list *reorder;
+        empty = 0;
+        next_nr = -1;
+        next_overrun = 0;
+        next_queue = NULL;
+        num_cpus = cpumask_weight(pd->cpumask);
+        for_each_cpu(cpu, pd->cpumask) {
+                queue = per_cpu_ptr(pd->queue, cpu);
+                reorder = &queue->reorder;
+                /*
+                 * Calculate the seq_nr of the object that should be
+                 * next in this queue.
+                 */
+                overrun = 0;
+                calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
+                               + queue->cpu_index;
+                /* Past max_seq_nr the numbering restarts at zero. */
+                if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
+                        calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
+                        overrun = 1;
+                }
+                if (!list_empty(&reorder->list)) {
+                        padata = list_entry(reorder->list.next,
+                                            struct padata_priv, list);
+                        seq_nr  = padata->seq_nr;
+                        /* The list head must carry the expected number. */
+                        BUG_ON(calc_seq_nr != seq_nr);
+                } else {
+                        seq_nr = calc_seq_nr;
+                        empty++;
+                }
+                /*
+                 * Remember the queue with the smallest sequence number; a
+                 * queue that has not wrapped replaces one that has.
+                 */
+                if (next_nr < 0 || seq_nr < next_nr
+                    || (next_overrun && !overrun)) {
+                        next_nr = seq_nr;
+                        next_overrun = overrun;
+                        next_queue = queue;
+                }
+        }
+        padata = NULL;
+        if (empty == num_cpus)
+                goto out;
+        reorder = &next_queue->reorder;
+        if (!list_empty(&reorder->list)) {
+                padata = list_entry(reorder->list.next,
+                                    struct padata_priv, list);
+                /* The chosen queue wrapped: restart counting on all queues. */
+                if (unlikely(next_overrun)) {
+                        for_each_cpu(cpu, pd->cpumask) {
+                                queue = per_cpu_ptr(pd->queue, cpu);
+                                atomic_set(&queue->num_obj, 0);
+                        }
+                }
+                spin_lock(&reorder->lock);
+                list_del_init(&padata->list);
+                atomic_dec(&pd->reorder_objects);
+                spin_unlock(&reorder->lock);
+                atomic_inc(&next_queue->num_obj);
+                goto out;
+        }
+        if (next_nr % num_cpus == next_queue->cpu_index) {
+                padata = ERR_PTR(-ENODATA);
+                goto out;
+        }
+        padata = ERR_PTR(-EINPROGRESS);
+out:
+        return padata;
+}
+/*
+ * padata_reorder - hand objects over to serialization in sequence order
+ *
+ * @pd: parallel_data to work on
+ *
+ * Only one caller at a time works on pd: if pd->lock cannot be taken the
+ * function returns immediately. Objects returned by padata_get_next() are
+ * appended to the serial list of their callback cpu and that cpu's serial
+ * work item is queued.
+ */
+static void padata_reorder(struct parallel_data *pd)
+{
+        struct padata_priv *padata;
+        struct padata_queue *queue;
+        struct padata_instance *pinst = pd->pinst;
+try_again:
+        if (!spin_trylock_bh(&pd->lock))
+                goto out;
+        while (1) {
+                padata = padata_get_next(pd);
+                /* Nothing to hand over right now; recheck after unlocking. */
+                if (!padata || PTR_ERR(padata) == -EINPROGRESS)
+                        break;
+                /* -ENODATA: give up without rechecking reorder_objects. */
+                if (PTR_ERR(padata) == -ENODATA) {
+                        spin_unlock_bh(&pd->lock);
+                        goto out;
+                }
+                queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
+                spin_lock(&queue->serial.lock);
+                list_add_tail(&padata->list, &queue->serial.list);
+                spin_unlock(&queue->serial.lock);
+                queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork);
+        }
+        spin_unlock_bh(&pd->lock);
+        /* Objects may have been added while the lock was held; retry. */
+        if (atomic_read(&pd->reorder_objects))
+                goto try_again;
+out:
+        return;
+}
+/*
+ * padata_serial_worker - work function of a queue's serial work item
+ *
+ * @work: the swork member embedded in a struct padata_queue
+ *
+ * Takes over everything currently queued on the queue's serial list, calls
+ * the serial() callback of each object with BHs disabled and drops one
+ * pd->refcnt count per object.
+ */
+static void padata_serial_worker(struct work_struct *work)
+{
+        struct padata_queue *queue;
+        struct parallel_data *pd;
+        LIST_HEAD(local_list);
+        local_bh_disable();
+        queue = container_of(work, struct padata_queue, swork);
+        pd = queue->pd;
+        /* Detach the pending objects under the lock; process them unlocked. */
+        spin_lock(&queue->serial.lock);
+        list_replace_init(&queue->serial.list, &local_list);
+        spin_unlock(&queue->serial.lock);
+        while (!list_empty(&local_list)) {
+                struct padata_priv *padata;
+                padata = list_entry(local_list.next,
+                                    struct padata_priv, list);
+                list_del_init(&padata->list);
+                padata->serial(padata);
+                /* Pairs with the atomic_inc() in padata_do_parallel(). */
+                atomic_dec(&pd->refcnt);
+        }
+        local_bh_enable();
+}
+/*
+ * padata_do_serial - padata serialization function
+ *
+ * @padata: object to be serialized.
+ *
+ * padata_do_serial must be called for every parallelized object.
+ * The serialization callback function will run with BHs off.
+ *
+ * The object is appended to the reorder list of the cpu this function
+ * runs on, then padata_reorder() is called for the object's parallel_data.
+ */
+void padata_do_serial(struct padata_priv *padata)
+{
+        int cpu;
+        struct padata_queue *queue;
+        struct parallel_data *pd;
+        pd = padata->pd;
+        /* get_cpu()/put_cpu() bracket the use of this cpu's queue. */
+        cpu = get_cpu();
+        queue = per_cpu_ptr(pd->queue, cpu);
+        spin_lock(&queue->reorder.lock);
+        atomic_inc(&pd->reorder_objects);
+        list_add_tail(&padata->list, &queue->reorder.list);
+        spin_unlock(&queue->reorder.lock);
+        put_cpu();
+        padata_reorder(pd);
+}
+EXPORT_SYMBOL(padata_do_serial);
+/*
+ * padata_alloc_pd - allocate and initialize a parallel_data structure
+ *
+ * @pinst: padata instance the new structure belongs to
+ * @cpumask: requested cpumask; the effective mask is its intersection
+ *           with cpu_active_mask
+ *
+ * Returns the new parallel_data, or NULL if an allocation fails.
+ */
+static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
+                                             const struct cpumask *cpumask)
+{
+        int cpu, cpu_index, num_cpus;
+        struct padata_queue *queue;
+        struct parallel_data *pd;
+        cpu_index = 0;
+        pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
+        if (!pd)
+                goto err;
+        pd->queue = alloc_percpu(struct padata_queue);
+        if (!pd->queue)
+                goto err_free_pd;
+        if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
+                goto err_free_queue;
+        for_each_possible_cpu(cpu) {
+                queue = per_cpu_ptr(pd->queue, cpu);
+                queue->pd = pd;
+                /*
+                 * Cpus that are both requested and active get consecutive
+                 * indices; every other cpu is marked with -1.
+                 */
+                if (cpumask_test_cpu(cpu, cpumask)
+                    && cpumask_test_cpu(cpu, cpu_active_mask)) {
+                        queue->cpu_index = cpu_index;
+                        cpu_index++;
+                } else
+                        queue->cpu_index = -1;
+                INIT_LIST_HEAD(&queue->reorder.list);
+                INIT_LIST_HEAD(&queue->parallel.list);
+                INIT_LIST_HEAD(&queue->serial.list);
+                spin_lock_init(&queue->reorder.lock);
+                spin_lock_init(&queue->parallel.lock);
+                spin_lock_init(&queue->serial.lock);
+                INIT_WORK(&queue->pwork, padata_parallel_worker);
+                INIT_WORK(&queue->swork, padata_serial_worker);
+                atomic_set(&queue->num_obj, 0);
+        }
+        cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
+        num_cpus = cpumask_weight(pd->cpumask);
+        /*
+         * NOTE(review): num_cpus is used as a divisor below and an empty
+         * effective mask is not checked for here - confirm callers never
+         * pass one.
+         */
+        pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
+        /* Start at -1 so that the first sequence number handed out is 0. */
+        atomic_set(&pd->seq_nr, -1);
+        atomic_set(&pd->reorder_objects, 0);
+        atomic_set(&pd->refcnt, 0);
+        pd->pinst = pinst;
+        spin_lock_init(&pd->lock);
+        return pd;
+err_free_queue:
+        free_percpu(pd->queue);
+err_free_pd:
+        kfree(pd);
+err:
+        return NULL;
+}
+/*
+ * Release a parallel_data structure: its cpumask, its per-cpu queues and
+ * finally the structure itself.
+ */
+static void padata_free_pd(struct parallel_data *pd)
+{
+        free_cpumask_var(pd->cpumask);
+        free_percpu(pd->queue);
+        kfree(pd);
+}
+/*
+ * padata_replace - install a new parallel_data and dispose of the old one
+ *
+ * @pinst: padata instance to update
+ * @pd_new: replacement parallel_data
+ *
+ * PADATA_RESET is set for the duration of the switch, which makes
+ * padata_do_parallel() return -EBUSY. The old structure is freed only
+ * after its refcnt dropped to zero and the workqueue has been flushed.
+ */
+static void padata_replace(struct padata_instance *pinst,
+                           struct parallel_data *pd_new)
+{
+        struct parallel_data *pd_old = pinst->pd;
+        pinst->flags |= PADATA_RESET;
+        rcu_assign_pointer(pinst->pd, pd_new);
+        /*
+         * NOTE(review): padata_do_parallel() reads pinst->pd under
+         * rcu_read_lock_bh(); confirm synchronize_rcu() is the matching
+         * grace-period primitive for those readers.
+         */
+        synchronize_rcu();
+        /* Wait until every object accounted on the old pd was serialized. */
+        while (atomic_read(&pd_old->refcnt) != 0)
+                yield();
+        flush_workqueue(pinst->wq);
+        padata_free_pd(pd_old);
+        pinst->flags &= ~PADATA_RESET;
+}
+/*
+ * padata_set_cpumask - set the cpumask that padata should use
+ *
+ * @pinst: padata instance
+ * @cpumask: the cpumask to use
+ *
+ * May sleep. Returns 0 on success or -ENOMEM if the new parallel_data
+ * could not be allocated; in that case the instance is left unchanged.
+ */
+int padata_set_cpumask(struct padata_instance *pinst,
+                        cpumask_var_t cpumask)
+{
+        struct parallel_data *pd;
+        int err = 0;
+        might_sleep();
+        mutex_lock(&pinst->lock);
+        /* Build the replacement first so that a failure changes nothing. */
+        pd = padata_alloc_pd(pinst, cpumask);
+        if (!pd) {
+                err = -ENOMEM;
+                goto out;
+        }
+        cpumask_copy(pinst->cpumask, cpumask);
+        padata_replace(pinst, pd);
+out:
+        mutex_unlock(&pinst->lock);
+        return err;
+}
+EXPORT_SYMBOL(padata_set_cpumask);
+/*
+ * Rebuild the parallel_data from pinst->cpumask if @cpu is currently in
+ * cpu_active_mask; otherwise there is nothing to do. Returns 0 on success
+ * or -ENOMEM if the new parallel_data cannot be allocated.
+ */
+static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
+{
+        struct parallel_data *new_pd;
+        if (!cpumask_test_cpu(cpu, cpu_active_mask))
+                return 0;
+        new_pd = padata_alloc_pd(pinst, pinst->cpumask);
+        if (!new_pd)
+                return -ENOMEM;
+        padata_replace(pinst, new_pd);
+        return 0;
+}
+/*
+ * padata_add_cpu - add a cpu to the padata cpumask
+ *
+ * @pinst: padata instance
+ * @cpu: cpu to add
+ *
+ * May sleep. The cpu is set in pinst->cpumask before the parallel_data is
+ * rebuilt. Returns the result of __padata_add_cpu(): 0 or -ENOMEM.
+ */
+int padata_add_cpu(struct padata_instance *pinst, int cpu)
+{
+        int err;
+        might_sleep();
+        mutex_lock(&pinst->lock);
+        cpumask_set_cpu(cpu, pinst->cpumask);
+        err = __padata_add_cpu(pinst, cpu);
+        mutex_unlock(&pinst->lock);
+        return err;
+}
+EXPORT_SYMBOL(padata_add_cpu);
+/*
+ * Rebuild the parallel_data from pinst->cpumask if @cpu is currently in
+ * cpu_online_mask; otherwise there is nothing to do. Returns 0 on success
+ * or -ENOMEM if the new parallel_data cannot be allocated.
+ */
+static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
+{
+        struct parallel_data *new_pd;
+        if (!cpumask_test_cpu(cpu, cpu_online_mask))
+                return 0;
+        new_pd = padata_alloc_pd(pinst, pinst->cpumask);
+        if (!new_pd)
+                return -ENOMEM;
+        padata_replace(pinst, new_pd);
+        return 0;
+}
+/*
+ * padata_remove_cpu - remove a cpu from the padata cpumask
+ *
+ * @pinst: padata instance
+ * @cpu: cpu to remove
+ *
+ * May sleep. The cpu is cleared in pinst->cpumask before the parallel_data
+ * is rebuilt. Returns the result of __padata_remove_cpu(): 0 or -ENOMEM.
+ */
+int padata_remove_cpu(struct padata_instance *pinst, int cpu)
+{
+        int err;
+        might_sleep();
+        mutex_lock(&pinst->lock);
+        cpumask_clear_cpu(cpu, pinst->cpumask);
+        err = __padata_remove_cpu(pinst, cpu);
+        mutex_unlock(&pinst->lock);
+        return err;
+}
+EXPORT_SYMBOL(padata_remove_cpu);
+/*
+ * padata_start - start the parallel processing
+ *
+ * @pinst: padata instance to start
+ *
+ * May sleep. Sets PADATA_INIT under pinst->lock; padata_do_parallel()
+ * only queues objects while this flag is set.
+ */
+void padata_start(struct padata_instance *pinst)
+{
+        might_sleep();
+        mutex_lock(&pinst->lock);
+        pinst->flags |= PADATA_INIT;
+        mutex_unlock(&pinst->lock);
+}
+EXPORT_SYMBOL(padata_start);
+/*
+ * padata_stop - stop the parallel processing
+ *
+ * @pinst: padata instance to stop
+ *
+ * May sleep. Clears PADATA_INIT under pinst->lock, after which
+ * padata_do_parallel() returns 0 without queuing the object.
+ */
+void padata_stop(struct padata_instance *pinst)
+{
+        might_sleep();
+        mutex_lock(&pinst->lock);
+        pinst->flags &= ~PADATA_INIT;
+        mutex_unlock(&pinst->lock);
+}
+EXPORT_SYMBOL(padata_stop);
+/*
+ * padata_cpu_callback - cpu hotplug notifier of a padata instance
+ *
+ * @nfb: the cpu_notifier member embedded in a struct padata_instance
+ * @action: hotplug event
+ * @hcpu: number of the affected cpu, passed as a pointer-sized value
+ *
+ * Events for cpus that are not part of pinst->cpumask are ignored. For the
+ * others the parallel_data is rebuilt under pinst->lock. Returns NOTIFY_BAD
+ * if the rebuild fails for CPU_ONLINE or CPU_DOWN_PREPARE, NOTIFY_OK
+ * otherwise.
+ */
+static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
+                                         unsigned long action, void *hcpu)
+{
+        int err;
+        struct padata_instance *pinst;
+        int cpu = (unsigned long)hcpu;
+        pinst = container_of(nfb, struct padata_instance, cpu_notifier);
+        switch (action) {
+        case CPU_ONLINE:
+        case CPU_ONLINE_FROZEN:
+                if (!cpumask_test_cpu(cpu, pinst->cpumask))
+                        break;
+                mutex_lock(&pinst->lock);
+                err = __padata_add_cpu(pinst, cpu);
+                mutex_unlock(&pinst->lock);
+                if (err)
+                        return NOTIFY_BAD;
+                break;
+        case CPU_DOWN_PREPARE:
+        case CPU_DOWN_PREPARE_FROZEN:
+                if (!cpumask_test_cpu(cpu, pinst->cpumask))
+                        break;
+                mutex_lock(&pinst->lock);
+                err = __padata_remove_cpu(pinst, cpu);
+                mutex_unlock(&pinst->lock);
+                if (err)
+                        return NOTIFY_BAD;
+                break;
+        case CPU_UP_CANCELED:
+        case CPU_UP_CANCELED_FROZEN:
+                if (!cpumask_test_cpu(cpu, pinst->cpumask))
+                        break;
+                mutex_lock(&pinst->lock);
+                __padata_remove_cpu(pinst, cpu);
+                mutex_unlock(&pinst->lock);
+                /*
+                 * Stop here: falling through into CPU_DOWN_FAILED would
+                 * run the add path and rebuild the parallel_data a second
+                 * time for a cancelled bring-up.
+                 */
+                break;
+        case CPU_DOWN_FAILED:
+        case CPU_DOWN_FAILED_FROZEN:
+                if (!cpumask_test_cpu(cpu, pinst->cpumask))
+                        break;
+                mutex_lock(&pinst->lock);
+                __padata_add_cpu(pinst, cpu);
+                mutex_unlock(&pinst->lock);
+                break;
+        }
+        return NOTIFY_OK;
+}
+/*
+ * padata_alloc - allocate and initialize a padata instance
+ *
+ * @cpumask: cpumask that padata uses for parallelization
+ * @wq: workqueue to use for the allocated padata instance
+ *
+ * Returns the new instance, or NULL if an allocation or the registration
+ * of the cpu hotplug notifier fails. The instance starts with no flags
+ * set, i.e. parallel processing has to be enabled with padata_start().
+ */
+struct padata_instance *padata_alloc(const struct cpumask *cpumask,
+                                     struct workqueue_struct *wq)
+{
+        int err;
+        struct padata_instance *pinst;
+        struct parallel_data *pd;
+        pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
+        if (!pinst)
+                goto err;
+        pd = padata_alloc_pd(pinst, cpumask);
+        if (!pd)
+                goto err_free_inst;
+        if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
+                goto err_free_pd;
+        rcu_assign_pointer(pinst->pd, pd);
+        pinst->wq = wq;
+        cpumask_copy(pinst->cpumask, cpumask);
+        pinst->flags = 0;
+        /*
+         * padata_cpu_callback() takes pinst->lock, so the mutex has to be
+         * initialized before the notifier is registered and can fire.
+         */
+        mutex_init(&pinst->lock);
+        pinst->cpu_notifier.notifier_call = padata_cpu_callback;
+        pinst->cpu_notifier.priority = 0;
+        err = register_hotcpu_notifier(&pinst->cpu_notifier);
+        if (err)
+                goto err_free_cpumask;
+        return pinst;
+err_free_cpumask:
+        free_cpumask_var(pinst->cpumask);
+err_free_pd:
+        padata_free_pd(pd);
+err_free_inst:
+        kfree(pinst);
+err:
+        return NULL;
+}
+EXPORT_SYMBOL(padata_alloc);
+/*
+ * padata_free - free a padata instance
+ *
+ * @pinst: padata instance to free
+ *
+ * Stops the instance, waits until no object is accounted on its
+ * parallel_data any more, unregisters the cpu hotplug notifier and
+ * releases all memory owned by the instance.
+ */
+void padata_free(struct padata_instance *pinst)
+{
+        padata_stop(pinst);
+        synchronize_rcu();
+        /* Wait until every object in flight has been serialized. */
+        while (atomic_read(&pinst->pd->refcnt) != 0)
+                yield();
+        unregister_hotcpu_notifier(&pinst->cpu_notifier);
+        padata_free_pd(pinst->pd);
+        free_cpumask_var(pinst->cpumask);
+        kfree(pinst);
+}
+EXPORT_SYMBOL(padata_free);
diff --git a/kernel/panic.c b/kernel/panic.c
index 96b45d0b4ba5..13d966b4c14a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -10,6 +10,7 @@
 */
 #include <linux/debug_locks.h>
 #include <linux/interrupt.h>
+#include <linux/kmsg_dump.h>
 #include <linux/kallsyms.h>
 #include <linux/notifier.h>
 #include <linux/module.h>
@@ -35,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
 EXPORT_SYMBOL(panic_notifier_list);
-static long no_blink(long time)
-{
-        return 0;
-}
 /* Returns how long it waited in ms */
 long (*panic_blink)(long time);
 EXPORT_SYMBOL(panic_blink);
+/*
+ * Delay for about one second, calling panic_blink() along the way when a
+ * blink handler is installed.
+ */
+static void panic_blink_one_second(void)
+{
+        /*
+         * Static on purpose: i keeps counting across calls, so the value
+         * handed to panic_blink() keeps growing from one call to the next.
+         */
+        static long i = 0, end;
+        if (panic_blink) {
+                end = i + MSEC_PER_SEC;
+                while (i < end) {
+                        /* panic_blink() returns how long it waited in ms. */
+                        i += panic_blink(i);
+                        mdelay(1);
+                        i++;
+                }
+        } else {
+                /*
+                 * When running under a hypervisor a small mdelay may get
+                 * rounded up to the hypervisor timeslice. For example, with
+                 * a 1ms in 10ms hypervisor timeslice we might inflate a
+                 * mdelay(1) loop by 10x.
+                 *
+                 * If we have nothing to blink, spin on 1 second calls to
+                 * mdelay to avoid this.
+                 */
+                mdelay(MSEC_PER_SEC);
+        }
+}
 /**
 *      panic - halt the system
 *      @fmt: The text string to print
@@ -81,6 +103,8 @@ NORET_TYPE void panic(const char * fmt, ...)
         */
        crash_kexec(NULL);
+        kmsg_dump(KMSG_DUMP_PANIC);
        /*
         * Note smp_send_stop is the usual smp shutdown function, which
         * unfortunately means it may not be hardened to work in a panic
@@ -92,9 +116,6 @@ NORET_TYPE void panic(const char * fmt, ...)
        bust_spinlocks(0);
-        if (!panic_blink)
-                panic_blink = no_blink;
        if (panic_timeout > 0) {
                /*
                 * Delay timeout seconds before rebooting the machine.
@@ -102,11 +123,9 @@ NORET_TYPE void panic(const char * fmt, ...)
                 */
                printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
-                for (i = 0; i < panic_timeout*1000; ) {
+                for (i = 0; i < panic_timeout; i++) {
                        touch_nmi_watchdog();
-                        i += panic_blink(i);
+                        panic_blink_one_second();
-                        mdelay(1);
-                        i++;
                }
                /*
                 * This will not be a clean reboot, with everything
@@ -132,11 +151,9 @@ NORET_TYPE void panic(const char * fmt, ...)
        }
 #endif
        local_irq_enable();
-        for (i = 0; ; ) {
+        while (1) {
                touch_softlockup_watchdog();
-                i += panic_blink(i);
+                panic_blink_one_second();
-                mdelay(1);
-                i++;
        }
 }
@@ -339,6 +356,7 @@ void oops_exit(void)
 {
        do_oops_enter_exit();
        print_oops_end_marker();
+        kmsg_dump(KMSG_DUMP_OOPS);
 }
 #ifdef WANT_WARN_ON_SLOWPATH
diff --git a/kernel/params.c b/kernel/params.c
index d656c276508d..0b30ecd53a52 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -122,9 +122,7 @@ static char *next_arg(char *args, char **param, char **val)
                next = args + i;
        /* Chew up trailing spaces. */
-        while (isspace(*next))
+        return skip_spaces(next);
-                next++;
-        return next;
 }
 /* Args looks like "foo=bar,bar2 baz=fuz wiz". */
@@ -139,8 +137,7 @@ int parse_args(const char *name,
        DEBUGP("Parsing ARGS: %s\n", args);
        /* Chew leading spaces */
-        while (isspace(*args))
+        args = skip_spaces(args);
-                args++;
        while (*args) {
                int ret;
@@ -404,8 +401,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
 }
 /* sysfs output in /sys/modules/XYZ/parameters/ */
-#define to_module_attr(n) container_of(n, struct module_attribute, attr);
+#define to_module_attr(n) container_of(n, struct module_attribute, attr)
-#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
+#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
 extern struct kernel_param __start___param[], __stop___param[];
@@ -423,7 +420,7 @@ struct module_param_attrs
 };
 #ifdef CONFIG_SYSFS
-#define to_param_attr(n) container_of(n, struct param_attribute, mattr);
+#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
 static ssize_t param_attr_show(struct module_attribute *mattr,
                               struct module *mod, char *buf)
@@ -519,6 +516,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
        new->grp.attrs = attrs;
        /* Tack new one on the end. */
+        sysfs_attr_init(&new->attrs[num].mattr.attr);
        new->attrs[num].param = kp;
        new->attrs[num].mattr.show = param_attr_show;
        new->attrs[num].mattr.store = param_attr_store;
@@ -725,7 +723,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
        return ret;
 }
-static struct sysfs_ops module_sysfs_ops = {
+static const struct sysfs_ops module_sysfs_ops = {
        .show = module_attr_show,
        .store = module_attr_store,
 };
@@ -739,7 +737,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj)
        return 0;
 }
-static struct kset_uevent_ops module_uevent_ops = {
+static const struct kset_uevent_ops module_uevent_ops = {
        .filter = uevent_filter,
 };
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 7f29643c8985..3d1552d3c12b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -15,6 +15,7 @@
 #include <linux/smp.h>
 #include <linux/file.h>
 #include <linux/poll.h>
+#include <linux/slab.h>
 #include <linux/sysfs.h>
 #include <linux/dcache.h>
 #include <linux/percpu.h>
@@ -28,13 +29,15 @@
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
+#include <linux/ftrace_event.h>
+#include <linux/hw_breakpoint.h>
 #include <asm/irq_regs.h>
 /*
 * Each CPU has a list of per CPU events:
 */
-DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
 int perf_max_events __read_mostly = 1;
 static int perf_reserved_percpu __read_mostly;
@@ -54,21 +57,6 @@ static atomic_t nr_task_events __read_mostly;
 */
 int sysctl_perf_event_paranoid __read_mostly = 1;
-static inline bool perf_paranoid_tracepoint_raw(void)
-{
-        return sysctl_perf_event_paranoid > -1;
-}
-static inline bool perf_paranoid_cpu(void)
-{
-        return sysctl_perf_event_paranoid > 0;
-}
-static inline bool perf_paranoid_kernel(void)
-{
-        return sysctl_perf_event_paranoid > 1;
-}
 int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
 /*
@@ -94,13 +82,10 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
 void __weak hw_perf_disable(void)               { barrier(); }
 void __weak hw_perf_enable(void)                { barrier(); }
-void __weak hw_perf_event_setup(int cpu)        { barrier(); }
-void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
 int __weak
 hw_perf_group_sched_in(struct perf_event *group_leader,
               struct perf_cpu_context *cpuctx,
-               struct perf_event_context *ctx, int cpu)
+               struct perf_event_context *ctx)
 {
        return 0;
 }
@@ -109,25 +94,15 @@ void __weak perf_event_print_debug(void)	{ }
 static DEFINE_PER_CPU(int, perf_disable_count);
-void __perf_disable(void)
-{
-        __get_cpu_var(perf_disable_count)++;
-}
-bool __perf_enable(void)
-{
-        return !--__get_cpu_var(perf_disable_count);
-}
 void perf_disable(void)
 {
-        __perf_disable();
+        if (!__get_cpu_var(perf_disable_count)++)
-        hw_perf_disable();
+                hw_perf_disable();
 }
 void perf_enable(void)
 {
-        if (__perf_enable())
+        if (!--__get_cpu_var(perf_disable_count))
                hw_perf_enable();
 }
@@ -201,14 +176,14 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
                 * if so.  If we locked the right context, then it
                 * can't get swapped on us any more.
                 */
-                spin_lock_irqsave(&ctx->lock, *flags);
+                raw_spin_lock_irqsave(&ctx->lock, *flags);
                if (ctx != rcu_dereference(task->perf_event_ctxp)) {
-                        spin_unlock_irqrestore(&ctx->lock, *flags);
+                        raw_spin_unlock_irqrestore(&ctx->lock, *flags);
                        goto retry;
                }
                if (!atomic_inc_not_zero(&ctx->refcount)) {
-                        spin_unlock_irqrestore(&ctx->lock, *flags);
+                        raw_spin_unlock_irqrestore(&ctx->lock, *flags);
                        ctx = NULL;
                }
        }
@@ -229,7 +204,7 @@ static struct perf_event_context *perf_pin_task_context(struct task_struct *task
        ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
                ++ctx->pin_count;
-                spin_unlock_irqrestore(&ctx->lock, flags);
+                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
        return ctx;
 }
@@ -238,12 +213,64 @@ static void perf_unpin_context(struct perf_event_context *ctx)
 {
        unsigned long flags;
-        spin_lock_irqsave(&ctx->lock, flags);
+        raw_spin_lock_irqsave(&ctx->lock, flags);
        --ctx->pin_count;
-        spin_unlock_irqrestore(&ctx->lock, flags);
+        raw_spin_unlock_irqrestore(&ctx->lock, flags);
        put_ctx(ctx);
 }
+/*
+ * Timestamp source for perf: cpu_clock() of the cpu this code is
+ * currently running on.
+ */
+static inline u64 perf_clock(void)
+{
+        return cpu_clock(raw_smp_processor_id());
+}
+/*
+ * Advance a context's accumulated time by the time elapsed since its
+ * last timestamp, and move the timestamp to now.
+ */
+static void update_context_time(struct perf_event_context *ctx)
+{
+        u64 now = perf_clock();
+        u64 elapsed = now - ctx->timestamp;
+        ctx->timestamp = now;
+        ctx->time += elapsed;
+}
+/*
+ * Update the total_time_enabled and total_time_running fields for an
+ * event. Nothing is done while the event or its group leader is in a
+ * state below PERF_EVENT_STATE_INACTIVE.
+ */
+static void update_event_times(struct perf_event *event)
+{
+        struct perf_event_context *ctx = event->ctx;
+        u64 enabled_end, running_end;
+        if (event->state < PERF_EVENT_STATE_INACTIVE ||
+            event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
+                return;
+        /* Enabled time runs up to the context time while ctx is active. */
+        enabled_end = ctx->is_active ? ctx->time : event->tstamp_stopped;
+        event->total_time_enabled = enabled_end - event->tstamp_enabled;
+        /* Running time of an inactive event ends when it was stopped. */
+        running_end = (event->state == PERF_EVENT_STATE_INACTIVE) ?
+                        event->tstamp_stopped : ctx->time;
+        event->total_time_running = running_end - event->tstamp_running;
+}
+/*
+ * Select the context's group list an event belongs on: pinned events go
+ * on pinned_groups, all others on flexible_groups.
+ */
+static struct list_head *
+ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+{
+        return event->attr.pinned ? &ctx->pinned_groups
+                                  : &ctx->flexible_groups;
+}
 /*
 * Add a event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
@@ -258,9 +285,19 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
         * add it straight to the context's event list, or to the group
         * leader's sibling list:
         */
-        if (group_leader == event)
+        if (group_leader == event) {
-                list_add_tail(&event->group_entry, &ctx->group_list);
+                struct list_head *list;
-        else {
+                if (is_software_event(event))
+                        event->group_flags |= PERF_GROUP_SOFTWARE;
+                list = ctx_group_list(event, ctx);
+                list_add_tail(&event->group_entry, list);
+        } else {
+                if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
+                    !is_software_event(event))
+                        group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
                list_add_tail(&event->group_entry, &group_leader->sibling_list);
                group_leader->nr_siblings++;
        }
@@ -292,15 +329,32 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
        if (event->group_leader != event)
                event->group_leader->nr_siblings--;
+        update_event_times(event);
+        /*
+         * If event was in error state, then keep it
+         * that way, otherwise bogus counts will be
+         * returned on read(). The only way to get out
+         * of error state is by explicit re-enabling
+         * of the event
+         */
+        if (event->state > PERF_EVENT_STATE_OFF)
+                event->state = PERF_EVENT_STATE_OFF;
        /*
         * If this was a group event with sibling events then
         * upgrade the siblings to singleton events by adding them
         * to the context list directly:
         */
        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
+                struct list_head *list;
-                list_move_tail(&sibling->group_entry, &ctx->group_list);
+                list = ctx_group_list(event, ctx);
+                list_move_tail(&sibling->group_entry, list);
                sibling->group_leader = sibling;
+                /* Inherit group flags from the previous leader */
+                sibling->group_flags = event->group_flags;
        }
 }
@@ -370,7 +424,7 @@ static void __perf_event_remove_from_context(void *info)
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;
-        spin_lock(&ctx->lock);
+        raw_spin_lock(&ctx->lock);
        /*
         * Protect the list operation against NMI by disabling the
         * events on a global level.
@@ -392,7 +446,7 @@ static void __perf_event_remove_from_context(void *info)
        }
        perf_enable();
-        spin_unlock(&ctx->lock);
+        raw_spin_unlock(&ctx->lock);
 }
@@ -419,7 +473,7 @@ static void perf_event_remove_from_context(struct perf_event *event)
        if (!task) {
                /*
                 * Per cpu events are removed via an smp call and
-                 * the removal is always sucessful.
+                 * the removal is always successful.
                 */
                smp_call_function_single(event->cpu,
                                         __perf_event_remove_from_context,
@@ -431,12 +485,12 @@ retry:
        task_oncpu_function_call(task, __perf_event_remove_from_context,
                                 event);
-        spin_lock_irq(&ctx->lock);
+        raw_spin_lock_irq(&ctx->lock);
        /*
         * If the context is active we need to retry the smp call.
         */
        if (ctx->nr_active && !list_empty(&event->group_entry)) {
-                spin_unlock_irq(&ctx->lock);
+                raw_spin_unlock_irq(&ctx->lock);
                goto retry;
        }
@@ -445,48 +499,9 @@ retry:
         * can remove the event safely, if the call above did not
         * succeed.
         */
-        if (!list_empty(&event->group_entry)) {
+        if (!list_empty(&event->group_entry))
                list_del_event(event, ctx);
-        }
+        raw_spin_unlock_irq(&ctx->lock);
-        spin_unlock_irq(&ctx->lock);
-}
-static inline u64 perf_clock(void)
-{
-        return cpu_clock(smp_processor_id());
-}
-/*
- * Update the record of the current time in a context.
- */
-static void update_context_time(struct perf_event_context *ctx)
-{
-        u64 now = perf_clock();
-        ctx->time += now - ctx->timestamp;
-        ctx->timestamp = now;
-}
-/*
- * Update the total_time_enabled and total_time_running fields for a event.
- */
-static void update_event_times(struct perf_event *event)
-{
-        struct perf_event_context *ctx = event->ctx;
-        u64 run_end;
-        if (event->state < PERF_EVENT_STATE_INACTIVE ||
-            event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
-                return;
-        event->total_time_enabled = ctx->time - event->tstamp_enabled;
-        if (event->state == PERF_EVENT_STATE_INACTIVE)
-                run_end = event->tstamp_stopped;
-        else
-                run_end = ctx->time;
-        event->total_time_running = run_end - event->tstamp_running;
 }
 /*
@@ -517,7 +532,7 @@ static void __perf_event_disable(void *info)
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;
-        spin_lock(&ctx->lock);
+        raw_spin_lock(&ctx->lock);
        /*
         * If the event is on, turn it off.
@@ -533,7 +548,7 @@ static void __perf_event_disable(void *info)
                event->state = PERF_EVENT_STATE_OFF;
        }
-        spin_unlock(&ctx->lock);
+        raw_spin_unlock(&ctx->lock);
 }
 /*
@@ -549,7 +564,7 @@ static void __perf_event_disable(void *info)
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
-static void perf_event_disable(struct perf_event *event)
+void perf_event_disable(struct perf_event *event)
 {
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;
@@ -566,12 +581,12 @@ static void perf_event_disable(struct perf_event *event)
 retry:
        task_oncpu_function_call(task, __perf_event_disable, event);
-        spin_lock_irq(&ctx->lock);
+        raw_spin_lock_irq(&ctx->lock);
        /*
         * If the event is still active, we need to retry the cross-call.
         */
        if (event->state == PERF_EVENT_STATE_ACTIVE) {
-                spin_unlock_irq(&ctx->lock);
+                raw_spin_unlock_irq(&ctx->lock);
                goto retry;
        }
@@ -584,20 +599,19 @@ static void perf_event_disable(struct perf_event *event)
                event->state = PERF_EVENT_STATE_OFF;
        }
-        spin_unlock_irq(&ctx->lock);
+        raw_spin_unlock_irq(&ctx->lock);
 }
 static int
 event_sched_in(struct perf_event *event,
                 struct perf_cpu_context *cpuctx,
-                 struct perf_event_context *ctx,
+                 struct perf_event_context *ctx)
-                 int cpu)
 {
        if (event->state <= PERF_EVENT_STATE_OFF)
                return 0;
        event->state = PERF_EVENT_STATE_ACTIVE;
-        event->oncpu = cpu;     /* TODO: put 'cpu' into cpuctx->cpu */
+        event->oncpu = smp_processor_id();
        /*
         * The new state must be visible before we turn it on in the hardware:
         */
@@ -624,8 +638,7 @@ event_sched_in(struct perf_event *event,
 static int
 group_sched_in(struct perf_event *group_event,
               struct perf_cpu_context *cpuctx,
-               struct perf_event_context *ctx,
+               struct perf_event_context *ctx)
-               int cpu)
 {
        struct perf_event *event, *partial_group;
        int ret;
@@ -633,18 +646,18 @@ group_sched_in(struct perf_event *group_event,
        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;
-        ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
+        ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
        if (ret)
                return ret < 0 ? ret : 0;
-        if (event_sched_in(group_event, cpuctx, ctx, cpu))
+        if (event_sched_in(group_event, cpuctx, ctx))
                return -EAGAIN;
        /*
         * Schedule in siblings as one group (if any):
         */
        list_for_each_entry(event, &group_event->sibling_list, group_entry) {
-                if (event_sched_in(event, cpuctx, ctx, cpu)) {
+                if (event_sched_in(event, cpuctx, ctx)) {
                        partial_group = event;
                        goto group_error;
                }
@@ -668,24 +681,6 @@ group_error:
 }
 /*
- * Return 1 for a group consisting entirely of software events,
- * 0 if the group contains any hardware events.
- */
-static int is_software_only_group(struct perf_event *leader)
-{
-        struct perf_event *event;
-        if (!is_software_event(leader))
-                return 0;
-        list_for_each_entry(event, &leader->sibling_list, group_entry)
-                if (!is_software_event(event))
-                        return 0;
-        return 1;
-}
-/*
 * Work out whether we can put this event group on the CPU now.
 */
 static int group_can_go_on(struct perf_event *event,
@@ -695,7 +690,7 @@ static int group_can_go_on(struct perf_event *event,
        /*
         * Groups consisting entirely of software events can always go on.
         */
-        if (is_software_only_group(event))
+        if (event->group_flags & PERF_GROUP_SOFTWARE)
                return 1;
        /*
         * If an exclusive group is already on, no other hardware
@@ -736,7 +731,6 @@ static void __perf_install_in_context(void *info)
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *leader = event->group_leader;
-        int cpu = smp_processor_id();
        int err;
        /*
@@ -752,7 +746,7 @@ static void __perf_install_in_context(void *info)
                cpuctx->task_ctx = ctx;
        }
-        spin_lock(&ctx->lock);
+        raw_spin_lock(&ctx->lock);
        ctx->is_active = 1;
        update_context_time(ctx);
@@ -764,6 +758,9 @@ static void __perf_install_in_context(void *info)
        add_event_to_ctx(event, ctx);
+        if (event->cpu != -1 && event->cpu != smp_processor_id())
+                goto unlock;
        /*
         * Don't put the event on if it is disabled or if
         * it is in a group and the group isn't on.
@@ -780,7 +777,7 @@ static void __perf_install_in_context(void *info)
        if (!group_can_go_on(event, cpuctx, 1))
                err = -EEXIST;
        else
-                err = event_sched_in(event, cpuctx, ctx, cpu);
+                err = event_sched_in(event, cpuctx, ctx);
        if (err) {
                /*
@@ -802,7 +799,7 @@ static void __perf_install_in_context(void *info)
 unlock:
        perf_enable();
-        spin_unlock(&ctx->lock);
+        raw_spin_unlock(&ctx->lock);
 }
 /*
@@ -827,7 +824,7 @@ perf_install_in_context(struct perf_event_context *ctx,
        if (!task) {
                /*
                 * Per cpu events are installed via an smp call and
-                 * the install is always sucessful.
+                 * the install is always successful.
                 */
                smp_call_function_single(cpu, __perf_install_in_context,
                                         event, 1);
@@ -838,12 +835,12 @@ retry:
        task_oncpu_function_call(task, __perf_install_in_context,
                                 event);
-        spin_lock_irq(&ctx->lock);
+        raw_spin_lock_irq(&ctx->lock);
        /*
         * we need to retry the smp call.
         */
        if (ctx->is_active && list_empty(&event->group_entry)) {
-                spin_unlock_irq(&ctx->lock);
+                raw_spin_unlock_irq(&ctx->lock);
                goto retry;
        }
@@ -854,7 +851,7 @@ retry:
         */
        if (list_empty(&event->group_entry))
                add_event_to_ctx(event, ctx);
-        spin_unlock_irq(&ctx->lock);
+        raw_spin_unlock_irq(&ctx->lock);
 }
 /*
@@ -899,7 +896,7 @@ static void __perf_event_enable(void *info)
                cpuctx->task_ctx = ctx;
        }
-        spin_lock(&ctx->lock);
+        raw_spin_lock(&ctx->lock);
        ctx->is_active = 1;
        update_context_time(ctx);
@@ -907,6 +904,9 @@ static void __perf_event_enable(void *info)
                goto unlock;
        __perf_event_mark_enabled(event, ctx);
+        if (event->cpu != -1 && event->cpu != smp_processor_id())
+                goto unlock;
        /*
         * If the event is in a group and isn't the group leader,
         * then don't put it on unless the group is on.
@@ -919,11 +919,9 @@ static void __perf_event_enable(void *info)
        } else {
                perf_disable();
                if (event == leader)
-                        err = group_sched_in(event, cpuctx, ctx,
+                        err = group_sched_in(event, cpuctx, ctx);
-                                             smp_processor_id());
                else
-                        err = event_sched_in(event, cpuctx, ctx,
+                        err = event_sched_in(event, cpuctx, ctx);
-                                               smp_processor_id());
                perf_enable();
        }
@@ -941,7 +939,7 @@ static void __perf_event_enable(void *info)
        }
 unlock:
-        spin_unlock(&ctx->lock);
+        raw_spin_unlock(&ctx->lock);
 }
 /*
@@ -953,7 +951,7 @@ static void __perf_event_enable(void *info)
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
 */
-static void perf_event_enable(struct perf_event *event)
+void perf_event_enable(struct perf_event *event)
 {
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;
@@ -967,7 +965,7 @@ static void perf_event_enable(struct perf_event *event)
                return;
        }
-        spin_lock_irq(&ctx->lock);
+        raw_spin_lock_irq(&ctx->lock);
        if (event->state >= PERF_EVENT_STATE_INACTIVE)
                goto out;
@@ -982,10 +980,10 @@ static void perf_event_enable(struct perf_event *event)
                event->state = PERF_EVENT_STATE_OFF;
 retry:
-        spin_unlock_irq(&ctx->lock);
+        raw_spin_unlock_irq(&ctx->lock);
        task_oncpu_function_call(task, __perf_event_enable, event);
-        spin_lock_irq(&ctx->lock);
+        raw_spin_lock_irq(&ctx->lock);
        /*
         * If the context is active and the event is still off,
@@ -1002,7 +1000,7 @@ static void perf_event_enable(struct perf_event *event)
                __perf_event_mark_enabled(event, ctx);
 out:
-        spin_unlock_irq(&ctx->lock);
+        raw_spin_unlock_irq(&ctx->lock);
 }
 static int perf_event_refresh(struct perf_event *event, int refresh)
@@ -1019,25 +1017,40 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
        return 0;
 }
+/*
+ * Bitmask selecting which group list(s) of a context an operation applies
+ * to: the flexible groups, the pinned groups, or both (EVENT_ALL).
+ */
-void __perf_event_sched_out(struct perf_event_context *ctx,
+enum event_type_t {
-                              struct perf_cpu_context *cpuctx)
+        EVENT_FLEXIBLE = 0x1,
+        EVENT_PINNED = 0x2,
+        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+};
+/*
+ * Mark @ctx inactive and schedule out the groups selected by @event_type.
+ * If the context has no events nothing else is done; if it has events but
+ * none are active, only the context time is updated.
+ */
+static void ctx_sched_out(struct perf_event_context *ctx,
+                          struct perf_cpu_context *cpuctx,
+                          enum event_type_t event_type)
 {
        struct perf_event *event;
-        spin_lock(&ctx->lock);
+        raw_spin_lock(&ctx->lock);
        ctx->is_active = 0;
        if (likely(!ctx->nr_events))
                goto out;
        update_context_time(ctx);
        perf_disable();
-        if (ctx->nr_active)
+        if (!ctx->nr_active)
-                list_for_each_entry(event, &ctx->group_list, group_entry)
+                goto out_enable;
+        /* Walk only the list(s) requested by the caller */
+        if (event_type & EVENT_PINNED)
+                list_for_each_entry(event, &ctx->pinned_groups, group_entry)
                        group_sched_out(event, cpuctx, ctx);
+        if (event_type & EVENT_FLEXIBLE)
+                list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+                        group_sched_out(event, cpuctx, ctx);
+ out_enable:
        perf_enable();
 out:
-        spin_unlock(&ctx->lock);
+        raw_spin_unlock(&ctx->lock);
 }
 /*
@@ -1059,8 +1072,6 @@ static int context_equiv(struct perf_event_context *ctx1,
                && !ctx1->pin_count && !ctx2->pin_count;
 }
-static void __perf_event_read(void *event);
 static void __perf_event_sync_stat(struct perf_event *event,
                                     struct perf_event *next_event)
 {
@@ -1078,8 +1089,8 @@ static void __perf_event_sync_stat(struct perf_event *event,
         */
        switch (event->state) {
        case PERF_EVENT_STATE_ACTIVE:
-                __perf_event_read(event);
+                event->pmu->read(event);
-                break;
+                /* fall-through */
        case PERF_EVENT_STATE_INACTIVE:
                update_event_times(event);
@@ -1118,6 +1129,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
        if (!ctx->nr_stat)
                return;
+        update_context_time(ctx);
        event = list_first_entry(&ctx->event_list,
                                   struct perf_event, event_entry);
@@ -1146,23 +1159,19 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
 * not restart the event.
 */
 void perf_event_task_sched_out(struct task_struct *task,
-                                 struct task_struct *next, int cpu)
+                                 struct task_struct *next)
 {
-        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_event_context *ctx = task->perf_event_ctxp;
        struct perf_event_context *next_ctx;
        struct perf_event_context *parent;
-        struct pt_regs *regs;
        int do_switch = 1;
-        regs = task_pt_regs(task);
+        perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
-        perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
        if (likely(!ctx || !cpuctx->task_ctx))
                return;
-        update_context_time(ctx);
        rcu_read_lock();
        parent = rcu_dereference(ctx->parent_ctx);
        next_ctx = next->perf_event_ctxp;
@@ -1177,8 +1186,8 @@ void perf_event_task_sched_out(struct task_struct *task,
                 * order we take the locks because no other cpu could
                 * be trying to lock both of these tasks.
                 */
-                spin_lock(&ctx->lock);
+                raw_spin_lock(&ctx->lock);
-                spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+                raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
                if (context_equiv(ctx, next_ctx)) {
                        /*
                         * XXX do we need a memory barrier of sorts
@@ -1192,21 +1201,19 @@ void perf_event_task_sched_out(struct task_struct *task,
                        perf_event_sync_stat(ctx, next_ctx);
                }
-                spin_unlock(&next_ctx->lock);
+                raw_spin_unlock(&next_ctx->lock);
-                spin_unlock(&ctx->lock);
+                raw_spin_unlock(&ctx->lock);
        }
        rcu_read_unlock();
        if (do_switch) {
-                __perf_event_sched_out(ctx, cpuctx);
+                ctx_sched_out(ctx, cpuctx, EVENT_ALL);
                cpuctx->task_ctx = NULL;
        }
 }
-/*
+static void task_ctx_sched_out(struct perf_event_context *ctx,
- * Called with IRQs disabled
+                               enum event_type_t event_type)
- */
-static void __perf_event_task_sched_out(struct perf_event_context *ctx)
 {
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
@@ -1216,47 +1223,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
        if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
                return;
-        __perf_event_sched_out(ctx, cpuctx);
+        ctx_sched_out(ctx, cpuctx, event_type);
        cpuctx->task_ctx = NULL;
 }
 /*
 * Called with IRQs disabled
 */
-static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
+static void __perf_event_task_sched_out(struct perf_event_context *ctx)
+{
+        /* Schedule out both the pinned and the flexible groups of @ctx */
+        task_ctx_sched_out(ctx, EVENT_ALL);
+}
+/*
+ * Schedule out the groups selected by @event_type from the CPU's own
+ * context (cpuctx->ctx).
+ *
+ * Called with IRQs disabled
+ */
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+                              enum event_type_t event_type)
 {
-        __perf_event_sched_out(&cpuctx->ctx, cpuctx);
+        ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
 }
 static void
-__perf_event_sched_in(struct perf_event_context *ctx,
+ctx_pinned_sched_in(struct perf_event_context *ctx,
-                        struct perf_cpu_context *cpuctx, int cpu)
+                    struct perf_cpu_context *cpuctx)
 {
        struct perf_event *event;
-        int can_add_hw = 1;
-        spin_lock(&ctx->lock);
-        ctx->is_active = 1;
-        if (likely(!ctx->nr_events))
-                goto out;
-        ctx->timestamp = perf_clock();
+        list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+                if (event->state <= PERF_EVENT_STATE_OFF)
-        perf_disable();
-        /*
-         * First go through the list and put on any pinned groups
-         * in order to give them the best chance of going on.
-         */
-        list_for_each_entry(event, &ctx->group_list, group_entry) {
-                if (event->state <= PERF_EVENT_STATE_OFF ||
-                    !event->attr.pinned)
                        continue;
-                if (event->cpu != -1 && event->cpu != cpu)
+                if (event->cpu != -1 && event->cpu != smp_processor_id())
                        continue;
                if (group_can_go_on(event, cpuctx, 1))
-                        group_sched_in(event, cpuctx, ctx, cpu);
+                        group_sched_in(event, cpuctx, ctx);
                /*
                 * If this pinned group hasn't been scheduled,
@@ -1267,32 +1268,83 @@ __perf_event_sched_in(struct perf_event_context *ctx,
                        event->state = PERF_EVENT_STATE_ERROR;
                }
        }
+}
+/*
+ * Schedule in the flexible groups of @ctx that are enabled and allowed on
+ * the current CPU. Once a group fails to go on, can_add_hw is cleared and
+ * that value is passed to group_can_go_on() for all remaining groups.
+ */
-        list_for_each_entry(event, &ctx->group_list, group_entry) {
+static void
-                /*
+ctx_flexible_sched_in(struct perf_event_context *ctx,
-                 * Ignore events in OFF or ERROR state, and
+                      struct perf_cpu_context *cpuctx)
-                 * ignore pinned events since we did them already.
+{
-                 */
+        struct perf_event *event;
-                if (event->state <= PERF_EVENT_STATE_OFF ||
+        int can_add_hw = 1;
-                    event->attr.pinned)
-                        continue;
+        list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+                /* Ignore events in OFF or ERROR state */
+                if (event->state <= PERF_EVENT_STATE_OFF)
+                        continue;
                /*
                 * Listen to the 'cpu' scheduling filter constraint
                 * of events:
                 */
-                if (event->cpu != -1 && event->cpu != cpu)
+                if (event->cpu != -1 && event->cpu != smp_processor_id())
                        continue;
                if (group_can_go_on(event, cpuctx, can_add_hw))
-                        if (group_sched_in(event, cpuctx, ctx, cpu))
+                        if (group_sched_in(event, cpuctx, ctx))
                                can_add_hw = 0;
        }
+}
+/*
+ * Mark @ctx active and, unless it has no events, restart its timestamp and
+ * schedule in the groups selected by @event_type. The whole operation runs
+ * under ctx->lock.
+ */
+static void
+ctx_sched_in(struct perf_event_context *ctx,
+             struct perf_cpu_context *cpuctx,
+             enum event_type_t event_type)
+{
+        raw_spin_lock(&ctx->lock);
+        ctx->is_active = 1;
+        if (likely(!ctx->nr_events))
+                goto out;
+        ctx->timestamp = perf_clock();
+        perf_disable();
+        /*
+         * First go through the list and put on any pinned groups
+         * in order to give them the best chance of going on.
+         */
+        if (event_type & EVENT_PINNED)
+                ctx_pinned_sched_in(ctx, cpuctx);
+        /* Then walk through the lower prio flexible groups */
+        if (event_type & EVENT_FLEXIBLE)
+                ctx_flexible_sched_in(ctx, cpuctx);
        perf_enable();
 out:
-        spin_unlock(&ctx->lock);
+        raw_spin_unlock(&ctx->lock);
 }
+/*
+ * Schedule in the groups selected by @event_type from the CPU's own
+ * context (cpuctx->ctx).
+ */
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+                             enum event_type_t event_type)
+{
+        ctx_sched_in(&cpuctx->ctx, cpuctx, event_type);
+}
+/*
+ * Schedule in the groups selected by @event_type from @task's context on
+ * the current CPU and record that context as the CPU's task context.
+ * Does nothing when the task has no context or when its context is
+ * already the CPU's task context.
+ */
+static void task_ctx_sched_in(struct task_struct *task,
+                              enum event_type_t event_type)
+{
+        struct perf_event_context *task_ctx = task->perf_event_ctxp;
+        struct perf_cpu_context *this_cpuctx;
+        if (likely(!task_ctx))
+                return;
+        this_cpuctx = &__get_cpu_var(perf_cpu_context);
+        if (this_cpuctx->task_ctx != task_ctx) {
+                ctx_sched_in(task_ctx, this_cpuctx, event_type);
+                this_cpuctx->task_ctx = task_ctx;
+        }
+}
 /*
 * Called from scheduler to add the events of the current task
 * with interrupts disabled.
@@ -1304,38 +1356,128 @@ __perf_event_sched_in(struct perf_event_context *ctx,
 * accessing the event control register. If a NMI hits, then it will
 * keep the event running.
 */
-void perf_event_task_sched_in(struct task_struct *task, int cpu)
+void perf_event_task_sched_in(struct task_struct *task)
 {
-        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_event_context *ctx = task->perf_event_ctxp;
+        /* The task has no perf event context: nothing to schedule in */
        if (likely(!ctx))
                return;
+        /* The task's context is already this CPU's task context */
        if (cpuctx->task_ctx == ctx)
                return;
-        __perf_event_sched_in(ctx, cpuctx, cpu);
+        /*
+         * We want to keep the following priority order:
+         * cpu pinned (that don't need to move), task pinned,
+         * cpu flexible, task flexible.
+         */
+        /* Take the cpu flexible groups off so task pinned groups go first */
+        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+        ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
+        cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+        ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
        cpuctx->task_ctx = ctx;
 }
-static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+#define MAX_INTERRUPTS (~0ULL)
+static void perf_log_throttle(struct perf_event *event, int enable);
+/*
+ * Compute the sample period that would reach event->attr.sample_freq,
+ * given that @count events were observed in @nsec nanoseconds:
+ *
+ *   period = (@count * 10^9) / (@nsec * sample_freq)
+ *
+ * Only u64 arithmetic is used; when a product would not fit in 64 bits,
+ * accuracy is traded for range by shifting the operands right.
+ *
+ * Returns the computed period. If the divisor ends up as zero, the
+ * (reduced) dividend is returned instead of dividing by zero.
+ */
+static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
 {
-        struct perf_event_context *ctx = &cpuctx->ctx;
+        u64 frequency = event->attr.sample_freq;
+        u64 sec = NSEC_PER_SEC;
+        u64 divisor, dividend;
+        int count_fls, nsec_fls, frequency_fls, sec_fls;
+        count_fls = fls64(count);
+        nsec_fls = fls64(nsec);
+        frequency_fls = fls64(frequency);
+        /* Bit width of @sec (10^9 needs 30 bits), hard-coded */
+        sec_fls = 30;
+        /*
+         * We got @count in @nsec, with a target of sample_freq HZ
+         * the target period becomes:
+         *
+         *             @count * 10^9
+         * period = -------------------
+         *          @nsec * sample_freq
+         *
+         */
+        /*
+         * Reduce accuracy by one bit such that @a and @b converge
+         * to a similar magnitude.
+         */
+#define REDUCE_FLS(a, b)                \
+do {                                    \
+        if (a##_fls > b##_fls) {        \
+                a >>= 1;                \
+                a##_fls--;              \
+        } else {                        \
+                b >>= 1;                \
+                b##_fls--;              \
+        }                               \
+} while (0)
+        /*
+         * Reduce accuracy until either term fits in a u64, then proceed with
+         * the other, so that finally we can do a u64/u64 division.
+         */
+        while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
+                REDUCE_FLS(nsec, frequency);
+                REDUCE_FLS(sec, count);
+        }
-        __perf_event_sched_in(ctx, cpuctx, cpu);
+        if (count_fls + sec_fls > 64) {
+                divisor = nsec * frequency;
+                while (count_fls + sec_fls > 64) {
+                        REDUCE_FLS(count, sec);
+                        divisor >>= 1;
+                }
+                dividend = count * sec;
+        } else {
+                dividend = count * sec;
+                while (nsec_fls + frequency_fls > 64) {
+                        REDUCE_FLS(nsec, frequency);
+                        dividend >>= 1;
+                }
+                divisor = nsec * frequency;
+        }
+        /*
+         * The divisor can be zero here: either @nsec or the frequency was
+         * zero to begin with, or the accuracy reduction above shifted a
+         * small divisor all the way down. Never divide by zero.
+         */
+        if (!divisor)
+                return dividend;
+        return div64_u64(dividend, divisor);
 }
-#define MAX_INTERRUPTS (~0ULL)
+/*
+ * Stop @event through the PMU's ->stop() callback, falling back to
+ * ->disable() when the PMU does not provide ->stop().
+ */
+static void perf_event_stop(struct perf_event *event)
+{
+        if (event->pmu->stop) {
+                event->pmu->stop(event);
+                return;
+        }
-static void perf_log_throttle(struct perf_event *event, int enable);
+        event->pmu->disable(event);
+}
+/*
+ * Start @event through the PMU's ->start() callback, falling back to
+ * ->enable() when the PMU does not provide ->start(). Returns whatever
+ * the invoked callback returns.
+ */
+static int perf_event_start(struct perf_event *event)
+{
+        if (event->pmu->start)
+                return event->pmu->start(event);
+        return event->pmu->enable(event);
+}
-static void perf_adjust_period(struct perf_event *event, u64 events)
+static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 {
        struct hw_perf_event *hwc = &event->hw;
        u64 period, sample_period;
        s64 delta;
-        events *= hwc->sample_period;
+        period = perf_calculate_period(event, nsec, count);
-        period = div64_u64(events, event->attr.sample_freq);
        delta = (s64)(period - hwc->sample_period);
        delta = (delta + 7) / 8; /* low pass filter */
@@ -1346,19 +1488,31 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
                sample_period = 1;
        hwc->sample_period = sample_period;
+        if (atomic64_read(&hwc->period_left) > 8*sample_period) {
+                perf_disable();
+                perf_event_stop(event);
+                atomic64_set(&hwc->period_left, 0);
+                perf_event_start(event);
+                perf_enable();
+        }
 }
 static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 {
        struct perf_event *event;
        struct hw_perf_event *hwc;
-        u64 interrupts, freq;
+        u64 interrupts, now;
+        s64 delta;
-        spin_lock(&ctx->lock);
+        raw_spin_lock(&ctx->lock);
        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (event->state != PERF_EVENT_STATE_ACTIVE)
                        continue;
+                if (event->cpu != -1 && event->cpu != smp_processor_id())
+                        continue;
                hwc = &event->hw;
                interrupts = hwc->interrupts;
@@ -1369,47 +1523,25 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
                 */
                if (interrupts == MAX_INTERRUPTS) {
                        perf_log_throttle(event, 1);
+                        perf_disable();
                        event->pmu->unthrottle(event);
-                        interrupts = 2*sysctl_perf_event_sample_rate/HZ;
+                        perf_enable();
                }
                if (!event->attr.freq || !event->attr.sample_freq)
                        continue;
-                /*
+                perf_disable();
-                 * if the specified freq < HZ then we need to skip ticks
+                event->pmu->read(event);
-                 */
+                now = atomic64_read(&event->count);
-                if (event->attr.sample_freq < HZ) {
+                delta = now - hwc->freq_count_stamp;
-                        freq = event->attr.sample_freq;
+                hwc->freq_count_stamp = now;
-                        hwc->freq_count += freq;
-                        hwc->freq_interrupts += interrupts;
-                        if (hwc->freq_count < HZ)
-                                continue;
-                        interrupts = hwc->freq_interrupts;
-                        hwc->freq_interrupts = 0;
-                        hwc->freq_count -= HZ;
-                } else
-                        freq = HZ;
-                perf_adjust_period(event, freq * interrupts);
-                /*
+                if (delta > 0)
-                 * In order to avoid being stalled by an (accidental) huge
+                        perf_adjust_period(event, TICK_NSEC, delta);
-                 * sample period, force reset the sample period if we didn't
+                perf_enable();
-                 * get any events in this freq period.
-                 */
-                if (!interrupts) {
-                        perf_disable();
-                        event->pmu->disable(event);
-                        atomic64_set(&hwc->period_left, 0);
-                        event->pmu->enable(event);
-                        perf_enable();
-                }
        }
-        spin_unlock(&ctx->lock);
+        raw_spin_unlock(&ctx->lock);
 }
 /*
@@ -1417,51 +1549,67 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 */
 static void rotate_ctx(struct perf_event_context *ctx)
 {
-        struct perf_event *event;
+        /*
+         * The list is only modified while holding ctx->lock. Unlike the
+         * removed code, there is no nr_events check and no
+         * perf_disable()/perf_enable() pair in this function.
+         */
+        raw_spin_lock(&ctx->lock);
-        if (!ctx->nr_events)
+        /* Move the first flexible (non-pinned) group to the list tail */
-                return;
+        list_rotate_left(&ctx->flexible_groups);
-        spin_lock(&ctx->lock);
-        /*
-         * Rotate the first entry last (works just fine for group events too):
-         */
-        perf_disable();
-        list_for_each_entry(event, &ctx->group_list, group_entry) {
-                list_move_tail(&event->group_entry, &ctx->group_list);
-                break;
-        }
-        perf_enable();
-        spin_unlock(&ctx->lock);
+        raw_spin_unlock(&ctx->lock);
 }
-void perf_event_task_tick(struct task_struct *curr, int cpu)
+/*
+ * Run perf_ctx_adjust_freq() on this CPU's context and on @curr's task
+ * context (if it has one), then rotate the flexible groups of both
+ * contexts when at least one of them has events that are not all active.
+ *
+ * Returns early when the global nr_events counter is zero, and skips the
+ * sched_out/rotate/sched_in sequence when no rotation is needed.
+ */
+void perf_event_task_tick(struct task_struct *curr)
 {
         struct perf_cpu_context *cpuctx;
         struct perf_event_context *ctx;
+        int rotate = 0;
         if (!atomic_read(&nr_events))
                 return;
-        cpuctx = &per_cpu(perf_cpu_context, cpu);
+        cpuctx = &__get_cpu_var(perf_cpu_context);
+        /* CPU context: some of its events are not currently active */
+        if (cpuctx->ctx.nr_events &&
+            cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
+                rotate = 1;
         ctx = curr->perf_event_ctxp;
+        /* Same test for the task context, which may be NULL */
+        if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
+                rotate = 1;
         perf_ctx_adjust_freq(&cpuctx->ctx);
         if (ctx)
                 perf_ctx_adjust_freq(ctx);
-        perf_event_cpu_sched_out(cpuctx);
+        if (!rotate)
+                return;
+        /*
+         * Only EVENT_FLEXIBLE is scheduled out and back in around the
+         * rotation; the whole sequence runs between perf_disable() and
+         * perf_enable().
+         */
+        perf_disable();
+        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
         if (ctx)
-                __perf_event_task_sched_out(ctx);
+                task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
         rotate_ctx(&cpuctx->ctx);
         if (ctx)
                 rotate_ctx(ctx);
-        perf_event_cpu_sched_in(cpuctx, cpu);
+        cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
         if (ctx)
-                perf_event_task_sched_in(curr, cpu);
+                task_ctx_sched_in(curr, EVENT_FLEXIBLE);
+        perf_enable();
+}
+/*
+ * Handle the enable_on_exec attribute of a single @event in @ctx.
+ *
+ * The attribute is cleared whenever it was set, so it acts only once.
+ * The event is marked enabled via __perf_event_mark_enabled() only if
+ * its state is below PERF_EVENT_STATE_INACTIVE.
+ *
+ * Returns 1 if the event was marked enabled, 0 otherwise.
+ */
+static int event_enable_on_exec(struct perf_event *event,
+                                struct perf_event_context *ctx)
+{
+        if (!event->attr.enable_on_exec)
+                return 0;
+        event->attr.enable_on_exec = 0;
+        /* State is already INACTIVE or above: nothing to mark */
+        if (event->state >= PERF_EVENT_STATE_INACTIVE)
+                return 0;
+        __perf_event_mark_enabled(event, ctx);
+        return 1;
 }
 /*
@@ -1474,6 +1622,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
        struct perf_event *event;
        unsigned long flags;
        int enabled = 0;
+        int ret;
        local_irq_save(flags);
        ctx = task->perf_event_ctxp;
@@ -1482,16 +1631,18 @@ static void perf_event_enable_on_exec(struct task_struct *task)
        __perf_event_task_sched_out(ctx);
-        spin_lock(&ctx->lock);
+        raw_spin_lock(&ctx->lock);
-        list_for_each_entry(event, &ctx->group_list, group_entry) {
+        list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
-                if (!event->attr.enable_on_exec)
+                ret = event_enable_on_exec(event, ctx);
-                        continue;
+                if (ret)
-                event->attr.enable_on_exec = 0;
+                        enabled = 1;
-                if (event->state >= PERF_EVENT_STATE_INACTIVE)
+        }
-                        continue;
-                __perf_event_mark_enabled(event, ctx);
+        list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
-                enabled = 1;
+                ret = event_enable_on_exec(event, ctx);
+                if (ret)
+                        enabled = 1;
        }
        /*
@@ -1500,9 +1651,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
        if (enabled)
                unclone_ctx(ctx);
-        spin_unlock(&ctx->lock);
+        raw_spin_unlock(&ctx->lock);
-        perf_event_task_sched_in(task, smp_processor_id());
+        perf_event_task_sched_in(task);
 out:
        local_irq_restore(flags);
 }
@@ -1515,7 +1666,6 @@ static void __perf_event_read(void *info)
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
-        unsigned long flags;
        /*
         * If this is a task context, we need to check whether it is
@@ -1527,12 +1677,12 @@ static void __perf_event_read(void *info)
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;
-        local_irq_save(flags);
+        raw_spin_lock(&ctx->lock);
-        if (ctx->is_active)
+        update_context_time(ctx);
-                update_context_time(ctx);
-        event->pmu->read(event);
        update_event_times(event);
-        local_irq_restore(flags);
+        raw_spin_unlock(&ctx->lock);
+        event->pmu->read(event);
 }
 static u64 perf_event_read(struct perf_event *event)
@@ -1545,7 +1695,13 @@ static u64 perf_event_read(struct perf_event *event)
                smp_call_function_single(event->oncpu,
                                         __perf_event_read, event, 1);
        } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+                struct perf_event_context *ctx = event->ctx;
+                unsigned long flags;
+                raw_spin_lock_irqsave(&ctx->lock, flags);
+                update_context_time(ctx);
                update_event_times(event);
+                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
        return atomic64_read(&event->count);
@@ -1558,10 +1714,10 @@ static void
 __perf_event_init_context(struct perf_event_context *ctx,
                            struct task_struct *task)
 {
-        memset(ctx, 0, sizeof(*ctx));
+        raw_spin_lock_init(&ctx->lock);
-        spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);
-        INIT_LIST_HEAD(&ctx->group_list);
+        INIT_LIST_HEAD(&ctx->pinned_groups);
+        INIT_LIST_HEAD(&ctx->flexible_groups);
        INIT_LIST_HEAD(&ctx->event_list);
        atomic_set(&ctx->refcount, 1);
        ctx->task = task;
@@ -1575,15 +1731,12 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
        unsigned long flags;
        int err;
-        /*
+        if (pid == -1 && cpu != -1) {
-         * If cpu is not a wildcard then this is a percpu event:
-         */
-        if (cpu != -1) {
                /* Must be root to operate on a CPU event: */
                if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
                        return ERR_PTR(-EACCES);
-                if (cpu < 0 || cpu > num_possible_cpus())
+                if (cpu < 0 || cpu >= nr_cpumask_bits)
                        return ERR_PTR(-EINVAL);
                /*
@@ -1591,7 +1744,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
                 * offline CPU and activate it when the CPU comes up, but
                 * that's for later.
                 */
-                if (!cpu_isset(cpu, cpu_online_map))
+                if (!cpu_online(cpu))
                        return ERR_PTR(-ENODEV);
                cpuctx = &per_cpu(perf_cpu_context, cpu);
@@ -1629,11 +1782,11 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
        ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
                unclone_ctx(ctx);
-                spin_unlock_irqrestore(&ctx->lock, flags);
+                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
        if (!ctx) {
-                ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+                ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
                err = -ENOMEM;
                if (!ctx)
                        goto errout;
@@ -1658,6 +1811,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
        return ERR_PTR(err);
 }
+static void perf_event_free_filter(struct perf_event *event);
 static void free_event_rcu(struct rcu_head *head)
 {
        struct perf_event *event;
@@ -1665,6 +1820,7 @@ static void free_event_rcu(struct rcu_head *head)
        event = container_of(head, struct perf_event, rcu_head);
        if (event->ns)
                put_pid_ns(event->ns);
+        perf_event_free_filter(event);
        kfree(event);
 }
@@ -1696,16 +1852,10 @@ static void free_event(struct perf_event *event)
        call_rcu(&event->rcu_head, free_event_rcu);
 }
-/*
+int perf_event_release_kernel(struct perf_event *event)
- * Called when the last reference to the file is gone.
- */
-static int perf_release(struct inode *inode, struct file *file)
 {
-        struct perf_event *event = file->private_data;
        struct perf_event_context *ctx = event->ctx;
-        file->private_data = NULL;
        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        perf_event_remove_from_context(event);
@@ -1720,6 +1870,19 @@ static int perf_release(struct inode *inode, struct file *file)
        return 0;
 }
+EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+/*
+ * Called when the last reference to the file is gone.
+ *
+ * Detaches the event from file->private_data and passes it to
+ * perf_event_release_kernel(), whose return value is returned.
+ */
+static int perf_release(struct inode *inode, struct file *file)
+{
+        struct perf_event *event = file->private_data;
+        file->private_data = NULL;
+        return perf_event_release_kernel(event);
+}
 static int perf_event_read_size(struct perf_event *event)
 {
@@ -1746,91 +1909,94 @@ static int perf_event_read_size(struct perf_event *event)
        return size;
 }
-static u64 perf_event_read_value(struct perf_event *event)
+/*
+ * Sum the count of @event and of every event on its child_list.
+ *
+ * @enabled and @running are dereferenced unconditionally, so they must be
+ * valid pointers. Both are zeroed first and then receive the accumulated
+ * total_time_enabled / total_time_running of @event (including its
+ * child_total_time_* accumulators) and of each child on the list.
+ *
+ * event->child_mutex is held while the values are gathered.
+ *
+ * Returns the summed count.
+ */
+u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 {
         struct perf_event *child;
         u64 total = 0;
+        *enabled = 0;
+        *running = 0;
+        mutex_lock(&event->child_mutex);
         total += perf_event_read(event);
-        list_for_each_entry(child, &event->child_list, child_list)
+        *enabled += event->total_time_enabled +
+                        atomic64_read(&event->child_total_time_enabled);
+        *running += event->total_time_running +
+                        atomic64_read(&event->child_total_time_running);
+        list_for_each_entry(child, &event->child_list, child_list) {
                 total += perf_event_read(child);
+                *enabled += child->total_time_enabled;
+                *running += child->total_time_running;
+        }
+        mutex_unlock(&event->child_mutex);
         return total;
 }
+EXPORT_SYMBOL_GPL(perf_event_read_value);
-static int perf_event_read_entry(struct perf_event *event,
-                                   u64 read_format, char __user *buf)
-{
-        int n = 0, count = 0;
-        u64 values[2];
-        values[n++] = perf_event_read_value(event);
-        if (read_format & PERF_FORMAT_ID)
-                values[n++] = primary_event_id(event);
-        count = n * sizeof(u64);
-        if (copy_to_user(buf, values, count))
-                return -EFAULT;
-        return count;
-}
+/*
+ * Copy the values of @event's whole group to the user buffer @buf.
+ *
+ * Layout written: the number of events in the group (leader plus
+ * siblings), then optionally the leader's enabled and running times, the
+ * leader's count and optionally its id, followed by one { count, [id] }
+ * entry per sibling. Optional fields are selected by @read_format.
+ *
+ * The leader's ctx->mutex is held for the whole read. Returns the number
+ * of bytes copied, or -EFAULT if any copy_to_user() fails.
+ */
 static int perf_event_read_group(struct perf_event *event,
                                    u64 read_format, char __user *buf)
 {
         struct perf_event *leader = event->group_leader, *sub;
-        int n = 0, size = 0, err = -EFAULT;
+        int n = 0, size = 0, ret = -EFAULT;
-        u64 values[3];
+        struct perf_event_context *ctx = leader->ctx;
+        /* Leader header holds at most: nr, enabled, running, count, id */
+        u64 values[5];
+        u64 count, enabled, running;
+        mutex_lock(&ctx->mutex);
+        count = perf_event_read_value(leader, &enabled, &running);
         values[n++] = 1 + leader->nr_siblings;
-        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-                values[n++] = leader->total_time_enabled +
+                values[n++] = enabled;
-                        atomic64_read(&leader->child_total_time_enabled);
+        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-        }
+                values[n++] = running;
-        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+        values[n++] = count;
-                values[n++] = leader->total_time_running +
+        if (read_format & PERF_FORMAT_ID)
-                        atomic64_read(&leader->child_total_time_running);
+                values[n++] = primary_event_id(leader);
-        }
         size = n * sizeof(u64);
         if (copy_to_user(buf, values, size))
-                return -EFAULT;
+                goto unlock;
-        err = perf_event_read_entry(leader, read_format, buf + size);
-        if (err < 0)
-                return err;
-        size += err;
+        /* From here on ret is the running byte offset into @buf */
+        ret = size;
         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-                err = perf_event_read_entry(sub, read_format,
+                n = 0;
-                                buf + size);
-                if (err < 0)
+                /* Sibling enabled/running are fetched but not reported */
+                values[n++] = perf_event_read_value(sub, &enabled, &running);
-                        return err;
+                if (read_format & PERF_FORMAT_ID)
+                        values[n++] = primary_event_id(sub);
-                size += err;
+                size = n * sizeof(u64);
+                if (copy_to_user(buf + ret, values, size)) {
+                        ret = -EFAULT;
+                        goto unlock;
+                }
+                ret += size;
         }
+unlock:
+        mutex_unlock(&ctx->mutex);
-        return size;
+        return ret;
 }
 static int perf_event_read_one(struct perf_event *event,
                                 u64 read_format, char __user *buf)
 {
+        u64 enabled, running;
        u64 values[4];
        int n = 0;
-        values[n++] = perf_event_read_value(event);
+        values[n++] = perf_event_read_value(event, &enabled, &running);
-        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-                values[n++] = event->total_time_enabled +
+                values[n++] = enabled;
-                        atomic64_read(&event->child_total_time_enabled);
+        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-        }
+                values[n++] = running;
-        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-                values[n++] = event->total_time_running +
-                        atomic64_read(&event->child_total_time_running);
-        }
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(event);
@@ -1861,12 +2027,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
                return -ENOSPC;
        WARN_ON_ONCE(event->ctx->parent_ctx);
-        mutex_lock(&event->child_mutex);
        if (read_format & PERF_FORMAT_GROUP)
                ret = perf_event_read_group(event, read_format, buf);
        else
                ret = perf_event_read_one(event, read_format, buf);
-        mutex_unlock(&event->child_mutex);
        return ret;
 }
@@ -1956,7 +2120,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
        if (!value)
                return -EINVAL;
-        spin_lock_irq(&ctx->lock);
+        raw_spin_lock_irq(&ctx->lock);
        if (event->attr.freq) {
                if (value > sysctl_perf_event_sample_rate) {
                        ret = -EINVAL;
@@ -1969,12 +2133,13 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
                event->hw.sample_period = value;
        }
 unlock:
-        spin_unlock_irq(&ctx->lock);
+        raw_spin_unlock_irq(&ctx->lock);
        return ret;
 }
-int perf_event_set_output(struct perf_event *event, int output_fd);
+static int perf_event_set_output(struct perf_event *event, int output_fd);
+static int perf_event_set_filter(struct perf_event *event, void __user *arg);
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
@@ -2002,6 +2167,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case PERF_EVENT_IOC_SET_OUTPUT:
                return perf_event_set_output(event, arg);
+        case PERF_EVENT_IOC_SET_FILTER:
+                return perf_event_set_filter(event, (void __user *)arg);
        default:
                return -ENOTTY;
        }
@@ -2174,6 +2342,7 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
        perf_mmap_free_page((unsigned long)data->user_page);
        for (i = 0; i < data->nr_pages; i++)
                perf_mmap_free_page((unsigned long)data->data_pages[i]);
+        kfree(data);
 }
 #else
@@ -2214,6 +2383,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
                perf_mmap_unmark_page(base + (i * PAGE_SIZE));
        vfree(base);
+        kfree(data);
 }
 static void perf_mmap_data_free(struct perf_mmap_data *data)
@@ -2307,7 +2477,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
        }
        if (!data->watermark)
-                data->watermark = max_t(long, PAGE_SIZE, max_size / 2);
+                data->watermark = max_size / 2;
        rcu_assign_pointer(event->data, data);
@@ -2319,7 +2489,6 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
        data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
        perf_mmap_data_free(data);
-        kfree(data);
 }
 static void perf_mmap_data_release(struct perf_event *event)
@@ -2420,7 +2589,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
        if (user_locked > user_lock_limit)
                extra = user_locked - user_lock_limit;
-        lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+        lock_limit = rlimit(RLIMIT_MEMLOCK);
        lock_limit >>= PAGE_SHIFT;
        locked = vma->vm_mm->locked_vm + extra;
@@ -2616,6 +2785,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
        return NULL;
 }
+/*
+ * Default implementation with an empty body: @regs, @ip and @skip are
+ * ignored. Being __weak, it can be replaced by a non-weak definition
+ * elsewhere (presumably per architecture, going by the name).
+ */
+__weak
+void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
+{
+}
 /*
 * Output
 */
@@ -2666,20 +2841,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
+/*
+ * Acquire data->lock on behalf of the current CPU. The lock word is -1
+ * when free and otherwise holds the number of the owning CPU.
+ *
+ * handle->locked is set to 1 only when this call actually took the lock.
+ * If the lock is already owned by this CPU the function returns with
+ * handle->locked == 0. While another CPU owns the lock it spins with
+ * cpu_relax().
+ *
+ * get_cpu() is called here without a matching put_cpu() in this function.
+ */
 static void perf_output_lock(struct perf_output_handle *handle)
 {
         struct perf_mmap_data *data = handle->data;
-        int cpu;
+        int cur, cpu = get_cpu();
         handle->locked = 0;
-        local_irq_save(handle->flags);
+        for (;;) {
-        cpu = smp_processor_id();
+                cur = atomic_cmpxchg(&data->lock, -1, cpu);
+                /* Lock was free and is now ours */
+                if (cur == -1) {
-        if (in_nmi() && atomic_read(&data->lock) == cpu)
+                        handle->locked = 1;
-                return;
+                        break;
+                }
+                /* Already held by this CPU: proceed without taking it */
+                if (cur == cpu)
+                        break;
-        while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
                 cpu_relax();
+        }
-        handle->locked = 1;
 }
 static void perf_output_unlock(struct perf_output_handle *handle)
@@ -2725,7 +2901,7 @@ again:
        if (atomic_xchg(&data->wakeup, 0))
                perf_output_wakeup(handle);
 out:
-        local_irq_restore(handle->flags);
+        put_cpu();
 }
 void perf_output_copy(struct perf_output_handle *handle,
@@ -3200,15 +3376,23 @@ static void perf_event_task_output(struct perf_event *event,
                                     struct perf_task_event *task_event)
 {
        struct perf_output_handle handle;
-        int size;
        struct task_struct *task = task_event->task;
-        int ret;
+        unsigned long flags;
+        int size, ret;
+        /*
+         * If this CPU attempts to acquire an rq lock held by a CPU spinning
+         * in perf_output_lock() from interrupt context, it's game over.
+         */
+        local_irq_save(flags);
        size  = task_event->event_id.header.size;
        ret = perf_output_begin(&handle, event, size, 0, 0);
-        if (ret)
+        if (ret) {
+                local_irq_restore(flags);
                return;
+        }
        task_event->event_id.pid = perf_event_pid(event, task);
        task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3216,15 +3400,20 @@ static void perf_event_task_output(struct perf_event *event,
        task_event->event_id.tid = perf_event_tid(event, task);
        task_event->event_id.ptid = perf_event_tid(event, current);
-        task_event->event_id.time = perf_clock();
        perf_output_put(&handle, task_event->event_id);
        perf_output_end(&handle);
+        local_irq_restore(flags);
 }
 static int perf_event_task_match(struct perf_event *event)
 {
+        if (event->state < PERF_EVENT_STATE_INACTIVE)
+                return 0;
+        if (event->cpu != -1 && event->cpu != smp_processor_id())
+                return 0;
        if (event->attr.comm || event->attr.mmap || event->attr.task)
                return 1;
@@ -3236,15 +3425,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
 {
        struct perf_event *event;
-        if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-                return;
-        rcu_read_lock();
        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (perf_event_task_match(event))
                        perf_event_task_output(event, task_event);
        }
-        rcu_read_unlock();
 }
 static void perf_event_task_event(struct perf_task_event *task_event)
@@ -3252,15 +3436,14 @@ static void perf_event_task_event(struct perf_task_event *task_event)
        struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx = task_event->task_ctx;
+        rcu_read_lock();
        cpuctx = &get_cpu_var(perf_cpu_context);
        perf_event_task_ctx(&cpuctx->ctx, task_event);
-        put_cpu_var(perf_cpu_context);
-        rcu_read_lock();
        if (!ctx)
-                ctx = rcu_dereference(task_event->task->perf_event_ctxp);
+                ctx = rcu_dereference(current->perf_event_ctxp);
        if (ctx)
                perf_event_task_ctx(ctx, task_event);
+        put_cpu_var(perf_cpu_context);
        rcu_read_unlock();
 }
@@ -3288,6 +3471,7 @@ static void perf_event_task(struct task_struct *task,
                        /* .ppid */
                        /* .tid  */
                        /* .ptid */
+                        .time = perf_clock(),
                },
        };
@@ -3337,6 +3521,12 @@ static void perf_event_comm_output(struct perf_event *event,
 static int perf_event_comm_match(struct perf_event *event)
 {
+        if (event->state < PERF_EVENT_STATE_INACTIVE)
+                return 0;
+        if (event->cpu != -1 && event->cpu != smp_processor_id())
+                return 0;
        if (event->attr.comm)
                return 1;
@@ -3348,15 +3538,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx,
 {
        struct perf_event *event;
-        if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-                return;
-        rcu_read_lock();
        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (perf_event_comm_match(event))
                        perf_event_comm_output(event, comm_event);
        }
-        rcu_read_unlock();
 }
 static void perf_event_comm_event(struct perf_comm_event *comm_event)
@@ -3367,7 +3552,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
        char comm[TASK_COMM_LEN];
        memset(comm, 0, sizeof(comm));
-        strncpy(comm, comm_event->task->comm, sizeof(comm));
+        strlcpy(comm, comm_event->task->comm, sizeof(comm));
        size = ALIGN(strlen(comm)+1, sizeof(u64));
        comm_event->comm = comm;
@@ -3375,18 +3560,13 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
        comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
+        rcu_read_lock();
        cpuctx = &get_cpu_var(perf_cpu_context);
        perf_event_comm_ctx(&cpuctx->ctx, comm_event);
-        put_cpu_var(perf_cpu_context);
-        rcu_read_lock();
-        /*
-         * doesn't really matter which of the child contexts the
-         * events ends up in.
-         */
        ctx = rcu_dereference(current->perf_event_ctxp);
        if (ctx)
                perf_event_comm_ctx(ctx, comm_event);
+        put_cpu_var(perf_cpu_context);
        rcu_read_unlock();
 }
@@ -3461,6 +3641,12 @@ static void perf_event_mmap_output(struct perf_event *event,
 static int perf_event_mmap_match(struct perf_event *event,
                                   struct perf_mmap_event *mmap_event)
 {
+        if (event->state < PERF_EVENT_STATE_INACTIVE)
+                return 0;
+        if (event->cpu != -1 && event->cpu != smp_processor_id())
+                return 0;
        if (event->attr.mmap)
                return 1;
@@ -3472,15 +3658,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx,
 {
        struct perf_event *event;
-        if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-                return;
-        rcu_read_lock();
        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (perf_event_mmap_match(event, mmap_event))
                        perf_event_mmap_output(event, mmap_event);
        }
-        rcu_read_unlock();
 }
 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -3536,18 +3717,13 @@ got_name:
        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
+        rcu_read_lock();
        cpuctx = &get_cpu_var(perf_cpu_context);
        perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
-        put_cpu_var(perf_cpu_context);
-        rcu_read_lock();
-        /*
-         * doesn't really matter which of the child contexts the
-         * events ends up in.
-         */
        ctx = rcu_dereference(current->perf_event_ctxp);
        if (ctx)
                perf_event_mmap_ctx(ctx, mmap_event);
+        put_cpu_var(perf_cpu_context);
        rcu_read_unlock();
        kfree(buf);
@@ -3574,7 +3750,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
                        /* .tid */
                        .start  = vma->vm_start,
                        .len    = vma->vm_end - vma->vm_start,
-                        .pgoff  = vma->vm_pgoff,
+                        .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
                },
        };
@@ -3654,12 +3830,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
        if (event->attr.freq) {
                u64 now = perf_clock();
-                s64 delta = now - hwc->freq_stamp;
+                s64 delta = now - hwc->freq_time_stamp;
-                hwc->freq_stamp = now;
+                hwc->freq_time_stamp = now;
-                if (delta > 0 && delta < TICK_NSEC)
+                if (delta > 0 && delta < 2*TICK_NSEC)
-                        perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
+                        perf_adjust_period(event, delta, hwc->last_period);
        }
        /*
@@ -3679,7 +3855,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
                        perf_event_disable(event);
        }
-        perf_event_output(event, nmi, data, regs);
+        if (event->overflow_handler)
+                event->overflow_handler(event, nmi, data, regs);
+        else
+                perf_event_output(event, nmi, data, regs);
        return ret;
 }
@@ -3724,16 +3904,16 @@ again:
        return nr;
 }
-static void perf_swevent_overflow(struct perf_event *event,
+static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
                                    int nmi, struct perf_sample_data *data,
                                    struct pt_regs *regs)
 {
        struct hw_perf_event *hwc = &event->hw;
        int throttle = 0;
-        u64 overflow;
        data->period = event->hw.last_period;
-        overflow = perf_swevent_set_period(event);
+        if (!overflow)
+                overflow = perf_swevent_set_period(event);
        if (hwc->interrupts == MAX_INTERRUPTS)
                return;
@@ -3766,14 +3946,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
        atomic64_add(nr, &event->count);
+        if (!regs)
+                return;
        if (!hwc->sample_period)
                return;
-        if (!regs)
+        if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
+                return perf_swevent_overflow(event, 1, nmi, data, regs);
+        if (atomic64_add_negative(nr, &hwc->period_left))
                return;
-        if (!atomic64_add_negative(nr, &hwc->period_left))
+        perf_swevent_overflow(event, 0, nmi, data, regs);
-                perf_swevent_overflow(event, nmi, data, regs);
 }
 static int perf_swevent_is_counting(struct perf_event *event)
@@ -3806,25 +3991,47 @@ static int perf_swevent_is_counting(struct perf_event *event)
        return 1;
 }
+static int perf_tp_event_match(struct perf_event *event,
+                                struct perf_sample_data *data);
+/*
+ * Decide whether @event must be skipped for the context described by
+ * @regs, according to its exclude_user / exclude_kernel attributes.
+ *
+ * Returns 1 to exclude the event, 0 otherwise. With a NULL @regs the
+ * attributes are not consulted and 0 is returned.
+ */
+static int perf_exclude_event(struct perf_event *event,
+                              struct pt_regs *regs)
+{
+        if (regs) {
+                if (event->attr.exclude_user && user_mode(regs))
+                        return 1;
+                if (event->attr.exclude_kernel && !user_mode(regs))
+                        return 1;
+        }
+        return 0;
+}
+/*
+ * Return 1 if @event should receive a software event of the given @type
+ * and @event_id, 0 otherwise.
+ *
+ * The checks are applied in order: the event is bound to another CPU,
+ * perf_swevent_is_counting() fails, the type or config differs,
+ * perf_exclude_event() rejects @regs, or, for PERF_TYPE_TRACEPOINT
+ * events, perf_tp_event_match() rejects @data.
+ */
 static int perf_swevent_match(struct perf_event *event,
                                 enum perf_type_id type,
-                                u32 event_id, struct pt_regs *regs)
+                                u32 event_id,
+                                struct perf_sample_data *data,
+                                struct pt_regs *regs)
 {
+        /* event->cpu == -1 means the event is not tied to one CPU */
+        if (event->cpu != -1 && event->cpu != smp_processor_id())
+                return 0;
         if (!perf_swevent_is_counting(event))
                 return 0;
         if (event->attr.type != type)
                 return 0;
         if (event->attr.config != event_id)
                 return 0;
-        if (regs) {
+        if (perf_exclude_event(event, regs))
-                if (event->attr.exclude_user && user_mode(regs))
+                return 0;
-                        return 0;
-                if (event->attr.exclude_kernel && !user_mode(regs))
+        if (event->attr.type == PERF_TYPE_TRACEPOINT &&
-                        return 0;
+            !perf_tp_event_match(event, data))
-        }
+                return 0;
         return 1;
 }
@@ -3837,49 +4044,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
 {
        struct perf_event *event;
-        if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
-                return;
-        rcu_read_lock();
        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-                if (perf_swevent_match(event, type, event_id, regs))
+                if (perf_swevent_match(event, type, event_id, data, regs))
                        perf_swevent_add(event, nr, nmi, data, regs);
        }
-        rcu_read_unlock();
 }
-static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
+int perf_swevent_get_recursion_context(void)
 {
+        struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+        int rctx;
        if (in_nmi())
-                return &cpuctx->recursion[3];
+                rctx = 3;
+        else if (in_irq())
+                rctx = 2;
+        else if (in_softirq())
+                rctx = 1;
+        else
+                rctx = 0;
+        if (cpuctx->recursion[rctx]) {
+                put_cpu_var(perf_cpu_context);
+                return -1;
+        }
-        if (in_irq())
+        cpuctx->recursion[rctx]++;
-                return &cpuctx->recursion[2];
+        barrier();
-        if (in_softirq())
+        return rctx;
-                return &cpuctx->recursion[1];
+}
+EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
-        return &cpuctx->recursion[0];
+void perf_swevent_put_recursion_context(int rctx)
+{
+        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+        barrier();
+        cpuctx->recursion[rctx]--;
+        put_cpu_var(perf_cpu_context);
 }
+EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
                                    u64 nr, int nmi,
                                    struct perf_sample_data *data,
                                    struct pt_regs *regs)
 {
-        struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+        struct perf_cpu_context *cpuctx;
-        int *recursion = perf_swevent_recursion_context(cpuctx);
        struct perf_event_context *ctx;
-        if (*recursion)
+        cpuctx = &__get_cpu_var(perf_cpu_context);
-                goto out;
+        rcu_read_lock();
-        (*recursion)++;
-        barrier();
        perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
                                 nr, nmi, data, regs);
-        rcu_read_lock();
        /*
         * doesn't really matter which of the child contexts the
         * events ends up in.
@@ -3888,23 +4105,23 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
        if (ctx)
                perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
        rcu_read_unlock();
-        barrier();
-        (*recursion)--;
-out:
-        put_cpu_var(perf_cpu_context);
 }
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
                            struct pt_regs *regs, u64 addr)
 {
-        struct perf_sample_data data = {
+        struct perf_sample_data data;
-                .addr = addr,
+        int rctx;
-        };
-        do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
+        rctx = perf_swevent_get_recursion_context();
-                                &data, regs);
+        if (rctx < 0)
+                return;
+        perf_sample_data_init(&data, addr);
+        do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
+        perf_swevent_put_recursion_context(rctx);
 }
 static void perf_swevent_read(struct perf_event *event)
@@ -3945,10 +4162,11 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
        struct perf_event *event;
        u64 period;
-        event   = container_of(hrtimer, struct perf_event, hw.hrtimer);
+        event = container_of(hrtimer, struct perf_event, hw.hrtimer);
        event->pmu->read(event);
-        data.addr = 0;
+        perf_sample_data_init(&data, 0);
+        data.period = event->hw.last_period;
        regs = get_irq_regs();
        /*
         * In case we exclude kernel IPs or are somehow not in interrupt
@@ -4017,8 +4235,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
        u64 now;
        now = cpu_clock(cpu);
-        prev = atomic64_read(&event->hw.prev_count);
+        prev = atomic64_xchg(&event->hw.prev_count, now);
-        atomic64_set(&event->hw.prev_count, now);
        atomic64_add(now - prev, &event->count);
 }
@@ -4107,36 +4324,39 @@ static const struct pmu perf_ops_task_clock = {
        .read           = task_clock_perf_event_read,
 };
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_EVENT_TRACING
 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
-                          int entry_size)
+                   int entry_size, struct pt_regs *regs)
 {
+        struct perf_sample_data data;
        struct perf_raw_record raw = {
                .size = entry_size,
                .data = record,
        };
-        struct perf_sample_data data = {
+        perf_sample_data_init(&data, addr);
-                .addr = addr,
+        data.raw = &raw;
-                .raw = &raw,
-        };
-        struct pt_regs *regs = get_irq_regs();
-        if (!regs)
-                regs = task_pt_regs(current);
+        /* Trace events are already protected against recursion */
        do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
-                                &data, regs);
+                         &data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
-extern int ftrace_profile_enable(int);
+static int perf_tp_event_match(struct perf_event *event,
-extern void ftrace_profile_disable(int);
+                                struct perf_sample_data *data)
+{
+        void *record = data->raw->data;
+        if (likely(!event->filter) || filter_match_preds(event->filter, record))
+                return 1;
+        return 0;
+}
 static void tp_perf_event_destroy(struct perf_event *event)
 {
-        ftrace_profile_disable(event->attr.config);
+        perf_trace_disable(event->attr.config);
 }
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4150,18 +4370,99 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
                        !capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);
-        if (ftrace_profile_enable(event->attr.config))
+        if (perf_trace_enable(event->attr.config))
                return NULL;
        event->destroy = tp_perf_event_destroy;
        return &perf_ops_generic;
 }
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+        char *filter_str;
+        int ret;
+        if (event->attr.type != PERF_TYPE_TRACEPOINT)
+                return -EINVAL;
+        filter_str = strndup_user(arg, PAGE_SIZE);
+        if (IS_ERR(filter_str))
+                return PTR_ERR(filter_str);
+        ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+        kfree(filter_str);
+        return ret;
+}
+static void perf_event_free_filter(struct perf_event *event)
+{
+        ftrace_profile_free_filter(event);
+}
 #else
+static int perf_tp_event_match(struct perf_event *event,
+                                struct perf_sample_data *data)
+{
+        return 1;
+}
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
        return NULL;
 }
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+        return -ENOENT;
+}
+static void perf_event_free_filter(struct perf_event *event)
+{
+}
+#endif /* CONFIG_EVENT_TRACING */
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+static void bp_perf_event_destroy(struct perf_event *event)
+{
+        release_bp_slot(event);
+}
+static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+{
+        int err;
+        err = register_perf_hw_breakpoint(bp);
+        if (err)
+                return ERR_PTR(err);
+        bp->destroy = bp_perf_event_destroy;
+        return &perf_ops_bp;
+}
+void perf_bp_event(struct perf_event *bp, void *data)
+{
+        struct perf_sample_data sample;
+        struct pt_regs *regs = data;
+        perf_sample_data_init(&sample, bp->attr.bp_addr);
+        if (!perf_exclude_event(bp, regs))
+                perf_swevent_add(bp, 1, 1, &sample, regs);
+}
+#else
+static const struct pmu *bp_perf_event_init(struct perf_event *bp)
+{
+        return NULL;
+}
+void perf_bp_event(struct perf_event *bp, void *regs)
+{
+}
 #endif
 atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
@@ -4208,6 +4509,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
        case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
        case PERF_COUNT_SW_CONTEXT_SWITCHES:
        case PERF_COUNT_SW_CPU_MIGRATIONS:
+        case PERF_COUNT_SW_ALIGNMENT_FAULTS:
+        case PERF_COUNT_SW_EMULATION_FAULTS:
                if (!event->parent) {
                        atomic_inc(&perf_swevent_enabled[event_id]);
                        event->destroy = sw_perf_event_destroy;
@@ -4228,6 +4531,7 @@ perf_event_alloc(struct perf_event_attr *attr,
                   struct perf_event_context *ctx,
                   struct perf_event *group_leader,
                   struct perf_event *parent_event,
+                   perf_overflow_handler_t overflow_handler,
                   gfp_t gfpflags)
 {
        const struct pmu *pmu;
@@ -4270,6 +4574,11 @@ perf_event_alloc(struct perf_event_attr *attr,
        event->state            = PERF_EVENT_STATE_INACTIVE;
+        if (!overflow_handler && parent_event)
+                overflow_handler = parent_event->overflow_handler;
+        
+        event->overflow_handler = overflow_handler;
        if (attr->disabled)
                event->state = PERF_EVENT_STATE_OFF;
@@ -4304,6 +4613,11 @@ perf_event_alloc(struct perf_event_attr *attr,
                pmu = tp_perf_event_init(event);
                break;
+        case PERF_TYPE_BREAKPOINT:
+                pmu = bp_perf_event_init(event);
+                break;
        default:
                break;
        }
@@ -4398,7 +4712,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
        if (attr->type >= PERF_TYPE_MAX)
                return -EINVAL;
-        if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
+        if (attr->__reserved_1)
                return -EINVAL;
        if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -4416,7 +4730,7 @@ err_size:
        goto out;
 }
-int perf_event_set_output(struct perf_event *event, int output_fd)
+static int perf_event_set_output(struct perf_event *event, int output_fd)
 {
        struct perf_event *output_event = NULL;
        struct file *output_file = NULL;
@@ -4546,12 +4860,12 @@ SYSCALL_DEFINE5(perf_event_open,
        }
        event = perf_event_alloc(&attr, cpu, ctx, group_leader,
-                                     NULL, GFP_KERNEL);
+                                     NULL, NULL, GFP_KERNEL);
        err = PTR_ERR(event);
        if (IS_ERR(event))
                goto err_put_context;
-        err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
+        err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
        if (err < 0)
                goto err_free_put_context;
@@ -4583,7 +4897,7 @@ err_fput_free_put_context:
 err_free_put_context:
        if (err < 0)
-                kfree(event);
+                free_event(event);
 err_put_context:
        if (err < 0)
@@ -4594,6 +4908,61 @@ err_put_context:
        return err;
 }
+/**
+ * perf_event_create_kernel_counter
+ *
+ * @attr: attributes of the counter to create
+ * @cpu: cpu to which the counter is bound
+ * @pid: task to profile
+ */
+struct perf_event *
+perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
+                                 pid_t pid,
+                                 perf_overflow_handler_t overflow_handler)
+{
+        struct perf_event *event;
+        struct perf_event_context *ctx;
+        int err;
+        /*
+         * Get the target context (task or percpu):
+         */
+        ctx = find_get_context(pid, cpu);
+        if (IS_ERR(ctx)) {
+                err = PTR_ERR(ctx);
+                goto err_exit;
+        }
+        event = perf_event_alloc(attr, cpu, ctx, NULL,
+                                 NULL, overflow_handler, GFP_KERNEL);
+        if (IS_ERR(event)) {
+                err = PTR_ERR(event);
+                goto err_put_context;
+        }
+        event->filp = NULL;
+        WARN_ON_ONCE(ctx->parent_ctx);
+        mutex_lock(&ctx->mutex);
+        perf_install_in_context(ctx, event, cpu);
+        ++ctx->generation;
+        mutex_unlock(&ctx->mutex);
+        event->owner = current;
+        get_task_struct(current);
+        mutex_lock(&current->perf_event_mutex);
+        list_add_tail(&event->owner_entry, &current->perf_event_list);
+        mutex_unlock(&current->perf_event_mutex);
+        return event;
+ err_put_context:
+        put_ctx(ctx);
+ err_exit:
+        return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
 /*
 * inherit a event from parent task to child task:
 */
@@ -4619,7 +4988,7 @@ inherit_event(struct perf_event *parent_event,
        child_event = perf_event_alloc(&parent_event->attr,
                                           parent_event->cpu, child_ctx,
                                           group_leader, parent_event,
-                                           GFP_KERNEL);
+                                           NULL, GFP_KERNEL);
        if (IS_ERR(child_event))
                return child_event;
        get_ctx(child_ctx);
@@ -4634,8 +5003,17 @@ inherit_event(struct perf_event *parent_event,
        else
                child_event->state = PERF_EVENT_STATE_OFF;
-        if (parent_event->attr.freq)
+        if (parent_event->attr.freq) {
-                child_event->hw.sample_period = parent_event->hw.sample_period;
+                u64 sample_period = parent_event->hw.sample_period;
+                struct hw_perf_event *hwc = &child_event->hw;
+                hwc->sample_period = sample_period;
+                hwc->last_period   = sample_period;
+                atomic64_set(&hwc->period_left, sample_period);
+        }
+        child_event->overflow_handler = parent_event->overflow_handler;
        /*
         * Link it up in the child's context:
@@ -4726,7 +5104,6 @@ __perf_event_exit_task(struct perf_event *child_event,
 {
        struct perf_event *parent_event;
-        update_event_times(child_event);
        perf_event_remove_from_context(child_event);
        parent_event = child_event->parent;
@@ -4770,7 +5147,7 @@ void perf_event_exit_task(struct task_struct *child)
         * reading child->perf_event_ctxp, we wait until it has
         * incremented the context's refcount before we do put_ctx below.
         */
-        spin_lock(&child_ctx->lock);
+        raw_spin_lock(&child_ctx->lock);
        child->perf_event_ctxp = NULL;
        /*
         * If this context is a clone; unclone it so it can't get
@@ -4778,7 +5155,8 @@ void perf_event_exit_task(struct task_struct *child)
         * the events from it.
         */
        unclone_ctx(child_ctx);
-        spin_unlock_irqrestore(&child_ctx->lock, flags);
+        update_context_time(child_ctx);
+        raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
        /*
         * Report the task dead after unscheduling the events so that we
@@ -4801,7 +5179,11 @@ void perf_event_exit_task(struct task_struct *child)
        mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
 again:
-        list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
+        list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
+                                 group_entry)
+                __perf_event_exit_task(child_event, child_ctx, child);
+        list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
                                 group_entry)
                __perf_event_exit_task(child_event, child_ctx, child);
@@ -4810,7 +5192,8 @@ again:
         * its siblings to the list, but we obtained 'tmp' before that which
         * will still point to the list head terminating the iteration.
         */
-        if (!list_empty(&child_ctx->group_list))
+        if (!list_empty(&child_ctx->pinned_groups) ||
+            !list_empty(&child_ctx->flexible_groups))
                goto again;
        mutex_unlock(&child_ctx->mutex);
@@ -4818,6 +5201,24 @@ again:
        put_ctx(child_ctx);
 }
+static void perf_free_event(struct perf_event *event,
+                            struct perf_event_context *ctx)
+{
+        struct perf_event *parent = event->parent;
+        if (WARN_ON_ONCE(!parent))
+                return;
+        mutex_lock(&parent->child_mutex);
+        list_del_init(&event->child_list);
+        mutex_unlock(&parent->child_mutex);
+        fput(parent->filp);
+        list_del_event(event, ctx);
+        free_event(event);
+}
 /*
 * free an unexposed, unused context as created by inheritance by
 * init_task below, used by fork() in case of fail.
@@ -4832,30 +5233,64 @@ void perf_event_free_task(struct task_struct *task)
        mutex_lock(&ctx->mutex);
 again:
-        list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
+        list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
-                struct perf_event *parent = event->parent;
+                perf_free_event(event, ctx);
-                if (WARN_ON_ONCE(!parent))
+        list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
-                        continue;
+                                 group_entry)
+                perf_free_event(event, ctx);
-                mutex_lock(&parent->child_mutex);
+        if (!list_empty(&ctx->pinned_groups) ||
-                list_del_init(&event->child_list);
+            !list_empty(&ctx->flexible_groups))
-                mutex_unlock(&parent->child_mutex);
+                goto again;
-                fput(parent->filp);
+        mutex_unlock(&ctx->mutex);
-                list_del_event(event, ctx);
+        put_ctx(ctx);
-                free_event(event);
+}
+static int
+inherit_task_group(struct perf_event *event, struct task_struct *parent,
+                   struct perf_event_context *parent_ctx,
+                   struct task_struct *child,
+                   int *inherited_all)
+{
+        int ret;
+        struct perf_event_context *child_ctx = child->perf_event_ctxp;
+        if (!event->attr.inherit) {
+                *inherited_all = 0;
+                return 0;
        }
-        if (!list_empty(&ctx->group_list))
+        if (!child_ctx) {
-                goto again;
+                /*
+                 * This is executed from the parent task context, so
+                 * inherit events that have been marked for cloning.
+                 * First allocate and initialize a context for the
+                 * child.
+                 */
-        mutex_unlock(&ctx->mutex);
+                child_ctx = kzalloc(sizeof(struct perf_event_context),
+                                    GFP_KERNEL);
+                if (!child_ctx)
+                        return -ENOMEM;
-        put_ctx(ctx);
+                __perf_event_init_context(child_ctx, child);
+                child->perf_event_ctxp = child_ctx;
+                get_task_struct(child);
+        }
+        ret = inherit_group(event, parent, parent_ctx,
+                            child, child_ctx);
+        if (ret)
+                *inherited_all = 0;
+        return ret;
 }
 /*
 * Initialize the perf_event context in task_struct
 */
@@ -4877,20 +5312,6 @@ int perf_event_init_task(struct task_struct *child)
                return 0;
        /*
-         * This is executed from the parent task context, so inherit
-         * events that have been marked for cloning.
-         * First allocate and initialize a context for the child.
-         */
-        child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
-        if (!child_ctx)
-                return -ENOMEM;
-        __perf_event_init_context(child_ctx, child);
-        child->perf_event_ctxp = child_ctx;
-        get_task_struct(child);
-        /*
         * If the parent's context is a clone, pin it so it won't get
         * swapped under us.
         */
@@ -4913,22 +5334,23 @@ int perf_event_init_task(struct task_struct *child)
         * We dont have to disable NMIs - we are only looking at
         * the list, not manipulating it:
         */
-        list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
+        list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
+                ret = inherit_task_group(event, parent, parent_ctx, child,
-                if (!event->attr.inherit) {
+                                         &inherited_all);
-                        inherited_all = 0;
+                if (ret)
-                        continue;
+                        break;
-                }
+        }
-                ret = inherit_group(event, parent, parent_ctx,
+        list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
-                                             child, child_ctx);
+                ret = inherit_task_group(event, parent, parent_ctx, child,
-                if (ret) {
+                                         &inherited_all);
-                        inherited_all = 0;
+                if (ret)
                        break;
-                }
        }
-        if (inherited_all) {
+        child_ctx = child->perf_event_ctxp;
+        if (child_ctx && inherited_all) {
                /*
                 * Mark the child context as a clone of the parent
                 * context, or of whatever the parent is a clone of.
@@ -4955,18 +5377,26 @@ int perf_event_init_task(struct task_struct *child)
        return ret;
 }
+static void __init perf_event_init_all_cpus(void)
+{
+        int cpu;
+        struct perf_cpu_context *cpuctx;
+        for_each_possible_cpu(cpu) {
+                cpuctx = &per_cpu(perf_cpu_context, cpu);
+                __perf_event_init_context(&cpuctx->ctx, NULL);
+        }
+}
 static void __cpuinit perf_event_init_cpu(int cpu)
 {
        struct perf_cpu_context *cpuctx;
        cpuctx = &per_cpu(perf_cpu_context, cpu);
-        __perf_event_init_context(&cpuctx->ctx, NULL);
        spin_lock(&perf_resource_lock);
        cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
        spin_unlock(&perf_resource_lock);
-        hw_perf_event_setup(cpu);
 }
 #ifdef CONFIG_HOTPLUG_CPU
@@ -4976,7 +5406,9 @@ static void __perf_event_exit_cpu(void *info)
        struct perf_event_context *ctx = &cpuctx->ctx;
        struct perf_event *event, *tmp;
-        list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
+        list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
+                __perf_event_remove_from_context(event);
+        list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
                __perf_event_remove_from_context(event);
 }
 static void perf_event_exit_cpu(int cpu)
@@ -5004,11 +5436,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
                perf_event_init_cpu(cpu);
                break;
-        case CPU_ONLINE:
-        case CPU_ONLINE_FROZEN:
-                hw_perf_event_setup_online(cpu);
-                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                perf_event_exit_cpu(cpu);
@@ -5031,6 +5458,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = {
 void __init perf_event_init(void)
 {
+        perf_event_init_all_cpus();
        perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
                        (void *)(long)smp_processor_id());
        perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
@@ -5038,13 +5466,16 @@ void __init perf_event_init(void)
        register_cpu_notifier(&perf_cpu_nb);
 }
-static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
+static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
+                                        struct sysdev_class_attribute *attr,
+                                        char *buf)
 {
        return sprintf(buf, "%d\n", perf_reserved_percpu);
 }
 static ssize_t
 perf_set_reserve_percpu(struct sysdev_class *class,
+                        struct sysdev_class_attribute *attr,
                        const char *buf,
                        size_t count)
 {
@@ -5062,24 +5493,28 @@ perf_set_reserve_percpu(struct sysdev_class *class,
        perf_reserved_percpu = val;
        for_each_online_cpu(cpu) {
                cpuctx = &per_cpu(perf_cpu_context, cpu);
-                spin_lock_irq(&cpuctx->ctx.lock);
+                raw_spin_lock_irq(&cpuctx->ctx.lock);
                mpt = min(perf_max_events - cpuctx->ctx.nr_events,
                          perf_max_events - perf_reserved_percpu);
                cpuctx->max_pertask = mpt;
-                spin_unlock_irq(&cpuctx->ctx.lock);
+                raw_spin_unlock_irq(&cpuctx->ctx.lock);
        }
        spin_unlock(&perf_resource_lock);
        return count;
 }
-static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
+static ssize_t perf_show_overcommit(struct sysdev_class *class,
+                                    struct sysdev_class_attribute *attr,
+                                    char *buf)
 {
        return sprintf(buf, "%d\n", perf_overcommit);
 }
 static ssize_t
-perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
+perf_set_overcommit(struct sysdev_class *class,
+                    struct sysdev_class_attribute *attr,
+                    const char *buf, size_t count)
 {
        unsigned long val;
        int err;
diff --git a/kernel/pid.c b/kernel/pid.c
index d3f722d20f9c..aebb30d9c233 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -141,11 +141,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
                         * installing it:
                         */
                        spin_lock_irq(&pidmap_lock);
-                        if (map->page)
+                        if (!map->page) {
-                                kfree(page);
-                        else
                                map->page = page;
+                                page = NULL;
+                        }
                        spin_unlock_irq(&pidmap_lock);
+                        kfree(page);
                        if (unlikely(!map->page))
                                break;
                }
@@ -268,12 +269,11 @@ struct pid *alloc_pid(struct pid_namespace *ns)
        for (type = 0; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_HEAD(&pid->tasks[type]);
+        upid = pid->numbers + ns->level;
        spin_lock_irq(&pidmap_lock);
-        for (i = ns->level; i >= 0; i--) {
+        for ( ; upid >= pid->numbers; --upid)
-                upid = &pid->numbers[i];
                hlist_add_head_rcu(&upid->pid_chain,
                                &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
-        }
        spin_unlock_irq(&pidmap_lock);
 out:
@@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
        struct task_struct *result = NULL;
        if (pid) {
                struct hlist_node *first;
-                first = rcu_dereference(pid->tasks[type].first);
+                first = rcu_dereference_check(pid->tasks[type].first,
+                                              rcu_read_lock_held() ||
+                                              lockdep_tasklist_lock_is_held());
                if (first)
                        result = hlist_entry(first, struct task_struct, pids[(type)].node);
        }
@@ -376,7 +378,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
 EXPORT_SYMBOL(pid_task);
 /*
- * Must be called under rcu_read_lock() or with tasklist_lock read-held.
+ * Must be called under rcu_read_lock().
 */
 struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
 {
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 86b3796b0436..a5aff94e1f0b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -13,6 +13,7 @@
 #include <linux/syscalls.h>
 #include <linux/err.h>
 #include <linux/acct.h>
+#include <linux/slab.h>
 #define BITS_PER_PAGE           (PAGE_SIZE*8)
@@ -161,13 +162,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
                rcu_read_lock();
                /*
-                 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring
+                 * Any nested-container's init processes won't ignore the
-                 * any nested-container's init processes don't ignore the
+                 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
-                 * signal
                 */
                task = pid_task(find_vpid(nr), PIDTYPE_PID);
                if (task)
-                        force_sig(SIGKILL, task);
+                        send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
                rcu_read_unlock();
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index dfdec524d1b7..3db49b9ca374 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,7 +29,6 @@
 #include <linux/pm_qos_params.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/time.h>
@@ -344,37 +343,33 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
 }
 EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
-#define PID_NAME_LEN sizeof("process_1234567890")
+#define PID_NAME_LEN 32
-static char name[PID_NAME_LEN];
 static int pm_qos_power_open(struct inode *inode, struct file *filp)
 {
        int ret;
        long pm_qos_class;
+        char name[PID_NAME_LEN];
-        lock_kernel();
        pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
        if (pm_qos_class >= 0) {
                filp->private_data = (void *)pm_qos_class;
-                sprintf(name, "process_%d", current->pid);
+                snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
                ret = pm_qos_add_requirement(pm_qos_class, name,
                                        PM_QOS_DEFAULT_VALUE);
-                if (ret >= 0) {
+                if (ret >= 0)
-                        unlock_kernel();
                        return 0;
-                }
        }
-        unlock_kernel();
        return -EPERM;
 }
 static int pm_qos_power_release(struct inode *inode, struct file *filp)
 {
        int pm_qos_class;
+        char name[PID_NAME_LEN];
        pm_qos_class = (long)filp->private_data;
-        sprintf(name, "process_%d", current->pid);
+        snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
        pm_qos_remove_requirement(pm_qos_class, name);
        return 0;
@@ -385,13 +380,14 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
 {
        s32 value;
        int pm_qos_class;
+        char name[PID_NAME_LEN];
        pm_qos_class = (long)filp->private_data;
        if (count != sizeof(s32))
                return -EINVAL;
        if (copy_from_user(&value, buf, sizeof(s32)))
                return -EFAULT;
-        sprintf(name, "process_%d", current->pid);
+        snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
        pm_qos_update_requirement(pm_qos_class, name, value);
        return  sizeof(s32);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 5c9dc228747b..bc7704b3a443 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -384,7 +384,8 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
 /*
 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
- * This is called from sys_timer_create with the new timer already locked.
+ * This is called from sys_timer_create() and do_cpu_nanosleep() with the
+ * new timer already all-zeros initialized.
 */
 int posix_cpu_timer_create(struct k_itimer *new_timer)
 {
@@ -396,8 +397,6 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
                return -EINVAL;
        INIT_LIST_HEAD(&new_timer->it.cpu.entry);
-        new_timer->it.cpu.incr.sched = 0;
-        new_timer->it.cpu.expires.sched = 0;
        read_lock(&tasklist_lock);
        if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
@@ -983,6 +982,7 @@ static void check_thread_timers(struct task_struct *tsk,
        int maxfire;
        struct list_head *timers = tsk->cpu_timers;
        struct signal_struct *const sig = tsk->signal;
+        unsigned long soft;
        maxfire = 20;
        tsk->cputime_expires.prof_exp = cputime_zero;
@@ -1031,9 +1031,10 @@ static void check_thread_timers(struct task_struct *tsk,
        /*
         * Check for the special case thread timers.
         */
-        if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) {
+        soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
-                unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max;
+        if (soft != RLIM_INFINITY) {
-                unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur;
+                unsigned long hard =
+                        ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
                if (hard != RLIM_INFINITY &&
                    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -1044,14 +1045,13 @@ static void check_thread_timers(struct task_struct *tsk,
                        __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
                        return;
                }
-                if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) {
+                if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
                        /*
                         * At the soft limit, send a SIGXCPU every second.
                         */
-                        if (sig->rlim[RLIMIT_RTTIME].rlim_cur
+                        if (soft < hard) {
-                            < sig->rlim[RLIMIT_RTTIME].rlim_max) {
+                                soft += USEC_PER_SEC;
-                                sig->rlim[RLIMIT_RTTIME].rlim_cur +=
+                                sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
-                                                                USEC_PER_SEC;
                        }
                        printk(KERN_INFO
                                "RT Watchdog Timeout: %s[%d]\n",
@@ -1061,9 +1061,9 @@ static void check_thread_timers(struct task_struct *tsk,
        }
 }
-static void stop_process_timers(struct task_struct *tsk)
+static void stop_process_timers(struct signal_struct *sig)
 {
-        struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+        struct thread_group_cputimer *cputimer = &sig->cputimer;
        unsigned long flags;
        if (!cputimer->running)
@@ -1072,6 +1072,10 @@ static void stop_process_timers(struct task_struct *tsk)
        spin_lock_irqsave(&cputimer->lock, flags);
        cputimer->running = 0;
        spin_unlock_irqrestore(&cputimer->lock, flags);
+        sig->cputime_expires.prof_exp = cputime_zero;
+        sig->cputime_expires.virt_exp = cputime_zero;
+        sig->cputime_expires.sched_exp = 0;
 }
 static u32 onecputick;
@@ -1122,6 +1126,7 @@ static void check_process_timers(struct task_struct *tsk,
        unsigned long long sum_sched_runtime, sched_expires;
        struct list_head *timers = sig->cpu_timers;
        struct task_cputime cputime;
+        unsigned long soft;
        /*
         * Don't sample the current process CPU clocks if there are no timers.
@@ -1132,7 +1137,7 @@ static void check_process_timers(struct task_struct *tsk,
            list_empty(&timers[CPUCLOCK_VIRT]) &&
            cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
            list_empty(&timers[CPUCLOCK_SCHED])) {
-                stop_process_timers(tsk);
+                stop_process_timers(sig);
                return;
        }
@@ -1194,11 +1199,13 @@ static void check_process_timers(struct task_struct *tsk,
                         SIGPROF);
        check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
                         SIGVTALRM);
+        soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
-        if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
+        if (soft != RLIM_INFINITY) {
                unsigned long psecs = cputime_to_secs(ptime);
+                unsigned long hard =
+                        ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
                cputime_t x;
-                if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) {
+                if (psecs >= hard) {
                        /*
                         * At the hard limit, we just die.
                         * No need to calculate anything else now.
@@ -1206,17 +1213,17 @@ static void check_process_timers(struct task_struct *tsk,
                        __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
                        return;
                }
-                if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) {
+                if (psecs >= soft) {
                        /*
                         * At the soft limit, send a SIGXCPU every second.
                         */
                        __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
-                        if (sig->rlim[RLIMIT_CPU].rlim_cur
+                        if (soft < hard) {
-                            < sig->rlim[RLIMIT_CPU].rlim_max) {
+                                soft++;
-                                sig->rlim[RLIMIT_CPU].rlim_cur++;
+                                sig->rlim[RLIMIT_CPU].rlim_cur = soft;
                        }
                }
-                x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
+                x = secs_to_cputime(soft);
                if (cputime_eq(prof_expires, cputime_zero) ||
                    cputime_lt(x, prof_expires)) {
                        prof_expires = x;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 495440779ce3..00d1fda58ab6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -256,7 +256,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
        return 0;
 }
-int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
+static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
 {
        *tp = ktime_to_timespec(KTIME_LOW_RES);
        return 0;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 91e09d3b2eb2..5c36ea9d55d2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,15 @@ config PM_DEBUG
        code. This is helpful when debugging and reporting PM bugs, like
        suspend support.
+config PM_ADVANCED_DEBUG
+        bool "Extra PM attributes in sysfs for low-level debugging/testing"
+        depends on PM_DEBUG
+        default n
+        ---help---
+        Add extra sysfs attributes allowing one to access some Power Management
+        fields of device objects from user space.  If you are not a kernel
+        developer interested in debugging/testing Power Management, say "no".
 config PM_VERBOSE
        bool "Verbose Power Management debugging"
        depends on PM_DEBUG
@@ -85,6 +94,11 @@ config PM_SLEEP
        depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
        default y
+config PM_SLEEP_ADVANCED_DEBUG
+        bool
+        depends on PM_ADVANCED_DEBUG
+        default n
 config SUSPEND
        bool "Suspend to RAM and standby"
        depends on PM && ARCH_SUSPEND_POSSIBLE
@@ -222,3 +236,8 @@ config PM_RUNTIME
          and the bus type drivers of the buses the devices are on are
          responsible for the actual handling of the autosuspend requests and
          wake-up events.
+config PM_OPS
+        bool
+        depends on PM_SLEEP || PM_RUNTIME
+        default y
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c3b81c30e5d5..43191815f874 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_PM_SLEEP)		+= console.o
 obj-$(CONFIG_FREEZER)           += process.o
 obj-$(CONFIG_SUSPEND)           += suspend.o
 obj-$(CONFIG_PM_TEST_SUSPEND)   += suspend_test.o
-obj-$(CONFIG_HIBERNATION)       += swsusp.o hibernate.o snapshot.o swap.o user.o
+obj-$(CONFIG_HIBERNATION)       += hibernate.o snapshot.o swap.o user.o
 obj-$(CONFIG_HIBERNATION_NVS)   += hibernate_nvs.o
 obj-$(CONFIG_MAGIC_SYSRQ)       += poweroff.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 5187136fe1de..218e5af90156 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -6,7 +6,7 @@
 #include <linux/vt_kern.h>
 #include <linux/kbd_kern.h>
-#include <linux/console.h>
+#include <linux/vt.h>
 #include <linux/module.h>
 #include "power.h"
@@ -21,8 +21,7 @@ int pm_prepare_console(void)
        if (orig_fgconsole < 0)
                return 1;
-        orig_kmsg = kmsg_redirect;
+        orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
-        kmsg_redirect = SUSPEND_CONSOLE;
        return 0;
 }
@@ -30,7 +29,7 @@ void pm_restore_console(void)
 {
        if (orig_fgconsole >= 0) {
                vt_move_to_console(orig_fgconsole, 0);
-                kmsg_redirect = orig_kmsg;
+                vt_kmsg_redirect(orig_kmsg);
        }
 }
 #endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 04a9e90d248f..aa9e916da4d5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -22,6 +22,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
+#include <linux/gfp.h>
 #include <scsi/scsi_scan.h>
 #include <asm/suspend.h>
@@ -32,6 +33,7 @@ static int noresume = 0;
 static char resume_file[256] = CONFIG_PM_STD_PARTITION;
 dev_t swsusp_resume_device;
 sector_t swsusp_resume_block;
+int in_suspend __nosavedata = 0;
 enum {
        HIBERNATION_INVALID,
@@ -202,6 +204,35 @@ static void platform_recover(int platform_mode)
 }
 /**
+ *      swsusp_show_speed - print the time elapsed between two events.
+ *      @start: Starting event.
+ *      @stop: Final event.
+ *      @nr_pages: Number of pages processed between @start and @stop.
+ *      @msg: Introductory message to print.
+ */
+void swsusp_show_speed(struct timeval *start, struct timeval *stop,
+                        unsigned nr_pages, char *msg)
+{
+        s64 elapsed_centisecs64;
+        u64 kps64;
+        unsigned int centisecs;
+        unsigned int k;
+        unsigned int kps;
+        /* Time between the two samples, in hundredths of a second. */
+        elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
+        do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
+        centisecs = elapsed_centisecs64;
+        if (centisecs == 0)
+                centisecs = 1;  /* avoid div-by-zero */
+        /* Amount of data processed, in units of 1024 bytes. */
+        k = nr_pages * (PAGE_SIZE / 1024);
+        /*
+         * Rate in kbytes per second.  The scaling is done in 64 bits because
+         * "k * 100" evaluated as a signed int overflows as soon as the image
+         * is larger than INT_MAX / 100 kbytes (roughly 20 GiB).
+         */
+        kps64 = (u64)k * 100;
+        do_div(kps64, centisecs);
+        kps = kps64;
+        printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
+                        msg, k,
+                        centisecs / 100, centisecs % 100,
+                        kps / 1000, (kps % 1000) / 10);
+}
+/**
 *      create_image - freeze devices that need to be frozen with interrupts
 *      off, create the hibernation image and thaw those devices.  Control
 *      reappears in this routine after a restore.
@@ -293,6 +324,7 @@ static int create_image(int platform_mode)
 int hibernation_snapshot(int platform_mode)
 {
        int error;
+        gfp_t saved_mask;
        error = platform_begin(platform_mode);
        if (error)
@@ -304,6 +336,7 @@ int hibernation_snapshot(int platform_mode)
                goto Close;
        suspend_console();
+        saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
        error = dpm_suspend_start(PMSG_FREEZE);
        if (error)
                goto Recover_platform;
@@ -321,6 +354,7 @@ int hibernation_snapshot(int platform_mode)
        dpm_resume_end(in_suspend ?
                (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
+        set_gfp_allowed_mask(saved_mask);
        resume_console();
 Close:
        platform_end(platform_mode);
@@ -415,14 +449,17 @@ static int resume_target_kernel(bool platform_mode)
 int hibernation_restore(int platform_mode)
 {
        int error;
+        gfp_t saved_mask;
        pm_prepare_console();
        suspend_console();
+        saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
        error = dpm_suspend_start(PMSG_QUIESCE);
        if (!error) {
                error = resume_target_kernel(platform_mode);
                dpm_resume_end(PMSG_RECOVER);
        }
+        set_gfp_allowed_mask(saved_mask);
        resume_console();
        pm_restore_console();
        return error;
@@ -436,6 +473,7 @@ int hibernation_restore(int platform_mode)
 int hibernation_platform_enter(void)
 {
        int error;
+        gfp_t saved_mask;
        if (!hibernation_ops)
                return -ENOSYS;
@@ -451,6 +489,7 @@ int hibernation_platform_enter(void)
        entering_platform_hibernation = true;
        suspend_console();
+        saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
        error = dpm_suspend_start(PMSG_HIBERNATE);
        if (error) {
                if (hibernation_ops->recover)
@@ -488,6 +527,7 @@ int hibernation_platform_enter(void)
 Resume_devices:
        entering_platform_hibernation = false;
        dpm_resume_end(PMSG_RESTORE);
+        set_gfp_allowed_mask(saved_mask);
        resume_console();
 Close:
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c
index 39ac698ef836..fdcad9ed5a7b 100644
--- a/kernel/power/hibernate_nvs.c
+++ b/kernel/power/hibernate_nvs.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/suspend.h>
 /*
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 347d2cc88cd0..b58800b21fc0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val)
                        == NOTIFY_BAD) ? -EINVAL : 0;
 }
+/*
+ * Global switch for asynchronous device suspend/resume: when non-zero (the
+ * initial value), devices may be suspended and resumed asynchronously.
+ */
+int pm_async_enabled = 1;
+/* Show handler: prints the current value of pm_async_enabled and a newline. */
+static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
+                             char *buf)
+{
+        return sprintf(buf, "%d\n", pm_async_enabled);
+}
+/*
+ * Store handler: accepts only the decimal values 0 and 1.  Returns the number
+ * of bytes consumed (@n) on success and -EINVAL for anything else.
+ */
+static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
+                              const char *buf, size_t n)
+{
+        unsigned long requested;
+        /* The range check is only evaluated once parsing has succeeded. */
+        if (strict_strtoul(buf, 10, &requested) || requested > 1)
+                return -EINVAL;
+        pm_async_enabled = requested;
+        return n;
+}
+power_attr(pm_async);
 #ifdef CONFIG_PM_DEBUG
 int pm_test_level = TEST_NONE;
@@ -208,9 +234,12 @@ static struct attribute * g[] = {
 #ifdef CONFIG_PM_TRACE
        &pm_trace_attr.attr,
 #endif
-#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG)
+#ifdef CONFIG_PM_SLEEP
+        &pm_async_attr.attr,
+#ifdef CONFIG_PM_DEBUG
        &pm_test_attr.attr,
 #endif
+#endif
        NULL,
 };
@@ -220,6 +249,7 @@ static struct attribute_group attr_group = {
 #ifdef CONFIG_PM_RUNTIME
 struct workqueue_struct *pm_wq;
+EXPORT_SYMBOL_GPL(pm_wq);
 static int __init pm_start_workqueue(void)
 {
diff --git a/kernel/power/process.c b/kernel/power/process.c
index cc2e55373b68..71ae29052ab6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/syscalls.h>
 #include <linux/freezer.h>
+#include <linux/delay.h>
 /* 
 * Timeout for stopping processes
@@ -41,7 +42,7 @@ static int try_to_freeze_tasks(bool sig_only)
        do_gettimeofday(&start);
        end_time = jiffies + TIMEOUT;
-        do {
+        while (true) {
                todo = 0;
                read_lock(&tasklist_lock);
                do_each_thread(g, p) {
@@ -62,10 +63,15 @@ static int try_to_freeze_tasks(bool sig_only)
                                todo++;
                } while_each_thread(g, p);
                read_unlock(&tasklist_lock);
-                yield();                        /* Yield is okay here */
+                if (!todo || time_after(jiffies, end_time))
-                if (time_after(jiffies, end_time))
                        break;
-        } while (todo);
+                /*
+                 * We need to retry, but first give the freezing tasks some
+                 * time to enter the refrigerator.
+                 */
+                msleep(10);
+        }
        do_gettimeofday(&end);
        elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start);
@@ -82,12 +88,11 @@ static int try_to_freeze_tasks(bool sig_only)
                printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
                                "(%d tasks refusing to freeze):\n",
                                elapsed_csecs / 100, elapsed_csecs % 100, todo);
-                show_state();
                read_lock(&tasklist_lock);
                do_each_thread(g, p) {
                        task_lock(p);
                        if (freezing(p) && !freezer_should_skip(p))
-                                printk(KERN_ERR " %s\n", p->comm);
+                                sched_show_task(p);
                        cancel_freezing(p);
                        task_unlock(p);
                } while_each_thread(g, p);
@@ -139,7 +144,7 @@ static void thaw_tasks(bool nosig_only)
                if (nosig_only && should_send_signal(p))
                        continue;
-                if (cgroup_frozen(p))
+                if (cgroup_freezing_or_frozen(p))
                        continue;
                thaw_process(p);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 36cb168e4330..be861c26dda7 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -26,6 +26,7 @@
 #include <linux/console.h>
 #include <linux/highmem.h>
 #include <linux/list.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -1181,7 +1182,7 @@ static void free_unnecessary_pages(void)
        memory_bm_position_reset(&copy_bm);
-        while (to_free_normal > 0 && to_free_highmem > 0) {
+        while (to_free_normal > 0 || to_free_highmem > 0) {
                unsigned long pfn = memory_bm_next_pfn(&copy_bm);
                struct page *page = pfn_to_page(pfn);
@@ -1500,7 +1501,7 @@ asmlinkage int swsusp_save(void)
 {
        unsigned int nr_pages, nr_highmem;
-        printk(KERN_INFO "PM: Creating hibernation image: \n");
+        printk(KERN_INFO "PM: Creating hibernation image:\n");
        drain_local_pages(NULL);
        nr_pages = count_data_pages();
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6f10dfc2d3e9..56e7dbb8b996 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -15,6 +15,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
+#include <linux/gfp.h>
 #include "power.h"
@@ -189,6 +190,7 @@ static int suspend_enter(suspend_state_t state)
 int suspend_devices_and_enter(suspend_state_t state)
 {
        int error;
+        gfp_t saved_mask;
        if (!suspend_ops)
                return -ENOSYS;
@@ -199,6 +201,7 @@ int suspend_devices_and_enter(suspend_state_t state)
                        goto Close;
        }
        suspend_console();
+        saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
        suspend_test_start();
        error = dpm_suspend_start(PMSG_SUSPEND);
        if (error) {
@@ -215,6 +218,7 @@ int suspend_devices_and_enter(suspend_state_t state)
        suspend_test_start();
        dpm_resume_end(PMSG_RESUME);
        suspend_test_finish("resume devices");
+        set_gfp_allowed_mask(saved_mask);
        resume_console();
 Close:
        if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 890f6b11b1d3..66824d71983a 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -23,6 +23,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/pm.h>
+#include <linux/slab.h>
 #include "power.h"
@@ -38,6 +39,107 @@ struct swsusp_header {
 static struct swsusp_header *swsusp_header;
+/*
+ *      The following functions are used for tracking the allocated
+ *      swap pages, so that they can be freed in case of an error.
+ *
+ *      Each struct swsusp_extent describes one run of consecutive swap
+ *      offsets and is a node of the swsusp_extents red-black tree.
+ */
+struct swsusp_extent {
+        struct rb_node node;    /* linkage into the swsusp_extents tree */
+        unsigned long start;    /* first swap offset of the run (inclusive) */
+        unsigned long end;      /* last swap offset of the run (inclusive) */
+};
+static struct rb_root swsusp_extents = RB_ROOT; /* root of the tree of recorded extents */
+/*
+ * swsusp_extents_insert - record @swap_offset as allocated.
+ *
+ * Walks the extent tree towards the position of @swap_offset.  If the offset
+ * is directly adjacent to an extent visited on the way down, that extent is
+ * grown by one and no node is allocated; otherwise a new single-offset extent
+ * is linked in at the leaf position that was reached.
+ *
+ * Returns 0 on success, -EINVAL if the offset already lies inside an extent,
+ * or -ENOMEM if the new extent cannot be allocated.
+ */
+static int swsusp_extents_insert(unsigned long swap_offset)
+{
+        struct rb_node **link = &swsusp_extents.rb_node;
+        struct rb_node *parent = NULL;
+        struct swsusp_extent *ext;
+        /* Descend to the insertion point, merging on the way if possible. */
+        while (*link) {
+                parent = *link;
+                ext = container_of(parent, struct swsusp_extent, node);
+                if (swap_offset >= ext->start && swap_offset <= ext->end) {
+                        /* It already is in the tree */
+                        return -EINVAL;
+                }
+                if (swap_offset < ext->start) {
+                        if (swap_offset == ext->start - 1) {
+                                /* Adjacent below: grow the extent downwards. */
+                                ext->start = swap_offset;
+                                return 0;
+                        }
+                        link = &parent->rb_left;
+                } else {
+                        if (swap_offset == ext->end + 1) {
+                                /* Adjacent above: grow the extent upwards. */
+                                ext->end = swap_offset;
+                                return 0;
+                        }
+                        link = &parent->rb_right;
+                }
+        }
+        /* Nothing to merge with: add a one-offset extent and rebalance. */
+        ext = kzalloc(sizeof(*ext), GFP_KERNEL);
+        if (!ext)
+                return -ENOMEM;
+        ext->start = swap_offset;
+        ext->end = swap_offset;
+        rb_link_node(&ext->node, parent, link);
+        rb_insert_color(&ext->node, &swsusp_extents);
+        return 0;
+}
+/**
+ *      alloc_swapdev_block - get a swap page of type @swap and record its
+ *      offset in the extent tree, so that it can be released again by
+ *      free_all_swap_pages() if something goes wrong.
+ *
+ *      Returns the value of swapdev_block() for the new page, or 0 if
+ *      get_swap_page_of_type() yields offset 0 or the offset cannot be
+ *      recorded (the page is then given back with swap_free()).
+ */
+sector_t alloc_swapdev_block(int swap)
+{
+        unsigned long offset = swp_offset(get_swap_page_of_type(swap));
+        if (!offset)
+                return 0;
+        if (swsusp_extents_insert(offset)) {
+                /* The page cannot be tracked, so do not keep it. */
+                swap_free(swp_entry(swap, offset));
+                return 0;
+        }
+        return swapdev_block(swap, offset);
+}
+/**
+ *      free_all_swap_pages - release every swap page recorded in the extent
+ *      tree for swap type @swap, and free the extents that were used to
+ *      register which swap entries had been allocated.
+ */
+void free_all_swap_pages(int swap)
+{
+        for (;;) {
+                struct rb_node *root = swsusp_extents.rb_node;
+                struct swsusp_extent *ext;
+                unsigned long offset;
+                if (!root)
+                        break;
+                /* Detach the current root, then release what it covers. */
+                ext = container_of(root, struct swsusp_extent, node);
+                rb_erase(root, &swsusp_extents);
+                offset = ext->start;
+                while (offset <= ext->end) {
+                        swap_free(swp_entry(swap, offset));
+                        offset++;
+                }
+                kfree(ext);
+        }
+}
+/*
+ * swsusp_swap_in_use - tell whether any swap extent is currently recorded.
+ * Returns 1 if the extent tree is non-empty and 0 otherwise.
+ */
+int swsusp_swap_in_use(void)
+{
+        if (swsusp_extents.rb_node)
+                return 1;
+        return 0;
+}
 /*
 * General things
 */
@@ -336,7 +438,7 @@ static int save_image(struct swap_map_handle *handle,
                if (ret)
                        break;
                if (!(nr_pages % m))
-                        printk("\b\b\b\b%3d%%", nr_pages / m);
+                        printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
                nr_pages++;
        }
        err2 = wait_on_bio_chain(&bio);
@@ -344,9 +446,9 @@ static int save_image(struct swap_map_handle *handle,
        if (!ret)
                ret = err2;
        if (!ret)
-                printk("\b\b\b\bdone\n");
+                printk(KERN_CONT "\b\b\b\bdone\n");
        else
-                printk("\n");
+                printk(KERN_CONT "\n");
        swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
        return ret;
 }
@@ -556,10 +658,6 @@ int swsusp_read(unsigned int *flags_p)
        struct swsusp_info *header;
        *flags_p = swsusp_header->flags;
-        if (IS_ERR(resume_bdev)) {
-                pr_debug("PM: Image device not initialised\n");
-                return PTR_ERR(resume_bdev);
-        }
        memset(&snapshot, 0, sizeof(struct snapshot_handle));
        error = snapshot_write_next(&snapshot, PAGE_SIZE);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
deleted file mode 100644
index 6a07f4dbf2f8..000000000000
--- a/kernel/power/swsusp.c
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * linux/kernel/power/swsusp.c
- *
- * This file provides code to write suspend image to swap and read it back.
- *
- * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
- *
- * This file is released under the GPLv2.
- *
- * I'd like to thank the following people for their work:
- *
- * Pavel Machek <pavel@ucw.cz>:
- * Modifications, defectiveness pointing, being with me at the very beginning,
- * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
- *
- * Steve Doddi <dirk@loth.demon.co.uk>:
- * Support the possibility of hardware state restoring.
- *
- * Raph <grey.havens@earthling.net>:
- * Support for preserving states of network devices and virtual console
- * (including X and svgatextmode)
- *
- * Kurt Garloff <garloff@suse.de>:
- * Straightened the critical function in order to prevent compilers from
- * playing tricks with local variables.
- *
- * Andreas Mohr <a.mohr@mailto.de>
- *
- * Alex Badea <vampire@go.ro>:
- * Fixed runaway init
- *
- * Rafael J. Wysocki <rjw@sisk.pl>
- * Reworked the freeing of memory and the handling of swap
- *
- * More state savers are welcome. Especially for the scsi layer...
- *
- * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
- */
-#include <linux/mm.h>
-#include <linux/suspend.h>
-#include <linux/spinlock.h>
-#include <linux/kernel.h>
-#include <linux/major.h>
-#include <linux/swap.h>
-#include <linux/pm.h>
-#include <linux/swapops.h>
-#include <linux/bootmem.h>
-#include <linux/syscalls.h>
-#include <linux/highmem.h>
-#include <linux/time.h>
-#include <linux/rbtree.h>
-#include <linux/io.h>
-#include "power.h"
-int in_suspend __nosavedata = 0;
-/**
- *      The following functions are used for tracing the allocated
- *      swap pages, so that they can be freed in case of an error.
- */
-struct swsusp_extent {
-        struct rb_node node;
-        unsigned long start;
-        unsigned long end;
-};
-static struct rb_root swsusp_extents = RB_ROOT;
-static int swsusp_extents_insert(unsigned long swap_offset)
-{
-        struct rb_node **new = &(swsusp_extents.rb_node);
-        struct rb_node *parent = NULL;
-        struct swsusp_extent *ext;
-        /* Figure out where to put the new node */
-        while (*new) {
-                ext = container_of(*new, struct swsusp_extent, node);
-                parent = *new;
-                if (swap_offset < ext->start) {
-                        /* Try to merge */
-                        if (swap_offset == ext->start - 1) {
-                                ext->start--;
-                                return 0;
-                        }
-                        new = &((*new)->rb_left);
-                } else if (swap_offset > ext->end) {
-                        /* Try to merge */
-                        if (swap_offset == ext->end + 1) {
-                                ext->end++;
-                                return 0;
-                        }
-                        new = &((*new)->rb_right);
-                } else {
-                        /* It already is in the tree */
-                        return -EINVAL;
-                }
-        }
-        /* Add the new node and rebalance the tree. */
-        ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
-        if (!ext)
-                return -ENOMEM;
-        ext->start = swap_offset;
-        ext->end = swap_offset;
-        rb_link_node(&ext->node, parent, new);
-        rb_insert_color(&ext->node, &swsusp_extents);
-        return 0;
-}
-/**
- *      alloc_swapdev_block - allocate a swap page and register that it has
- *      been allocated, so that it can be freed in case of an error.
- */
-sector_t alloc_swapdev_block(int swap)
-{
-        unsigned long offset;
-        offset = swp_offset(get_swap_page_of_type(swap));
-        if (offset) {
-                if (swsusp_extents_insert(offset))
-                        swap_free(swp_entry(swap, offset));
-                else
-                        return swapdev_block(swap, offset);
-        }
-        return 0;
-}
-/**
- *      free_all_swap_pages - free swap pages allocated for saving image data.
- *      It also frees the extents used to register which swap entres had been
- *      allocated.
- */
-void free_all_swap_pages(int swap)
-{
-        struct rb_node *node;
-        while ((node = swsusp_extents.rb_node)) {
-                struct swsusp_extent *ext;
-                unsigned long offset;
-                ext = container_of(node, struct swsusp_extent, node);
-                rb_erase(node, &swsusp_extents);
-                for (offset = ext->start; offset <= ext->end; offset++)
-                        swap_free(swp_entry(swap, offset));
-                kfree(ext);
-        }
-}
-int swsusp_swap_in_use(void)
-{
-        return (swsusp_extents.rb_node != NULL);
-}
-/**
- *      swsusp_show_speed - print the time elapsed between two events represented by
- *      @start and @stop
- *
- *      @nr_pages -     number of pages processed between @start and @stop
- *      @msg -          introductory message to print
- */
-void swsusp_show_speed(struct timeval *start, struct timeval *stop,
-                        unsigned nr_pages, char *msg)
-{
-        s64 elapsed_centisecs64;
-        int centisecs;
-        int k;
-        int kps;
-        elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
-        do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
-        centisecs = elapsed_centisecs64;
-        if (centisecs == 0)
-                centisecs = 1;  /* avoid div-by-zero */
-        k = nr_pages * (PAGE_SIZE / 1024);
-        kps = (k * 100) / centisecs;
-        printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
-                        msg, k,
-                        centisecs / 100, centisecs % 100,
-                        kps / 1000, (kps % 1000) / 10);
-}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bf0014d6a5f0..a8c96212bc1b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
        return res;
 }
+/*
+ * snapshot_deprecated_ioctl - warn that ioctl number @cmd is deprecated.
+ * The notice is subject to printk_ratelimit() and identifies the caller by
+ * printing this function's return address with %pf.
+ */
+static void snapshot_deprecated_ioctl(unsigned int cmd)
+{
+        if (!printk_ratelimit())
+                return;
+        printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
+                        "be removed soon, update your suspend-to-disk "
+                        "utilities\n",
+                        __builtin_return_address(0), cmd);
+}
 static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                                                        unsigned long arg)
 {
@@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                data->frozen = 0;
                break;
-        case SNAPSHOT_CREATE_IMAGE:
        case SNAPSHOT_ATOMIC_SNAPSHOT:
+                snapshot_deprecated_ioctl(cmd);
+        case SNAPSHOT_CREATE_IMAGE:
                if (data->mode != O_RDONLY || !data->frozen  || data->ready) {
                        error = -EPERM;
                        break;
@@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                data->ready = 0;
                break;
-        case SNAPSHOT_PREF_IMAGE_SIZE:
        case SNAPSHOT_SET_IMAGE_SIZE:
+                snapshot_deprecated_ioctl(cmd);
+        case SNAPSHOT_PREF_IMAGE_SIZE:
                image_size = arg;
                break;
@@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                error = put_user(size, (loff_t __user *)arg);
                break;
-        case SNAPSHOT_AVAIL_SWAP_SIZE:
        case SNAPSHOT_AVAIL_SWAP:
+                snapshot_deprecated_ioctl(cmd);
+        case SNAPSHOT_AVAIL_SWAP_SIZE:
                size = count_swap_pages(data->swap, 1);
                size <<= PAGE_SHIFT;
                error = put_user(size, (loff_t __user *)arg);
                break;
-        case SNAPSHOT_ALLOC_SWAP_PAGE:
        case SNAPSHOT_GET_SWAP_PAGE:
+                snapshot_deprecated_ioctl(cmd);
+        case SNAPSHOT_ALLOC_SWAP_PAGE:
                if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
                        error = -ENODEV;
                        break;
@@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                break;
        case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
+                snapshot_deprecated_ioctl(cmd);
                if (!swsusp_swap_in_use()) {
                        /*
                         * User space encodes device types as two-byte values,
@@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                break;
        case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
+                snapshot_deprecated_ioctl(cmd);
                error = -EINVAL;
                switch (arg) {
@@ -405,7 +420,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                         * User space encodes device types as two-byte values,
                         * so we need to recode them
                         */
-                        swdev = old_decode_dev(swap_area.dev);
+                        swdev = new_decode_dev(swap_area.dev);
                        if (swdev) {
                                offset = swap_area.offset;
                                data->swap = swap_type_of(swdev, offset, NULL);
diff --git a/kernel/printk.c b/kernel/printk.c
index 6712a252b306..ee54355cfdf1 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,6 +33,9 @@
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
 #include <linux/kexec.h>
+#include <linux/ratelimit.h>
+#include <linux/kmsg_dump.h>
+#include <linux/syslog.h>
 #include <asm/uaccess.h>
@@ -67,8 +70,6 @@ int console_printk[4] = {
        DEFAULT_CONSOLE_LOGLEVEL,       /* default_console_loglevel */
 };
-static int saved_console_loglevel = -1;
 /*
 * divert printk() messages when there is a LITMUS^RT debug listener
 */
@@ -150,6 +151,7 @@ static char __log_buf[__LOG_BUF_LEN];
 static char *log_buf = __log_buf;
 static int log_buf_len = __LOG_BUF_LEN;
 static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
+static int saved_console_loglevel = -1;
 #ifdef CONFIG_KEXEC
 /*
@@ -263,38 +265,23 @@ static inline void boot_delay_msec(void)
 }
 #endif
-/*
+int do_syslog(int type, char __user *buf, int len, bool from_file)
- * Commands to do_syslog:
- *
- *      0 -- Close the log.  Currently a NOP.
- *      1 -- Open the log. Currently a NOP.
- *      2 -- Read from the log.
- *      3 -- Read all messages remaining in the ring buffer.
- *      4 -- Read and clear all messages remaining in the ring buffer
- *      5 -- Clear ring buffer.
- *      6 -- Disable printk's to console
- *      7 -- Enable printk's to console
- *      8 -- Set level of messages printed to console
- *      9 -- Return number of unread characters in the log buffer
- *     10 -- Return size of the log buffer
- */
-int do_syslog(int type, char __user *buf, int len)
 {
        unsigned i, j, limit, count;
        int do_clear = 0;
        char c;
        int error = 0;
-        error = security_syslog(type);
+        error = security_syslog(type, from_file);
        if (error)
                return error;
        switch (type) {
-        case 0:         /* Close log */
+        case SYSLOG_ACTION_CLOSE:       /* Close log */
                break;
-        case 1:         /* Open log */
+        case SYSLOG_ACTION_OPEN:        /* Open log */
                break;
-        case 2:         /* Read from log */
+        case SYSLOG_ACTION_READ:        /* Read from log */
                error = -EINVAL;
                if (!buf || len < 0)
                        goto out;
@@ -325,10 +312,12 @@ int do_syslog(int type, char __user *buf, int len)
                if (!error)
                        error = i;
                break;
-        case 4:         /* Read/clear last kernel messages */
+        /* Read/clear last kernel messages */
+        case SYSLOG_ACTION_READ_CLEAR:
                do_clear = 1;
                /* FALL THRU */
-        case 3:         /* Read last kernel messages */
+        /* Read last kernel messages */
+        case SYSLOG_ACTION_READ_ALL:
                error = -EINVAL;
                if (!buf || len < 0)
                        goto out;
@@ -381,21 +370,25 @@ int do_syslog(int type, char __user *buf, int len)
                        }
                }
                break;
-        case 5:         /* Clear ring buffer */
+        /* Clear ring buffer */
+        case SYSLOG_ACTION_CLEAR:
                logged_chars = 0;
                break;
-        case 6:         /* Disable logging to console */
+        /* Disable logging to console */
+        case SYSLOG_ACTION_CONSOLE_OFF:
                if (saved_console_loglevel == -1)
                        saved_console_loglevel = console_loglevel;
                console_loglevel = minimum_console_loglevel;
                break;
-        case 7:         /* Enable logging to console */
+        /* Enable logging to console */
+        case SYSLOG_ACTION_CONSOLE_ON:
                if (saved_console_loglevel != -1) {
                        console_loglevel = saved_console_loglevel;
                        saved_console_loglevel = -1;
                }
                break;
-        case 8:         /* Set level of messages printed to console */
+        /* Set level of messages printed to console */
+        case SYSLOG_ACTION_CONSOLE_LEVEL:
                error = -EINVAL;
                if (len < 1 || len > 8)
                        goto out;
@@ -406,10 +399,12 @@ int do_syslog(int type, char __user *buf, int len)
                saved_console_loglevel = -1;
                error = 0;
                break;
-        case 9:         /* Number of chars in the log buffer */
+        /* Number of chars in the log buffer */
+        case SYSLOG_ACTION_SIZE_UNREAD:
                error = log_end - log_start;
                break;
-        case 10:        /* Size of the log buffer */
+        /* Size of the log buffer */
+        case SYSLOG_ACTION_SIZE_BUFFER:
                error = log_buf_len;
                break;
        default:
@@ -422,7 +417,7 @@ out:
 SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
 {
-        return do_syslog(type, buf, len);
+        return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
 }
 /*
@@ -1386,11 +1381,11 @@ late_initcall(disable_boot_consoles);
 */
 DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
-int printk_ratelimit(void)
+int __printk_ratelimit(const char *func)
 {
-        return __ratelimit(&printk_ratelimit_state);
+        return ___ratelimit(&printk_ratelimit_state, func);
 }
-EXPORT_SYMBOL(printk_ratelimit);
+EXPORT_SYMBOL(__printk_ratelimit);
 /**
 * printk_timed_ratelimit - caller-controlled printk ratelimiting
@@ -1414,4 +1409,123 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies,
        return false;
 }
 EXPORT_SYMBOL(printk_timed_ratelimit);
+static DEFINE_SPINLOCK(dump_list_lock);
+static LIST_HEAD(dump_list);
+/**
+ * kmsg_dump_register - register a kernel log dumper.
+ * @dumper: pointer to the kmsg_dumper structure
+ *
+ * Adds a kernel log dumper to the system. The dump callback in the
+ * structure will be called when the kernel oopses or panics and must be
+ * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise.
+ */
+int kmsg_dump_register(struct kmsg_dumper *dumper)
+{
+        unsigned long irq_state;
+        int ret;
+        /* A dumper without a dump callback cannot be used: reject it. */
+        if (dumper->dump == NULL)
+                return -EINVAL;
+        spin_lock_irqsave(&dump_list_lock, irq_state);
+        if (dumper->registered) {
+                /* Already on the list: refuse a second registration. */
+                ret = -EBUSY;
+        } else {
+                dumper->registered = 1;
+                list_add_tail(&dumper->list, &dump_list);
+                ret = 0;
+        }
+        spin_unlock_irqrestore(&dump_list_lock, irq_state);
+        return ret;
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_register);
+/**
+ * kmsg_dump_unregister - unregister a kmsg dumper.
+ * @dumper: pointer to the kmsg_dumper structure
+ *
+ * Removes a dump device from the system. Returns zero on success and
+ * %-EINVAL otherwise.
+ */
+int kmsg_dump_unregister(struct kmsg_dumper *dumper)
+{
+        unsigned long irq_state;
+        int ret;
+        spin_lock_irqsave(&dump_list_lock, irq_state);
+        if (!dumper->registered) {
+                /* Not on the list, so there is nothing to remove. */
+                ret = -EINVAL;
+        } else {
+                dumper->registered = 0;
+                list_del(&dumper->list);
+                ret = 0;
+        }
+        spin_unlock_irqrestore(&dump_list_lock, irq_state);
+        return ret;
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
+/* Printable names for the dump reasons, indexed by enum kmsg_dump_reason. */
+static const char * const kmsg_reasons[] = {
+        [KMSG_DUMP_OOPS]        = "oops",
+        [KMSG_DUMP_PANIC]       = "panic",
+        [KMSG_DUMP_KEXEC]       = "kexec",
+};
+/*
+ * Map a dump reason to its printable name; values that fall outside the
+ * kmsg_reasons table yield "unknown".
+ */
+static const char *kmsg_to_str(enum kmsg_dump_reason reason)
+{
+        int known = reason >= 0 && reason < ARRAY_SIZE(kmsg_reasons);
+        return known ? kmsg_reasons[reason] : "unknown";
+}
+/**
+ * kmsg_dump - dump kernel log to kernel message dumpers.
+ * @reason: the reason (oops, panic etc) for dumping
+ *
+ * Iterate through each of the dump devices and call the oops/panic
+ * callbacks with the log buffer.
+ */
+void kmsg_dump(enum kmsg_dump_reason reason)
+{
+        unsigned long end;
+        unsigned chars;
+        struct kmsg_dumper *dumper;
+        const char *s1, *s2;
+        unsigned long l1, l2;
+        unsigned long flags;
+        /* Theoretically, the log could move on after we do this, but
+           there's not a lot we can do about that. The new messages
+           will overwrite the start of what we dump. */
+        spin_lock_irqsave(&logbuf_lock, flags);
+        end = log_end & LOG_BUF_MASK;
+        chars = logged_chars;
+        spin_unlock_irqrestore(&logbuf_lock, flags);
+        /*
+         * Work only from the snapshot taken under logbuf_lock: re-reading
+         * logged_chars here could pair a newer count with the older "end"
+         * and yield inconsistent pointers and lengths.
+         */
+        if (chars > end) {
+                /* Two pieces: the end of the buffer, then its beginning. */
+                s1 = log_buf + log_buf_len - chars + end;
+                l1 = chars - end;
+                s2 = log_buf;
+                l2 = end;
+        } else {
+                /* One piece: the first segment is left empty. */
+                s1 = "";
+                l1 = 0;
+                s2 = log_buf + end - chars;
+                l2 = chars;
+        }
+        /* Only trylock: if the list lock is already held, skip the dump. */
+        if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
+                printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n",
+                                kmsg_to_str(reason));
+                return;
+        }
+        list_for_each_entry(dumper, &dump_list, list)
+                dumper->dump(dumper, reason, s1, l1, s2, l2);
+        spin_unlock_irqrestore(&dump_list_lock, flags);
+}
 #endif
diff --git a/kernel/profile.c b/kernel/profile.c
index a55d3a367ae8..dfadc5b729f1 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -127,8 +127,10 @@ int __ref profile_init(void)
                return 0;
        prof_buffer = vmalloc(buffer_bytes);
-        if (prof_buffer)
+        if (prof_buffer) {
+                memset(prof_buffer, 0, buffer_bytes);
                return 0;
+        }
        free_cpumask_var(prof_cpu_mask);
        return -ENOMEM;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 23bd09cd042e..42ad8ae729a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
+#include <linux/regset.h>
 /*
@@ -511,6 +512,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
        return 0;
 }
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+/*
+ * Look up the regset in @view whose core_note_type equals @type.
+ * Returns a pointer to the matching entry, or NULL when none matches.
+ */
+static const struct user_regset *
+find_regset(const struct user_regset_view *view, unsigned int type)
+{
+        int idx;
+        for (idx = 0; idx < view->n; idx++) {
+                if (view->regsets[idx].core_note_type == type)
+                        return &view->regsets[idx];
+        }
+        return NULL;
+}
+/*
+ * Transfer the regset of @task selected by @type to or from the user
+ * buffer described by @kiov.
+ *
+ * Returns -EINVAL when no regset matches @type or when the requested
+ * length is not a multiple of the regset's element size.  The length is
+ * clamped to the regset's total size (n * size) and written back to
+ * kiov->iov_len.  PTRACE_GETREGSET copies to user space; any other @req
+ * copies from user space.
+ */
+static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
+                         struct iovec *kiov)
+{
+        const struct user_regset_view *view = task_user_regset_view(task);
+        const struct user_regset *regset = find_regset(view, type);
+        int regset_no;
+        if (!regset)
+                return -EINVAL;
+        if (kiov->iov_len % regset->size)
+                return -EINVAL;
+        /* Index of the regset inside the view's array. */
+        regset_no = regset - view->regsets;
+        kiov->iov_len = min(kiov->iov_len,
+                            (__kernel_size_t) (regset->n * regset->size));
+        if (req != PTRACE_GETREGSET)
+                return copy_regset_from_user(task, view, regset_no, 0,
+                                             kiov->iov_len, kiov->iov_base);
+        return copy_regset_to_user(task, view, regset_no, 0,
+                                   kiov->iov_len, kiov->iov_base);
+}
+#endif
 int ptrace_request(struct task_struct *child, long request,
                   long addr, long data)
 {
@@ -573,6 +615,26 @@ int ptrace_request(struct task_struct *child, long request,
                        return 0;
                return ptrace_resume(child, request, SIGKILL);
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+        case PTRACE_GETREGSET:
+        case PTRACE_SETREGSET:
+        {
+                struct iovec kiov;
+                struct iovec __user *uiov = (struct iovec __user *) data;
+                if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
+                        return -EFAULT;
+                if (__get_user(kiov.iov_base, &uiov->iov_base) ||
+                    __get_user(kiov.iov_len, &uiov->iov_len))
+                        return -EFAULT;
+                ret = ptrace_regset(child, request, addr, &kiov);
+                if (!ret)
+                        ret = __put_user(kiov.iov_len, &uiov->iov_len);
+                break;
+        }
+#endif
        default:
                break;
        }
@@ -711,6 +773,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
                else
                        ret = ptrace_setsiginfo(child, &siginfo);
                break;
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+        case PTRACE_GETREGSET:
+        case PTRACE_SETREGSET:
+        {
+                struct iovec kiov;
+                struct compat_iovec __user *uiov =
+                        (struct compat_iovec __user *) datap;
+                compat_uptr_t ptr;
+                compat_size_t len;
+                if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
+                        return -EFAULT;
+                if (__get_user(ptr, &uiov->iov_base) ||
+                    __get_user(len, &uiov->iov_len))
+                        return -EFAULT;
+                kiov.iov_base = compat_ptr(ptr);
+                kiov.iov_len = len;
+                ret = ptrace_regset(child, request, addr, &kiov);
+                if (!ret)
+                        ret = __put_user(kiov.iov_len, &uiov->iov_len);
+                break;
+        }
+#endif
        default:
                ret = ptrace_request(child, request, addr, data);
diff --git a/kernel/range.c b/kernel/range.c
new file mode 100644
index 000000000000..74e2e6114927
--- /dev/null
+++ b/kernel/range.c
@@ -0,0 +1,163 @@
+/*
+ * Range add and subtract
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sort.h>
+#include <linux/range.h>
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+/*
+ * Store start/end in slot @nr_range of @range, an array with @az slots.
+ * Returns the new entry count, or @nr_range unchanged when start >= end
+ * or when no slot is left.
+ */
+int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
+{
+        /* Nothing to add, or out of slots. */
+        if (start >= end || nr_range >= az)
+                return nr_range;
+        range[nr_range].start = start;
+        range[nr_range].end = end;
+        return nr_range + 1;
+}
+/*
+ * Add start/end to @range, widening the first in-use entry it overlaps or
+ * touches instead of consuming a new slot.  Falls back to add_range() when
+ * no such entry exists.  Returns the resulting entry count.
+ */
+int add_range_with_merge(struct range *range, int az, int nr_range,
+                     u64 start, u64 end)
+{
+        int i;
+        if (start >= end)
+                return nr_range;
+        for (i = 0; i < nr_range; i++) {
+                struct range *cur = &range[i];
+                /* A zero end marks an unused slot. */
+                if (!cur->end)
+                        continue;
+                /* No common point: the larger start is past the smaller end. */
+                if (max(cur->start, start) > min(cur->end, end))
+                        continue;
+                /* Grow the existing entry to cover both ranges. */
+                cur->start = min(cur->start, start);
+                cur->end = max(cur->end, end);
+                return nr_range;
+        }
+        /* Nothing to merge with: take a fresh slot. */
+        return add_range(range, az, nr_range, start, end);
+}
+/*
+ * Remove the span start..end from every in-use entry of @range.  All @az
+ * slots are scanned; a slot whose end is zero is treated as unused.  An
+ * entry that strictly contains the span is split in two, which needs a
+ * spare slot.
+ */
+void subtract_range(struct range *range, int az, u64 start, u64 end)
+{
+        int i, j;
+        if (start >= end)
+                return;
+        for (j = 0; j < az; j++) {
+                if (!range[j].end)
+                        continue;
+                /* Entry entirely covered by the span: drop it. */
+                if (start <= range[j].start && end >= range[j].end) {
+                        range[j].start = 0;
+                        range[j].end = 0;
+                        continue;
+                }
+                /* Span covers the front of the entry: move its start up. */
+                if (start <= range[j].start && end < range[j].end &&
+                    range[j].start < end) {
+                        range[j].start = end;
+                        continue;
+                }
+                /* Span covers the back of the entry: pull its end down. */
+                if (start > range[j].start && end >= range[j].end &&
+                    range[j].end > start) {
+                        range[j].end = start;
+                        continue;
+                }
+                /* Span lies strictly inside the entry: split it. */
+                if (start > range[j].start && end < range[j].end) {
+                        /* Find the new spare: */
+                        for (i = 0; i < az; i++) {
+                                if (range[i].end == 0)
+                                        break;
+                        }
+                        if (i < az) {
+                                range[i].end = range[j].end;
+                                range[i].start = end;
+                        } else {
+                                /* No spare slot: the upper piece is lost. */
+                                printk(KERN_ERR "%s: run out of slot in ranges\n",
+                                       __func__);
+                        }
+                        range[j].end = start;
+                        continue;
+                }
+        }
+}
+/*
+ * sort() comparison callback: order two struct range entries by ->start.
+ *
+ * The starts are compared explicitly, in the field's own type, rather than
+ * subtracted: the callback returns int, so a 64-bit difference would be
+ * truncated and could report the wrong sign (or 0) for starts that are
+ * 2^32 or more apart.
+ */
+static int cmp_range(const void *x1, const void *x2)
+{
+        const struct range *r1 = x1;
+        const struct range *r2 = x2;
+        if (r1->start < r2->start)
+                return -1;
+        if (r1->start > r2->start)
+                return 1;
+        return 0;
+}
+/*
+ * Compact the in-use entries of @range (those with a non-zero end) to the
+ * front of the @az-slot array, sort them by start and return how many
+ * there are.
+ */
+int clean_sort_range(struct range *range, int az)
+{
+        /*
+         * nr_range starts at az so that a completely full array, in which
+         * the counting loop below finds no empty slot, is reported and
+         * sorted in full instead of being treated as empty.
+         */
+        int i, j, k = az - 1, nr_range = az;
+        /* Fill each hole at i with the last in-use entry at or below k. */
+        for (i = 0; i < k; i++) {
+                if (range[i].end)
+                        continue;
+                for (j = k; j > i; j--) {
+                        if (range[j].end) {
+                                k = j;
+                                break;
+                        }
+                }
+                /* No in-use entry above the hole: compaction is done. */
+                if (j == i)
+                        break;
+                range[i].start = range[k].start;
+                range[i].end   = range[k].end;
+                range[k].start = 0;
+                range[k].end   = 0;
+                k--;
+        }
+        /* count it */
+        for (i = 0; i < az; i++) {
+                if (!range[i].end) {
+                        nr_range = i;
+                        break;
+                }
+        }
+        /* sort them */
+        sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
+        return nr_range;
+}
+/* Sort the first @nr_range entries of @range using cmp_range(). */
+void sort_range(struct range *range, int nr_range)
+{
+        sort(range, nr_range, sizeof(*range), cmp_range, NULL);
+}
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 400183346ad2..49d808e833b0 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,143 +45,91 @@
 #include <linux/mutex.h>
 #include <linux/module.h>
 #include <linux/kernel_stat.h>
+#include <linux/hardirq.h>
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key rcu_lock_key;
 struct lockdep_map rcu_lock_map =
        STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
 EXPORT_SYMBOL_GPL(rcu_lock_map);
-#endif
-int rcu_scheduler_active __read_mostly;
+static struct lock_class_key rcu_bh_lock_key;
+struct lockdep_map rcu_bh_lock_map =
+        STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
+EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
-/*
+static struct lock_class_key rcu_sched_lock_key;
- * Awaken the corresponding synchronize_rcu() instance now that a
+struct lockdep_map rcu_sched_lock_map =
- * grace period has elapsed.
+        STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
- */
+EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
-void wakeme_after_rcu(struct rcu_head  *head)
+#endif
-{
-        struct rcu_synchronize *rcu;
-        rcu = container_of(head, struct rcu_synchronize, head);
+int rcu_scheduler_active __read_mostly;
-        complete(&rcu->completion);
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-}
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
-/**
+int debug_lockdep_rcu_enabled(void)
- * synchronize_rcu - wait until a grace period has elapsed.
- *
- * Control will return to the caller some time after a full grace
- * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
- */
-void synchronize_rcu(void)
 {
-        struct rcu_synchronize rcu;
+        return rcu_scheduler_active && debug_locks &&
+               current->lockdep_recursion == 0;
-        if (!rcu_scheduler_active)
-                return;
-        init_completion(&rcu.completion);
-        /* Will wake me after RCU finished. */
-        call_rcu(&rcu.head, wakeme_after_rcu);
-        /* Wait for it. */
-        wait_for_completion(&rcu.completion);
 }
-EXPORT_SYMBOL_GPL(synchronize_rcu);
+EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 /**
- * synchronize_sched - wait until an rcu-sched grace period has elapsed.
+ * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
- *
- * Control will return to the caller some time after a full rcu-sched
- * grace period has elapsed, in other words after all currently executing
- * rcu-sched read-side critical sections have completed.   These read-side
- * critical sections are delimited by rcu_read_lock_sched() and
- * rcu_read_unlock_sched(), and may be nested.  Note that preempt_disable(),
- * local_irq_disable(), and so on may be used in place of
- * rcu_read_lock_sched().
 *
- * This means that all preempt_disable code sequences, including NMI and
+ * Check for bottom half being disabled, which covers both the
- * hardware-interrupt handlers, in progress on entry will have completed
+ * CONFIG_PROVE_RCU and not cases.  Note that if someone uses
- * before this primitive returns.  However, this does not guarantee that
+ * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
- * softirq handlers will have completed, since in some kernels, these
+ * will show the situation.
- * handlers can run in process context, and can block.
 *
- * This primitive provides the guarantees made by the (now removed)
+ * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
- * synchronize_kernel() API.  In contrast, synchronize_rcu() only
- * guarantees that rcu_read_lock() sections will have completed.
- * In "classic RCU", these two guarantees happen to be one and
- * the same, but can differ in realtime RCU implementations.
 */
-void synchronize_sched(void)
+int rcu_read_lock_bh_held(void)
 {
-        struct rcu_synchronize rcu;
+        if (!debug_lockdep_rcu_enabled())
+                return 1;
-        if (rcu_blocking_is_gp())
+        return in_softirq();
-                return;
-        init_completion(&rcu.completion);
-        /* Will wake me after RCU finished. */
-        call_rcu_sched(&rcu.head, wakeme_after_rcu);
-        /* Wait for it. */
-        wait_for_completion(&rcu.completion);
 }
-EXPORT_SYMBOL_GPL(synchronize_sched);
+EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
-/**
- * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
- *
- * Control will return to the caller some time after a full rcu_bh grace
- * period has elapsed, in other words after all currently executing rcu_bh
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
- * and may be nested.
- */
-void synchronize_rcu_bh(void)
-{
-        struct rcu_synchronize rcu;
-        if (rcu_blocking_is_gp())
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-                return;
-        init_completion(&rcu.completion);
+/*
-        /* Will wake me after RCU finished. */
+ * This function is invoked towards the end of the scheduler's initialization
-        call_rcu_bh(&rcu.head, wakeme_after_rcu);
+ * process.  Before this is called, the idle task might contain
-        /* Wait for it. */
+ * RCU read-side critical sections (during which time, this idle
-        wait_for_completion(&rcu.completion);
+ * task is booting the system).  After this function is called, the
-}
+ * idle tasks are prohibited from containing RCU read-side critical
-EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
+ * sections.
+ */
-static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
+void rcu_scheduler_starting(void)
-                unsigned long action, void *hcpu)
 {
-        return rcu_cpu_notify(self, action, hcpu);
+        WARN_ON(num_online_cpus() != 1);
+        WARN_ON(nr_context_switches() > 0);
+        rcu_scheduler_active = 1;
 }
-void __init rcu_init(void)
+/*
+ * Awaken the corresponding synchronize_rcu() instance now that a
+ * grace period has elapsed.
+ */
+void wakeme_after_rcu(struct rcu_head  *head)
 {
-        int i;
+        struct rcu_synchronize *rcu;
-        __rcu_init();
-        cpu_notifier(rcu_barrier_cpu_hotplug, 0);
-        /*
+        rcu = container_of(head, struct rcu_synchronize, head);
-         * We don't need protection against CPU-hotplug here because
+        complete(&rcu->completion);
-         * this is called early in boot, before either interrupts
-         * or the scheduler are operational.
-         */
-        for_each_online_cpu(i)
-                rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
 }
-void rcu_scheduler_starting(void)
+#ifdef CONFIG_PROVE_RCU
+/*
+ * wrapper function to avoid #include problems.
+ */
+int rcu_my_thread_group_empty(void)
 {
-        WARN_ON(num_online_cpus() != 1);
+        return thread_group_empty(current);
-        WARN_ON(nr_context_switches() > 0);
-        rcu_scheduler_active = 1;
 }
+EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
+#endif /* #ifdef CONFIG_PROVE_RCU */
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
new file mode 100644
index 000000000000..9f6d9ff2572c
--- /dev/null
+++ b/kernel/rcutiny.c
@@ -0,0 +1,282 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ *              Documentation/RCU
+ */
+#include <linux/moduleparam.h>
+#include <linux/completion.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/time.h>
+#include <linux/cpu.h>
+/*
+ * Global control variables for rcupdate callback mechanism.
+ *
+ * All callbacks of one flavor live on a single NULL-terminated list of
+ * rcu_head structures anchored at ->rcucblist.  Two tail pointers split
+ * that list: everything up to *donetail is taken for invocation by
+ * __rcu_process_callbacks(), while __call_rcu() appends at *curtail and
+ * rcu_qsctr_help() promotes the appended entries by setting donetail to
+ * curtail.  A tail pointer references ->rcucblist itself when nothing
+ * precedes it on the list.
+ */
+struct rcu_ctrlblk {
+        struct rcu_head *rcucblist;     /* List of pending callbacks (CBs). */
+        struct rcu_head **donetail;     /* ->next pointer of last "done" CB. */
+        struct rcu_head **curtail;      /* ->next pointer of last CB. */
+};
+/*
+ * Definition for rcupdate control block.
+ *
+ * There is one control block per callback flavor handled in this file:
+ * rcu_ctrlblk is the one passed by call_rcu(), rcu_bh_ctrlblk the one
+ * passed by call_rcu_bh().  Both start out empty, i.e. with ->rcucblist
+ * NULL (static initialization) and both tail pointers referencing it.
+ */
+static struct rcu_ctrlblk rcu_ctrlblk = {
+        .donetail       = &rcu_ctrlblk.rcucblist,
+        .curtail        = &rcu_ctrlblk.rcucblist,
+};
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+        .donetail       = &rcu_bh_ctrlblk.rcucblist,
+        .curtail        = &rcu_bh_ctrlblk.rcucblist,
+};
+#ifdef CONFIG_NO_HZ
+static long rcu_dynticks_nesting = 1;
+/*
+ * Enter dynticks-idle mode, which is an extended quiescent state
+ * if we have fully entered that mode (i.e., if the new value of
+ * dynticks_nesting is zero).
+ *
+ * rcu_dynticks_nesting starts out at 1 and is incremented by every
+ * rcu_exit_nohz(), so only the call that brings the count down to zero
+ * reports a quiescent state.
+ */
+void rcu_enter_nohz(void)
+{
+        if (--rcu_dynticks_nesting == 0)
+                rcu_sched_qs(0); /* rcu_sched_qs() also covers rcu_bh */
+}
+/*
+ * Exit dynticks-idle mode, so that we are no longer in an extended
+ * quiescent state.
+ *
+ * This only bumps the nesting count that rcu_enter_nohz() decrements;
+ * no callback list is examined here.
+ */
+void rcu_exit_nohz(void)
+{
+        rcu_dynticks_nesting++;
+}
+#endif /* #ifdef CONFIG_NO_HZ */
+/*
+ * Helper function for rcu_sched_qs() and rcu_bh_qs().
+ * Also disable irqs to avoid confusion due to interrupt handlers
+ * invoking call_rcu().
+ *
+ * If the control block has callbacks queued beyond the "done" segment,
+ * mark all of them done by advancing ->donetail to ->curtail and
+ * return 1 so that the caller can raise RCU_SOFTIRQ.  Return 0 when
+ * the list is empty or everything on it is already marked done.
+ */
+static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
+{
+        unsigned long flags;
+        local_irq_save(flags);
+        if (rcp->rcucblist != NULL &&
+            rcp->donetail != rcp->curtail) {
+                rcp->donetail = rcp->curtail;   /* promote every queued CB */
+                local_irq_restore(flags);
+                return 1;
+        }
+        local_irq_restore(flags);
+        return 0;
+}
+/*
+ * Record an rcu quiescent state.  And an rcu_bh quiescent state while we
+ * are at it, given that any rcu quiescent state is also an rcu_bh
+ * quiescent state.  Use "+" instead of "||" to defeat short circuiting.
+ *
+ * Both control blocks must be processed by rcu_qsctr_help() even when
+ * the first one already reports newly done callbacks, hence the "+".
+ * RCU_SOFTIRQ is raised if either helper call returned nonzero.  The
+ * cpu argument is not used by this implementation.
+ */
+void rcu_sched_qs(int cpu)
+{
+        if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk))
+                raise_softirq(RCU_SOFTIRQ);
+}
+/*
+ * Record an rcu_bh quiescent state.
+ *
+ * Only rcu_bh_ctrlblk is advanced; RCU_SOFTIRQ is raised when that
+ * produced newly done callbacks.  The cpu argument is not used by this
+ * implementation.
+ */
+void rcu_bh_qs(int cpu)
+{
+        if (rcu_qsctr_help(&rcu_bh_ctrlblk))
+                raise_softirq(RCU_SOFTIRQ);
+}
+/*
+ * Check to see if the scheduling-clock interrupt came from an extended
+ * quiescent state, and, if so, tell RCU about it.
+ *
+ * A full (rcu plus rcu_bh) quiescent state is recorded when "user" is
+ * nonzero, or when idle_cpu(cpu) holds, in_softirq() is false and
+ * hardirq_count() shows at most one level of hardirq nesting.
+ * Otherwise, if in_softirq() is false, only an rcu_bh quiescent state
+ * is recorded.  Nothing is recorded when in_softirq() is true.
+ *
+ * NOTE(review): "user" presumably says that the interrupt arrived while
+ * running in user mode, and the single permitted hardirq level is
+ * presumably this interrupt itself -- confirm against the caller.
+ */
+void rcu_check_callbacks(int cpu, int user)
+{
+        if (user ||
+            (idle_cpu(cpu) &&
+             !in_softirq() &&
+             hardirq_count() <= (1 << HARDIRQ_SHIFT)))
+                rcu_sched_qs(cpu);
+        else if (!in_softirq())
+                rcu_bh_qs(cpu);
+}
+/*
+ * Helper function for rcu_process_callbacks() that operates on the
+ * specified rcu_ctrlblk structure.
+ *
+ * Detaches the "done" segment of the callback list with irqs disabled,
+ * leaves the remaining callbacks (if any) queued on the control block,
+ * and then invokes the detached callbacks in list order with the irq
+ * state restored.
+ */
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
+{
+        struct rcu_head *next, *list;
+        unsigned long flags;
+        /*
+         * If no RCU callbacks ready to invoke, just return.
+         * ->donetail still referencing the list head means that the
+         * "done" segment is empty.
+         */
+        if (&rcp->rcucblist == rcp->donetail)
+                return;
+        /* Move the ready-to-invoke callbacks to a local list. */
+        local_irq_save(flags);
+        list = rcp->rcucblist;
+        rcp->rcucblist = *rcp->donetail;        /* first not-yet-done CB, or NULL */
+        *rcp->donetail = NULL;                  /* terminate the local list */
+        if (rcp->curtail == rcp->donetail)      /* nothing was left behind */
+                rcp->curtail = &rcp->rcucblist;
+        rcp->donetail = &rcp->rcucblist;        /* "done" segment is empty again */
+        local_irq_restore(flags);
+        /*
+         * Invoke the callbacks on the local list.  ->next is read before
+         * the callback runs.  NOTE(review): presumably because the
+         * callback may free or reuse its rcu_head -- confirm.
+         */
+        while (list) {
+                next = list->next;
+                prefetch(next);
+                list->func(list);
+                list = next;
+        }
+}
+/*
+ * Invoke any callbacks whose grace period has completed.
+ *
+ * This is the handler that rcu_init() registers for RCU_SOFTIRQ.  It
+ * processes the rcu control block first and the rcu_bh one second; the
+ * softirq_action argument is ignored.
+ */
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+        __rcu_process_callbacks(&rcu_ctrlblk);
+        __rcu_process_callbacks(&rcu_bh_ctrlblk);
+}
+/*
+ * Wait for a grace period to elapse.  But it is illegal to invoke
+ * synchronize_sched() from within an RCU read-side critical section.
+ * Therefore, any legal call to synchronize_sched() is a quiescent
+ * state, and so on a UP system, synchronize_sched() need do nothing.
+ * Ditto for synchronize_rcu_bh().  (But Lai Jiangshan points out the
+ * benefits of doing might_sleep() to reduce latency.)
+ *
+ * Cool, huh?  (Due to Josh Triplett.)
+ *
+ * But we want to make this a static inline later.
+ *
+ * Note that the body below calls cond_resched(), not might_sleep(),
+ * and does not touch either callback list.
+ */
+void synchronize_sched(void)
+{
+        cond_resched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched);
+void synchronize_rcu_bh(void)
+{
+        synchronize_sched();    /* rcu_bh wait is the same as the sched one here */
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
+/*
+ * Helper function for call_rcu() and call_rcu_bh().
+ *
+ * Initializes the rcu_head with the callback function and appends it to
+ * the tail of the given control block's list.  The append is done with
+ * irqs disabled.  No softirq is raised here; the callback becomes
+ * "done" only when rcu_qsctr_help() later advances ->donetail.
+ */
+static void __call_rcu(struct rcu_head *head,
+                       void (*func)(struct rcu_head *rcu),
+                       struct rcu_ctrlblk *rcp)
+{
+        unsigned long flags;
+        head->func = func;
+        head->next = NULL;              /* new element becomes the list end */
+        local_irq_save(flags);
+        *rcp->curtail = head;           /* link after the current last CB */
+        rcp->curtail = &head->next;
+        local_irq_restore(flags);
+}
+/*
+ * Post an RCU callback to be invoked after the end of an RCU grace
+ * period.  But since we have but one CPU, that would be after any
+ * quiescent state.
+ *
+ * The callback is queued on rcu_ctrlblk via __call_rcu().
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+        __call_rcu(head, func, &rcu_ctrlblk);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+/*
+ * Post an RCU bottom-half callback to be invoked after any subsequent
+ * quiescent state.
+ *
+ * The callback is queued on rcu_bh_ctrlblk via __call_rcu().
+ */
+void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+        __call_rcu(head, func, &rcu_bh_ctrlblk);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+void rcu_barrier(void)
+{
+        struct rcu_synchronize rcu;     /* on-stack rcu_head plus completion */
+        init_completion(&rcu.completion);
+        /*
+         * Will wake me after RCU finished.  The wakeup callback is
+         * appended behind every callback already queued with call_rcu(),
+         * and callbacks are invoked in list order, so all of those have
+         * been invoked by the time wakeme_after_rcu() runs.
+         */
+        call_rcu(&rcu.head, wakeme_after_rcu);
+        /* Wait for it. */
+        wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+void rcu_barrier_bh(void)
+{
+        struct rcu_synchronize rcu;     /* on-stack rcu_head plus completion */
+        init_completion(&rcu.completion);
+        /*
+         * Will wake me after RCU finished.  The wakeup callback is
+         * appended behind every callback already queued with
+         * call_rcu_bh(), and callbacks are invoked in list order, so all
+         * of those have been invoked by the time wakeme_after_rcu() runs.
+         */
+        call_rcu_bh(&rcu.head, wakeme_after_rcu);
+        /* Wait for it. */
+        wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_bh);
+void rcu_barrier_sched(void)
+{
+        struct rcu_synchronize rcu;     /* on-stack rcu_head plus completion */
+        init_completion(&rcu.completion);
+        /*
+         * Will wake me after RCU finished.
+         * NOTE(review): call_rcu_sched() is not defined in this file;
+         * presumably a header maps it onto call_rcu() -- confirm.
+         */
+        call_rcu_sched(&rcu.head, wakeme_after_rcu);
+        /* Wait for it. */
+        wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_sched);
+void __init rcu_init(void)
+{
+        open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* handler for the softirq raised by rcu_sched_qs()/rcu_bh_qs() */
+}
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 697c0a0229d4..58df55bf83ed 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,6 +61,9 @@ static int test_no_idle_hz;	/* Test RCU's support for tickless idle CPUs. */
 static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
 static int stutter = 5;         /* Start/stop testing interval (in sec) */
 static int irqreader = 1;       /* RCU readers from irq (timers). */
+static int fqs_duration = 0;    /* Duration of bursts (us), 0 to disable. */
+static int fqs_holdoff = 0;     /* Hold time within burst (us). */
+static int fqs_stutter = 3;     /* Wait time between bursts (s). */
 static char *torture_type = "rcu"; /* What RCU implementation to torture. */
 module_param(nreaders, int, 0444);
@@ -79,6 +82,12 @@ module_param(stutter, int, 0444);
 MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
 module_param(irqreader, int, 0444);
 MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
+module_param(fqs_duration, int, 0444);
+MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)");
+module_param(fqs_holdoff, int, 0444);
+MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
+module_param(fqs_stutter, int, 0444);
+MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
 module_param(torture_type, charp, 0444);
 MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
@@ -99,6 +108,7 @@ static struct task_struct **reader_tasks;
 static struct task_struct *stats_task;
 static struct task_struct *shuffler_task;
 static struct task_struct *stutter_task;
+static struct task_struct *fqs_task;
 #define RCU_TORTURE_PIPE_LEN 10
@@ -263,6 +273,7 @@ struct rcu_torture_ops {
        void (*deferred_free)(struct rcu_torture *p);
        void (*sync)(void);
        void (*cb_barrier)(void);
+        void (*fqs)(void);
        int (*stats)(char *page);
        int irq_capable;
        char *name;
@@ -327,6 +338,11 @@ rcu_torture_cb(struct rcu_head *p)
                cur_ops->deferred_free(rp);
 }
+static int rcu_no_completed(void)
+{
+        return 0;       /* constant, so "completed() - completed" in the readers is always 0 */
+}
 static void rcu_torture_deferred_free(struct rcu_torture *p)
 {
        call_rcu(&p->rtort_rcu, rcu_torture_cb);
@@ -342,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = {
        .deferred_free  = rcu_torture_deferred_free,
        .sync           = synchronize_rcu,
        .cb_barrier     = rcu_barrier,
+        .fqs            = rcu_force_quiescent_state,
        .stats          = NULL,
        .irq_capable    = 1,
        .name           = "rcu"
@@ -383,11 +400,28 @@ static struct rcu_torture_ops rcu_sync_ops = {
        .deferred_free  = rcu_sync_torture_deferred_free,
        .sync           = synchronize_rcu,
        .cb_barrier     = NULL,
+        .fqs            = rcu_force_quiescent_state,
        .stats          = NULL,
        .irq_capable    = 1,
        .name           = "rcu_sync"
 };
+static struct rcu_torture_ops rcu_expedited_ops = {
+        .init           = rcu_sync_torture_init,
+        .cleanup        = NULL,
+        .readlock       = rcu_torture_read_lock,
+        .read_delay     = rcu_read_delay,  /* just reuse rcu's version. */
+        .readunlock     = rcu_torture_read_unlock,
+        .completed      = rcu_no_completed, /* always returns 0 */
+        .deferred_free  = rcu_sync_torture_deferred_free,
+        .sync           = synchronize_rcu_expedited, /* rcu_sync_ops uses synchronize_rcu here */
+        .cb_barrier     = NULL, /* rcu_torture_cleanup() checks for NULL before calling */
+        .fqs            = rcu_force_quiescent_state,
+        .stats          = NULL,
+        .irq_capable    = 1,
+        .name           = "rcu_expedited"
+};
 /*
 * Definitions for rcu_bh torture testing.
 */
@@ -445,6 +479,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
        .deferred_free  = rcu_bh_torture_deferred_free,
        .sync           = rcu_bh_torture_synchronize,
        .cb_barrier     = rcu_barrier_bh,
+        .fqs            = rcu_bh_force_quiescent_state,
        .stats          = NULL,
        .irq_capable    = 1,
        .name           = "rcu_bh"
@@ -460,6 +495,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
        .deferred_free  = rcu_sync_torture_deferred_free,
        .sync           = rcu_bh_torture_synchronize,
        .cb_barrier     = NULL,
+        .fqs            = rcu_bh_force_quiescent_state,
        .stats          = NULL,
        .irq_capable    = 1,
        .name           = "rcu_bh_sync"
@@ -547,6 +583,25 @@ static struct rcu_torture_ops srcu_ops = {
        .name           = "srcu"
 };
+static void srcu_torture_synchronize_expedited(void)
+{
+        synchronize_srcu_expedited(&srcu_ctl);  /* ->sync takes no argument, so bind srcu_ctl here */
+}
+static struct rcu_torture_ops srcu_expedited_ops = {
+        .init           = srcu_torture_init,
+        .cleanup        = srcu_torture_cleanup,
+        .readlock       = srcu_torture_read_lock,
+        .read_delay     = srcu_read_delay,
+        .readunlock     = srcu_torture_read_unlock,
+        .completed      = srcu_torture_completed,
+        .deferred_free  = rcu_sync_torture_deferred_free,
+        .sync           = srcu_torture_synchronize_expedited,
+        .cb_barrier     = NULL, /* rcu_torture_cleanup() checks for NULL before calling */
+        /* ->fqs is left NULL, so rcu_torture_init() forces fqs_duration to 0. */
+        /* ->irq_capable is left 0, so rcu_torture_reader() arms no irq reader timer. */
+        .stats          = srcu_torture_stats,
+        .name           = "srcu_expedited"
+};
 /*
 * Definitions for sched torture testing.
 */
@@ -562,11 +617,6 @@ static void sched_torture_read_unlock(int idx)
        preempt_enable();
 }
-static int sched_torture_completed(void)
-{
-        return 0;
-}
 static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
 {
        call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
@@ -583,25 +633,27 @@ static struct rcu_torture_ops sched_ops = {
        .readlock       = sched_torture_read_lock,
        .read_delay     = rcu_read_delay,  /* just reuse rcu's version. */
        .readunlock     = sched_torture_read_unlock,
-        .completed      = sched_torture_completed,
+        .completed      = rcu_no_completed,
        .deferred_free  = rcu_sched_torture_deferred_free,
        .sync           = sched_torture_synchronize,
        .cb_barrier     = rcu_barrier_sched,
+        .fqs            = rcu_sched_force_quiescent_state,
        .stats          = NULL,
        .irq_capable    = 1,
        .name           = "sched"
 };
-static struct rcu_torture_ops sched_ops_sync = {
+static struct rcu_torture_ops sched_sync_ops = {
        .init           = rcu_sync_torture_init,
        .cleanup        = NULL,
        .readlock       = sched_torture_read_lock,
        .read_delay     = rcu_read_delay,  /* just reuse rcu's version. */
        .readunlock     = sched_torture_read_unlock,
-        .completed      = sched_torture_completed,
+        .completed      = rcu_no_completed,
        .deferred_free  = rcu_sync_torture_deferred_free,
        .sync           = sched_torture_synchronize,
        .cb_barrier     = NULL,
+        .fqs            = rcu_sched_force_quiescent_state,
        .stats          = NULL,
        .name           = "sched_sync"
 };
@@ -612,16 +664,49 @@ static struct rcu_torture_ops sched_expedited_ops = {
        .readlock       = sched_torture_read_lock,
        .read_delay     = rcu_read_delay,  /* just reuse rcu's version. */
        .readunlock     = sched_torture_read_unlock,
-        .completed      = sched_torture_completed,
+        .completed      = rcu_no_completed,
        .deferred_free  = rcu_sync_torture_deferred_free,
        .sync           = synchronize_sched_expedited,
        .cb_barrier     = NULL,
+        .fqs            = rcu_sched_force_quiescent_state,
        .stats          = rcu_expedited_torture_stats,
        .irq_capable    = 1,
        .name           = "sched_expedited"
 };
 /*
+ * RCU torture force-quiescent-state kthread.  Repeatedly induces
+ * bursts of calls to force_quiescent_state(), increasing the probability
+ * of occurrence of some important types of race conditions.
+ *
+ * Each pass waits fqs_stutter seconds, then calls cur_ops->fqs() with
+ * fqs_holdoff microseconds of busy-wait after each call until
+ * fqs_duration microseconds of holdoff have been charged.  Both waits
+ * also poll kthread_should_stop() so that kthread_stop() in
+ * rcu_torture_cleanup() is not held up by a long stutter wait or burst.
+ * A zero or negative fqs_holdoff is never handed to udelay() and is
+ * charged as one microsecond per call, so that the burst terminates
+ * after at most fqs_duration calls instead of spinning forever.
+ *
+ * Always returns 0.  The arg parameter is unused.
+ */
+static int
+rcu_torture_fqs(void *arg)
+{
+        unsigned long fqs_resume_time;
+        int fqs_burst_remaining;
+        int holdoff;
+        VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
+        do {
+                fqs_resume_time = jiffies + fqs_stutter * HZ;
+                /* Wraparound-safe test for "jiffies is before resume time". */
+                while (jiffies - fqs_resume_time > LONG_MAX &&
+                       !kthread_should_stop()) {
+                        schedule_timeout_interruptible(1);
+                }
+                /* Sample once per burst; treat negative values as zero. */
+                holdoff = fqs_holdoff > 0 ? fqs_holdoff : 0;
+                fqs_burst_remaining = fqs_duration;
+                while (fqs_burst_remaining > 0 &&
+                       !kthread_should_stop()) {
+                        cur_ops->fqs();
+                        if (holdoff)
+                                udelay(holdoff);
+                        /* Guarantee forward progress even without holdoff. */
+                        fqs_burst_remaining -= holdoff ? holdoff : 1;
+                }
+                rcu_stutter_wait("rcu_torture_fqs");
+        } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+        VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
+        rcutorture_shutdown_absorb("rcu_torture_fqs");
+        /* Park here until kthread_stop() is called on this task. */
+        while (!kthread_should_stop())
+                schedule_timeout_uninterruptible(1);
+        return 0;
+}
+/*
 * RCU torture writer kthread.  Repeatedly substitutes a new structure
 * for that pointed to by rcu_torture_current, freeing the old structure
 * after a series of grace periods (the "pipeline").
@@ -711,7 +796,11 @@ static void rcu_torture_timer(unsigned long unused)
        idx = cur_ops->readlock();
        completed = cur_ops->completed();
-        p = rcu_dereference(rcu_torture_current);
+        p = rcu_dereference_check(rcu_torture_current,
+                                  rcu_read_lock_held() ||
+                                  rcu_read_lock_bh_held() ||
+                                  rcu_read_lock_sched_held() ||
+                                  srcu_read_lock_held(&srcu_ctl));
        if (p == NULL) {
                /* Leave because rcu_torture_writer is not yet underway */
                cur_ops->readunlock(idx);
@@ -729,13 +818,13 @@ static void rcu_torture_timer(unsigned long unused)
                /* Should not happen, but... */
                pipe_count = RCU_TORTURE_PIPE_LEN;
        }
-        ++__get_cpu_var(rcu_torture_count)[pipe_count];
+        __this_cpu_inc(rcu_torture_count[pipe_count]);
        completed = cur_ops->completed() - completed;
        if (completed > RCU_TORTURE_PIPE_LEN) {
                /* Should not happen, but... */
                completed = RCU_TORTURE_PIPE_LEN;
        }
-        ++__get_cpu_var(rcu_torture_batch)[completed];
+        __this_cpu_inc(rcu_torture_batch[completed]);
        preempt_enable();
        cur_ops->readunlock(idx);
 }
@@ -764,11 +853,15 @@ rcu_torture_reader(void *arg)
        do {
                if (irqreader && cur_ops->irq_capable) {
                        if (!timer_pending(&t))
-                                mod_timer(&t, 1);
+                                mod_timer(&t, jiffies + 1);
                }
                idx = cur_ops->readlock();
                completed = cur_ops->completed();
-                p = rcu_dereference(rcu_torture_current);
+                p = rcu_dereference_check(rcu_torture_current,
+                                          rcu_read_lock_held() ||
+                                          rcu_read_lock_bh_held() ||
+                                          rcu_read_lock_sched_held() ||
+                                          srcu_read_lock_held(&srcu_ctl));
                if (p == NULL) {
                        /* Wait for rcu_torture_writer to get underway */
                        cur_ops->readunlock(idx);
@@ -784,13 +877,13 @@ rcu_torture_reader(void *arg)
                        /* Should not happen, but... */
                        pipe_count = RCU_TORTURE_PIPE_LEN;
                }
-                ++__get_cpu_var(rcu_torture_count)[pipe_count];
+                __this_cpu_inc(rcu_torture_count[pipe_count]);
                completed = cur_ops->completed() - completed;
                if (completed > RCU_TORTURE_PIPE_LEN) {
                        /* Should not happen, but... */
                        completed = RCU_TORTURE_PIPE_LEN;
                }
-                ++__get_cpu_var(rcu_torture_batch)[completed];
+                __this_cpu_inc(rcu_torture_batch[completed]);
                preempt_enable();
                cur_ops->readunlock(idx);
                schedule();
@@ -996,10 +1089,11 @@ rcu_torture_print_module_parms(char *tag)
        printk(KERN_ALERT "%s" TORTURE_FLAG
                "--- %s: nreaders=%d nfakewriters=%d "
                "stat_interval=%d verbose=%d test_no_idle_hz=%d "
-                "shuffle_interval=%d stutter=%d irqreader=%d\n",
+                "shuffle_interval=%d stutter=%d irqreader=%d "
+                "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
                torture_type, tag, nrealreaders, nfakewriters,
                stat_interval, verbose, test_no_idle_hz, shuffle_interval,
-                stutter, irqreader);
+                stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
 }
 static struct notifier_block rcutorture_nb = {
@@ -1075,6 +1169,12 @@ rcu_torture_cleanup(void)
        }
        stats_task = NULL;
+        if (fqs_task) {
+                VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
+                kthread_stop(fqs_task);
+        }
+        fqs_task = NULL;
        /* Wait for all RCU callbacks to fire.  */
        if (cur_ops->cb_barrier != NULL)
@@ -1097,9 +1197,10 @@ rcu_torture_init(void)
        int cpu;
        int firsterr = 0;
        static struct rcu_torture_ops *torture_ops[] =
-                { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
+                { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
-                  &sched_expedited_ops,
+                  &rcu_bh_ops, &rcu_bh_sync_ops,
-                  &srcu_ops, &sched_ops, &sched_ops_sync, };
+                  &srcu_ops, &srcu_expedited_ops,
+                  &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
        mutex_lock(&fullstop_mutex);
@@ -1110,11 +1211,20 @@ rcu_torture_init(void)
                        break;
        }
        if (i == ARRAY_SIZE(torture_ops)) {
-                printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
+                printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
                       torture_type);
+                printk(KERN_ALERT "rcu-torture types:");
+                for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
+                        printk(KERN_ALERT " %s", torture_ops[i]->name);
+                printk(KERN_ALERT "\n");
                mutex_unlock(&fullstop_mutex);
                return -EINVAL;
        }
+        if (cur_ops->fqs == NULL && fqs_duration != 0) {
+                printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
+                                  "fqs_duration, fqs disabled.\n");
+                fqs_duration = 0;
+        }
        if (cur_ops->init)
                cur_ops->init(); /* no "goto unwind" prior to this point!!! */
@@ -1243,6 +1353,19 @@ rcu_torture_init(void)
                        goto unwind;
                }
        }
+        if (fqs_duration < 0)
+                fqs_duration = 0;
+        if (fqs_duration) {
+                /* Create the fqs thread */
+                fqs_task = kthread_run(rcu_torture_fqs, NULL,
+                                       "rcu_torture_fqs");
+                if (IS_ERR(fqs_task)) {
+                        firsterr = PTR_ERR(fqs_task);
+                        VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
+                        fqs_task = NULL;
+                        goto unwind;
+                }
+        }
        register_reboot_notifier(&rcutorture_nb);
        mutex_unlock(&fullstop_mutex);
        return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f3077c0ab181..3ec8160fc75f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -51,22 +51,25 @@
 /* Data structures. */
+static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
 #define RCU_STATE_INITIALIZER(name) { \
        .level = { &name.node[0] }, \
        .levelcnt = { \
                NUM_RCU_LVL_0,  /* root of hierarchy. */ \
                NUM_RCU_LVL_1, \
                NUM_RCU_LVL_2, \
-                NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
+                NUM_RCU_LVL_3, \
+                NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
        }, \
        .signaled = RCU_GP_IDLE, \
        .gpnum = -300, \
        .completed = -300, \
-        .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
+        .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \
        .orphan_cbs_list = NULL, \
        .orphan_cbs_tail = &name.orphan_cbs_list, \
        .orphan_qlen = 0, \
-        .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
+        .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \
        .n_force_qs = 0, \
        .n_force_qs_ngp = 0, \
 }
@@ -77,7 +80,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
 struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 /*
 * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
 * permit this function to be invoked without holding the root rcu_node
@@ -98,7 +100,7 @@ void rcu_sched_qs(int cpu)
        struct rcu_data *rdp;
        rdp = &per_cpu(rcu_sched_data, cpu);
-        rdp->passed_quiesc_completed = rdp->completed;
+        rdp->passed_quiesc_completed = rdp->gpnum - 1;
        barrier();
        rdp->passed_quiesc = 1;
        rcu_preempt_note_context_switch(cpu);
@@ -109,7 +111,7 @@ void rcu_bh_qs(int cpu)
        struct rcu_data *rdp;
        rdp = &per_cpu(rcu_bh_data, cpu);
-        rdp->passed_quiesc_completed = rdp->completed;
+        rdp->passed_quiesc_completed = rdp->gpnum - 1;
        barrier();
        rdp->passed_quiesc = 1;
 }
@@ -151,6 +153,24 @@ long rcu_batches_completed_bh(void)
 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 /*
+ * Force a quiescent state for RCU BH.
+ *
+ * Exported wrapper that calls force_quiescent_state() on rcu_bh_state;
+ * rcutorture installs it as the ->fqs operation of its rcu_bh flavors.
+ * NOTE(review): the second argument of 0 presumably means "not
+ * relaxed" -- confirm against force_quiescent_state().
+ */
+void rcu_bh_force_quiescent_state(void)
+{
+        force_quiescent_state(&rcu_bh_state, 0);
+}
+EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
+/*
+ * Force a quiescent state for RCU-sched.
+ *
+ * Exported wrapper that calls force_quiescent_state() on
+ * rcu_sched_state; rcutorture installs it as the ->fqs operation of its
+ * sched flavors.
+ * NOTE(review): the second argument of 0 presumably means "not
+ * relaxed" -- confirm against force_quiescent_state().
+ */
+void rcu_sched_force_quiescent_state(void)
+{
+        force_quiescent_state(&rcu_sched_state, 0);
+}
+EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
+/*
 * Does the CPU have callbacks ready to be invoked?
 */
 static int
@@ -335,28 +355,9 @@ void rcu_irq_exit(void)
                set_need_resched();
 }
-/*
- * Record the specified "completed" value, which is later used to validate
- * dynticks counter manipulations.  Specify "rsp->completed - 1" to
- * unconditionally invalidate any future dynticks manipulations (which is
- * useful at the beginning of a grace period).
- */
-static void dyntick_record_completed(struct rcu_state *rsp, long comp)
-{
-        rsp->dynticks_completed = comp;
-}
 #ifdef CONFIG_SMP
 /*
- * Recall the previously recorded value of the completion for dynticks.
- */
-static long dyntick_recall_completed(struct rcu_state *rsp)
-{
-        return rsp->dynticks_completed;
-}
-/*
 * Snapshot the specified CPU's dynticks counter so that we can later
 * credit them with an implicit quiescent state.  Return 1 if this CPU
 * is in dynticks idle mode, which is an extended quiescent state.
@@ -419,24 +420,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 #else /* #ifdef CONFIG_NO_HZ */
-static void dyntick_record_completed(struct rcu_state *rsp, long comp)
-{
-}
 #ifdef CONFIG_SMP
-/*
- * If there are no dynticks, then the only way that a CPU can passively
- * be in a quiescent state is to be offline.  Unlike dynticks idle, which
- * is a point in time during the prior (already finished) grace period,
- * an offline CPU is always in a quiescent state, and thus can be
- * unconditionally applied.  So just return the current value of completed.
- */
-static long dyntick_recall_completed(struct rcu_state *rsp)
-{
-        return rsp->completed;
-}
 static int dyntick_save_progress_counter(struct rcu_data *rdp)
 {
        return 0;
@@ -468,10 +453,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
        /* Only let one CPU complain about others per time interval. */
-        spin_lock_irqsave(&rnp->lock, flags);
+        raw_spin_lock_irqsave(&rnp->lock, flags);
        delta = jiffies - rsp->jiffies_stall;
        if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
-                spin_unlock_irqrestore(&rnp->lock, flags);
+                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                return;
        }
        rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
@@ -481,13 +466,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
         * due to CPU offlining.
         */
        rcu_print_task_stall(rnp);
-        spin_unlock_irqrestore(&rnp->lock, flags);
+        raw_spin_unlock_irqrestore(&rnp->lock, flags);
        /* OK, time to rat on our buddy... */
        printk(KERN_ERR "INFO: RCU detected CPU stalls:");
        rcu_for_each_leaf_node(rsp, rnp) {
+                raw_spin_lock_irqsave(&rnp->lock, flags);
                rcu_print_task_stall(rnp);
+                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                if (rnp->qsmask == 0)
                        continue;
                for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
@@ -498,6 +485,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
               smp_processor_id(), (long)(jiffies - rsp->gp_start));
        trigger_all_cpu_backtrace();
+        /* If so configured, complain about tasks blocking the grace period. */
+        rcu_print_detail_task_stall(rsp);
        force_quiescent_state(rsp, 0);  /* Kick them all. */
 }
@@ -510,11 +501,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
                        smp_processor_id(), jiffies - rsp->gp_start);
        trigger_all_cpu_backtrace();
-        spin_lock_irqsave(&rnp->lock, flags);
+        raw_spin_lock_irqsave(&rnp->lock, flags);
-        if ((long)(jiffies - rsp->jiffies_stall) >= 0)
+        if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
                rsp->jiffies_stall =
                        jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
-        spin_unlock_irqrestore(&rnp->lock, flags);
+        raw_spin_unlock_irqrestore(&rnp->lock, flags);
        set_need_resched();  /* kick ourselves to get things going. */
 }
@@ -553,13 +544,33 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 /*
 * Update CPU-local rcu_data state to record the newly noticed grace period.
 * This is used both when we started the grace period and when we notice
- * that someone else started the grace period.
+ * that someone else started the grace period.  The caller must hold the
+ * ->lock of the leaf rcu_node structure corresponding to the current CPU,
+ *  and must have irqs disabled.
 */
+static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
+{
+        if (rdp->gpnum != rnp->gpnum) { /* act only if rdp has not yet recorded rnp's gpnum */
+                rdp->qs_pending = 1;
+                rdp->passed_quiesc = 0;
+                rdp->gpnum = rnp->gpnum; /* recorded last; rsp is not used in this function */
+        }
+}
 static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-        rdp->qs_pending = 1;
+        unsigned long flags;
-        rdp->passed_quiesc = 0;
+        struct rcu_node *rnp;
-        rdp->gpnum = rsp->gpnum;
+        local_irq_save(flags);
+        rnp = rdp->mynode;
+        if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
+            !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
+                local_irq_restore(flags);
+                return;
+        }
+        __note_new_gpnum(rsp, rnp, rdp);
+        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }
 /*
@@ -583,31 +594,59 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
 }
 /*
- * Start a new RCU grace period if warranted, re-initializing the hierarchy
+ * Advance this CPU's callbacks, but only if the current grace period
- * in preparation for detecting the next grace period.  The caller must hold
+ * has ended.  This may be called only from the CPU to whom the rdp
- * the root node's ->lock, which is released before return.  Hard irqs must
+ * belongs.  In addition, the corresponding leaf rcu_node structure's
- * be disabled.
+ * ->lock must be held by the caller, with irqs disabled.
 */
 static void
-rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
+__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
-        __releases(rcu_get_root(rsp)->lock)
 {
-        struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+        /* Did another grace period end? */
-        struct rcu_node *rnp = rcu_get_root(rsp);
+        if (rdp->completed != rnp->completed) {
+                /* Advance callbacks.  No harm if list empty. */
+                rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
+                rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
+                rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+                /* Remember that we saw this grace-period completion. */
+                rdp->completed = rnp->completed;
+        }
+}
+/*
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended.  This may be called only from the CPU to whom the rdp
+ * belongs.
+ */
+static void
+rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+        unsigned long flags;
+        struct rcu_node *rnp;
-        if (!cpu_needs_another_gp(rsp, rdp)) {
+        local_irq_save(flags);
-                spin_unlock_irqrestore(&rnp->lock, flags);
+        rnp = rdp->mynode;
+        if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
+            !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
+                local_irq_restore(flags);
                return;
        }
+        __rcu_process_gp_end(rsp, rnp, rdp);
+        raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
-        /* Advance to a new grace period and initialize state. */
+/*
-        rsp->gpnum++;
+ * Do per-CPU grace-period initialization for running CPU.  The caller
-        WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
+ * must hold the lock of the leaf rcu_node structure corresponding to
-        rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
+ * this CPU.
-        rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
+ */
-        record_gp_stall_check_time(rsp);
+static void
-        dyntick_record_completed(rsp, rsp->completed - 1);
+rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
-        note_new_gpnum(rsp, rdp);
+{
+        /* Prior grace period ended, so advance callbacks for current CPU. */
+        __rcu_process_gp_end(rsp, rnp, rdp);
        /*
         * Because this CPU just now started the new grace period, we know
@@ -623,21 +662,70 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
        rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
        rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+        /* Set state so that this CPU will detect the next quiescent state. */
+        __note_new_gpnum(rsp, rnp, rdp);
+}
+/*
+ * Start a new RCU grace period if warranted, re-initializing the hierarchy
+ * in preparation for detecting the next grace period.  The caller must hold
+ * the root node's ->lock, which is released before return.  Hard irqs must
+ * be disabled.
+ */
+static void
+rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
+        __releases(rcu_get_root(rsp)->lock)
+{
+        struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+        struct rcu_node *rnp = rcu_get_root(rsp);
+        if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
+                if (cpu_needs_another_gp(rsp, rdp))
+                        rsp->fqs_need_gp = 1;
+                if (rnp->completed == rsp->completed) {
+                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
+                        return;
+                }
+                raw_spin_unlock(&rnp->lock);     /* irqs remain disabled. */
+                /*
+                 * Propagate new ->completed value to rcu_node structures
+                 * so that other CPUs don't have to wait until the start
+                 * of the next grace period to process their callbacks.
+                 */
+                rcu_for_each_node_breadth_first(rsp, rnp) {
+                        raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+                        rnp->completed = rsp->completed;
+                        raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+                }
+                local_irq_restore(flags);
+                return;
+        }
+        /* Advance to a new grace period and initialize state. */
+        rsp->gpnum++;
+        WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
+        rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
+        rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
+        record_gp_stall_check_time(rsp);
        /* Special-case the common single-level case. */
        if (NUM_RCU_NODES == 1) {
                rcu_preempt_check_blocked_tasks(rnp);
                rnp->qsmask = rnp->qsmaskinit;
                rnp->gpnum = rsp->gpnum;
+                rnp->completed = rsp->completed;
                rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
-                spin_unlock_irqrestore(&rnp->lock, flags);
+                rcu_start_gp_per_cpu(rsp, rnp, rdp);
+                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                return;
        }
-        spin_unlock(&rnp->lock);  /* leave irqs disabled. */
+        raw_spin_unlock(&rnp->lock);  /* leave irqs disabled. */
        /* Exclude any concurrent CPU-hotplug operations. */
-        spin_lock(&rsp->onofflock);  /* irqs already disabled. */
+        raw_spin_lock(&rsp->onofflock);  /* irqs already disabled. */
        /*
         * Set the quiescent-state-needed bits in all the rcu_node
@@ -657,73 +745,50 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
         * irqs disabled.
         */
        rcu_for_each_node_breadth_first(rsp, rnp) {
-                spin_lock(&rnp->lock);          /* irqs already disabled. */
+                raw_spin_lock(&rnp->lock);      /* irqs already disabled. */
                rcu_preempt_check_blocked_tasks(rnp);
                rnp->qsmask = rnp->qsmaskinit;
                rnp->gpnum = rsp->gpnum;
-                spin_unlock(&rnp->lock);        /* irqs remain disabled. */
+                rnp->completed = rsp->completed;
+                if (rnp == rdp->mynode)
+                        rcu_start_gp_per_cpu(rsp, rnp, rdp);
+                raw_spin_unlock(&rnp->lock);    /* irqs remain disabled. */
        }
        rnp = rcu_get_root(rsp);
-        spin_lock(&rnp->lock);                  /* irqs already disabled. */
+        raw_spin_lock(&rnp->lock);              /* irqs already disabled. */
        rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
-        spin_unlock(&rnp->lock);                /* irqs remain disabled. */
+        raw_spin_unlock(&rnp->lock);            /* irqs remain disabled. */
-        spin_unlock_irqrestore(&rsp->onofflock, flags);
+        raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 /*
- * Advance this CPU's callbacks, but only if the current grace period
+ * Report a full set of quiescent states to the specified rcu_state
- * has ended.  This may be called only from the CPU to whom the rdp
+ * data structure.  This involves cleaning up after the prior grace
- * belongs.
+ * period and letting rcu_start_gp() start up the next grace period
+ * if one is needed.  Note that the caller must hold rnp->lock, as
+ * required by rcu_start_gp(), which will release it.
 */
-static void
+static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
-rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
-{
-        long completed_snap;
-        unsigned long flags;
-        local_irq_save(flags);
-        completed_snap = ACCESS_ONCE(rsp->completed);  /* outside of lock. */
-        /* Did another grace period end? */
-        if (rdp->completed != completed_snap) {
-                /* Advance callbacks.  No harm if list empty. */
-                rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
-                rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
-                rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-                /* Remember that we saw this grace-period completion. */
-                rdp->completed = completed_snap;
-        }
-        local_irq_restore(flags);
-}
-/*
- * Clean up after the prior grace period and let rcu_start_gp() start up
- * the next grace period if one is needed.  Note that the caller must
- * hold rnp->lock, as required by rcu_start_gp(), which will release it.
- */
-static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
        __releases(rcu_get_root(rsp)->lock)
 {
        WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
        rsp->completed = rsp->gpnum;
        rsp->signaled = RCU_GP_IDLE;
-        rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
        rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
 }
 /*
- * Similar to cpu_quiet(), for which it is a helper function.  Allows
+ * Similar to rcu_report_qs_rdp(), for which it is a helper function.
- * a group of CPUs to be quieted at one go, though all the CPUs in the
+ * Allows quiescent states for a group of CPUs to be reported at one go
- * group must be represented by the same leaf rcu_node structure.
+ * to the specified rcu_node structure, though all the CPUs in the group
- * That structure's lock must be held upon entry, and it is released
+ * must be represented by the same rcu_node structure (which need not be
- * before return.
+ * a leaf rcu_node structure, though it often will be).  That structure's
+ * lock must be held upon entry, and it is released before return.
 */
 static void
-cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
+rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
-              unsigned long flags)
+                  struct rcu_node *rnp, unsigned long flags)
        __releases(rnp->lock)
 {
        struct rcu_node *rnp_c;
@@ -733,14 +798,14 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
                if (!(rnp->qsmask & mask)) {
                        /* Our bit has already been cleared, so done. */
-                        spin_unlock_irqrestore(&rnp->lock, flags);
+                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
                        return;
                }
                rnp->qsmask &= ~mask;
                if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
                        /* Other bits still set at this level, so done. */
-                        spin_unlock_irqrestore(&rnp->lock, flags);
+                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
                        return;
                }
                mask = rnp->grpmask;
@@ -750,54 +815,56 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
                        break;
                }
-                spin_unlock_irqrestore(&rnp->lock, flags);
+                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                rnp_c = rnp;
                rnp = rnp->parent;
-                spin_lock_irqsave(&rnp->lock, flags);
+                raw_spin_lock_irqsave(&rnp->lock, flags);
                WARN_ON_ONCE(rnp_c->qsmask);
        }
        /*
         * Get here if we are the last CPU to pass through a quiescent
-         * state for this grace period.  Invoke cpu_quiet_msk_finish()
+         * state for this grace period.  Invoke rcu_report_qs_rsp()
         * to clean up and start the next grace period if one is needed.
         */
-        cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */
+        rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
 }
 /*
- * Record a quiescent state for the specified CPU, which must either be
+ * Record a quiescent state for the specified CPU to that CPU's rcu_data
- * the current CPU.  The lastcomp argument is used to make sure we are
+ * structure.  This must be either called from the specified CPU, or
- * still in the grace period of interest.  We don't want to end the current
+ * called when the specified CPU is known to be offline (and when it is
- * grace period based on quiescent states detected in an earlier grace
+ * also known that no other CPU is concurrently trying to help the offline
- * period!
+ * CPU).  The lastcomp argument is used to make sure we are still in the
+ * grace period of interest.  We don't want to end the current grace period
+ * based on quiescent states detected in an earlier grace period!
 */
 static void
-cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
+rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
 {
        unsigned long flags;
        unsigned long mask;
        struct rcu_node *rnp;
        rnp = rdp->mynode;
-        spin_lock_irqsave(&rnp->lock, flags);
+        raw_spin_lock_irqsave(&rnp->lock, flags);
-        if (lastcomp != ACCESS_ONCE(rsp->completed)) {
+        if (lastcomp != rnp->completed) {
                /*
                 * Someone beat us to it for this grace period, so leave.
                 * The race with GP start is resolved by the fact that we
                 * hold the leaf rcu_node lock, so that the per-CPU bits
                 * cannot yet be initialized -- so we would simply find our
-                 * CPU's bit already cleared in cpu_quiet_msk() if this race
+                 * CPU's bit already cleared in rcu_report_qs_rnp() if this
-                 * occurred.
+                 * race occurred.
                 */
                rdp->passed_quiesc = 0; /* try again later! */
-                spin_unlock_irqrestore(&rnp->lock, flags);
+                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                return;
        }
        mask = rdp->grpmask;
        if ((rnp->qsmask & mask) == 0) {
-                spin_unlock_irqrestore(&rnp->lock, flags);
+                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        } else {
                rdp->qs_pending = 0;
@@ -807,7 +874,7 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
                 */
                rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-                cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
+                rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
        }
 }
@@ -838,8 +905,11 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
        if (!rdp->passed_quiesc)
                return;
-        /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
+        /*
-        cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
+         * Tell RCU we are done (but rcu_report_qs_rdp() will be the
+         * judge of that).
+         */
+        rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
 }
 #ifdef CONFIG_HOTPLUG_CPU
@@ -858,7 +928,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
        if (rdp->nxtlist == NULL)
                return;  /* irqs disabled, so comparison is stable. */
-        spin_lock(&rsp->onofflock);  /* irqs already disabled. */
+        raw_spin_lock(&rsp->onofflock);  /* irqs already disabled. */
        *rsp->orphan_cbs_tail = rdp->nxtlist;
        rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
        rdp->nxtlist = NULL;
@@ -866,7 +936,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
                rdp->nxttail[i] = &rdp->nxtlist;
        rsp->orphan_qlen += rdp->qlen;
        rdp->qlen = 0;
-        spin_unlock(&rsp->onofflock);  /* irqs remain disabled. */
+        raw_spin_unlock(&rsp->onofflock);  /* irqs remain disabled. */
 }
 /*
@@ -877,10 +947,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
        unsigned long flags;
        struct rcu_data *rdp;
-        spin_lock_irqsave(&rsp->onofflock, flags);
+        raw_spin_lock_irqsave(&rsp->onofflock, flags);
        rdp = rsp->rda[smp_processor_id()];
        if (rsp->orphan_cbs_list == NULL) {
-                spin_unlock_irqrestore(&rsp->onofflock, flags);
+                raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
                return;
        }
        *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
@@ -889,7 +959,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
        rsp->orphan_cbs_list = NULL;
        rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
        rsp->orphan_qlen = 0;
-        spin_unlock_irqrestore(&rsp->onofflock, flags);
+        raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 /*
@@ -899,45 +969,47 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
 static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 {
        unsigned long flags;
-        long lastcomp;
        unsigned long mask;
+        int need_report = 0;
        struct rcu_data *rdp = rsp->rda[cpu];
        struct rcu_node *rnp;
        /* Exclude any attempts to start a new grace period. */
-        spin_lock_irqsave(&rsp->onofflock, flags);
+        raw_spin_lock_irqsave(&rsp->onofflock, flags);
        /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
        rnp = rdp->mynode;      /* this is the outgoing CPU's rnp. */
        mask = rdp->grpmask;    /* rnp->grplo is constant. */
        do {
-                spin_lock(&rnp->lock);          /* irqs already disabled. */
+                raw_spin_lock(&rnp->lock);      /* irqs already disabled. */
                rnp->qsmaskinit &= ~mask;
                if (rnp->qsmaskinit != 0) {
-                        spin_unlock(&rnp->lock); /* irqs remain disabled. */
+                        if (rnp != rdp->mynode)
+                                raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
                        break;
                }
+                if (rnp == rdp->mynode)
-                /*
+                        need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
-                 * If there was a task blocking the current grace period,
+                else
-                 * and if all CPUs have checked in, we need to propagate
+                        raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-                 * the quiescent state up the rcu_node hierarchy.  But that
-                 * is inconvenient at the moment due to deadlock issues if
-                 * this should end the current grace period.  So set the
-                 * offlined CPU's bit in ->qsmask in order to force the
-                 * next force_quiescent_state() invocation to clean up this
-                 * mess in a deadlock-free manner.
-                 */
-                if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
-                        rnp->qsmask |= mask;
                mask = rnp->grpmask;
-                spin_unlock(&rnp->lock);        /* irqs remain disabled. */
                rnp = rnp->parent;
        } while (rnp != NULL);
-        lastcomp = rsp->completed;
-        spin_unlock_irqrestore(&rsp->onofflock, flags);
+        /*
+         * We still hold the leaf rcu_node structure lock here, and
+         * irqs are still disabled.  The reason for this subterfuge is
+         * that invoking rcu_report_unblock_qs_rnp() with ->onofflock
+         * held leads to deadlock.
+         */
+        raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
+        rnp = rdp->mynode;
+        if (need_report & RCU_OFL_TASKS_NORM_GP)
+                rcu_report_unblock_qs_rnp(rnp, flags);
+        else
+                raw_spin_unlock_irqrestore(&rnp->lock, flags);
+        if (need_report & RCU_OFL_TASKS_EXP_GP)
+                rcu_report_exp_rnp(rsp, rnp);
        rcu_adopt_orphan_cbs(rsp);
 }
@@ -1094,11 +1166,9 @@ void rcu_check_callbacks(int cpu, int user)
 /*
 * Scan the leaf rcu_node structures, processing dyntick state for any that
 * have not yet encountered a quiescent state, using the function specified.
- * Returns 1 if the current grace period ends while scanning (possibly
+ * The caller must have suppressed start of new grace periods.
- * because we made it end).
 */
-static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
+static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
-                               int (*f)(struct rcu_data *))
 {
        unsigned long bit;
        int cpu;
@@ -1108,13 +1178,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
        rcu_for_each_leaf_node(rsp, rnp) {
                mask = 0;
-                spin_lock_irqsave(&rnp->lock, flags);
+                raw_spin_lock_irqsave(&rnp->lock, flags);
-                if (rsp->completed != lastcomp) {
+                if (!rcu_gp_in_progress(rsp)) {
-                        spin_unlock_irqrestore(&rnp->lock, flags);
+                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
-                        return 1;
+                        return;
                }
                if (rnp->qsmask == 0) {
-                        spin_unlock_irqrestore(&rnp->lock, flags);
+                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
                        continue;
                }
                cpu = rnp->grplo;
@@ -1123,15 +1193,14 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
                        if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
                                mask |= bit;
                }
-                if (mask != 0 && rsp->completed == lastcomp) {
+                if (mask != 0) {
-                        /* cpu_quiet_msk() releases rnp->lock. */
+                        /* rcu_report_qs_rnp() releases rnp->lock. */
-                        cpu_quiet_msk(mask, rsp, rnp, flags);
+                        rcu_report_qs_rnp(mask, rsp, rnp, flags);
                        continue;
                }
-                spin_unlock_irqrestore(&rnp->lock, flags);
+                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        }
-        return 0;
 }
 /*
@@ -1141,31 +1210,26 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
 static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 {
        unsigned long flags;
-        long lastcomp;
        struct rcu_node *rnp = rcu_get_root(rsp);
-        u8 signaled;
        if (!rcu_gp_in_progress(rsp))
                return;  /* No grace period in progress, nothing to force. */
-        if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
+        if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
                rsp->n_force_qs_lh++; /* Inexact, can lose counts.  Tough! */
                return; /* Someone else is already on the job. */
        }
-        if (relaxed &&
+        if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
-            (long)(rsp->jiffies_force_qs - jiffies) >= 0)
+                goto unlock_fqs_ret; /* no emergency and done recently. */
-                goto unlock_ret; /* no emergency and done recently. */
        rsp->n_force_qs++;
-        spin_lock(&rnp->lock);
+        raw_spin_lock(&rnp->lock);  /* irqs already disabled */
-        lastcomp = rsp->completed;
-        signaled = rsp->signaled;
        rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
-        if (lastcomp == rsp->gpnum) {
+        if (!rcu_gp_in_progress(rsp)) {
                rsp->n_force_qs_ngp++;
-                spin_unlock(&rnp->lock);
+                raw_spin_unlock(&rnp->lock);  /* irqs remain disabled */
-                goto unlock_ret;  /* no GP in progress, time updated. */
+                goto unlock_fqs_ret;  /* no GP in progress, time updated. */
        }
-        spin_unlock(&rnp->lock);
+        rsp->fqs_active = 1;
-        switch (signaled) {
+        switch (rsp->signaled) {
        case RCU_GP_IDLE:
        case RCU_GP_INIT:
@@ -1173,37 +1237,38 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
        case RCU_SAVE_DYNTICK:
                if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
                        break; /* So gcc recognizes the dead code. */
+                raw_spin_unlock(&rnp->lock);  /* irqs remain disabled */
                /* Record dyntick-idle state. */
-                if (rcu_process_dyntick(rsp, lastcomp,
+                force_qs_rnp(rsp, dyntick_save_progress_counter);
-                                        dyntick_save_progress_counter))
+                raw_spin_lock(&rnp->lock);  /* irqs already disabled */
-                        goto unlock_ret;
+                if (rcu_gp_in_progress(rsp))
-                /* Update state, record completion counter. */
-                spin_lock(&rnp->lock);
-                if (lastcomp == rsp->completed &&
-                    rsp->signaled == RCU_SAVE_DYNTICK) {
                        rsp->signaled = RCU_FORCE_QS;
-                        dyntick_record_completed(rsp, lastcomp);
-                }
-                spin_unlock(&rnp->lock);
                break;
        case RCU_FORCE_QS:
                /* Check dyntick-idle state, send IPI to laggarts. */
-                if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp),
+                raw_spin_unlock(&rnp->lock);  /* irqs remain disabled */
-                                        rcu_implicit_dynticks_qs))
+                force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
-                        goto unlock_ret;
                /* Leave state in case more forcing is required. */
+                raw_spin_lock(&rnp->lock);  /* irqs already disabled */
                break;
        }
-unlock_ret:
+        rsp->fqs_active = 0;
-        spin_unlock_irqrestore(&rsp->fqslock, flags);
+        if (rsp->fqs_need_gp) {
+                raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
+                rsp->fqs_need_gp = 0;
+                rcu_start_gp(rsp, flags); /* releases rnp->lock */
+                return;
+        }
+        raw_spin_unlock(&rnp->lock);  /* irqs remain disabled */
+unlock_fqs_ret:
+        raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
 }
 #else /* #ifdef CONFIG_SMP */
@@ -1231,7 +1296,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
         * If an RCU GP has gone long enough, go check for dyntick
         * idle CPUs and, if needed, send resched IPIs.
         */
-        if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
+        if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
                force_quiescent_state(rsp, 1);
        /*
@@ -1245,7 +1310,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
        /* Does this CPU require a not-yet-started grace period? */
        if (cpu_needs_another_gp(rsp, rdp)) {
-                spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
+                raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
                rcu_start_gp(rsp, flags);  /* releases above lock */
        }
@@ -1276,6 +1341,9 @@ static void rcu_process_callbacks(struct softirq_action *unused)
         * grace-period manipulations above.
         */
        smp_mb(); /* See above block comment. */
+        /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
+        rcu_needs_cpu_flush();
 }
 static void
@@ -1310,7 +1378,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
                unsigned long nestflag;
                struct rcu_node *rnp_root = rcu_get_root(rsp);
-                spin_lock_irqsave(&rnp_root->lock, nestflag);
+                raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
                rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock. */
        }
@@ -1328,7 +1396,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
                        force_quiescent_state(rsp, 0);
                rdp->n_force_qs_snap = rsp->n_force_qs;
                rdp->qlen_last_fqs_check = rdp->qlen;
-        } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
+        } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
                force_quiescent_state(rsp, 1);
        local_irq_restore(flags);
 }
@@ -1351,6 +1419,68 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 }
 EXPORT_SYMBOL_GPL(call_rcu_bh);
+/**
+ * synchronize_sched - wait until an rcu-sched grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu-sched
+ * grace period has elapsed, in other words after all currently executing
+ * rcu-sched read-side critical sections have completed.  These read-side
+ * critical sections are delimited by rcu_read_lock_sched() and
+ * rcu_read_unlock_sched(), and may be nested.  Note that preempt_disable(),
+ * local_irq_disable(), and so on may be used in place of
+ * rcu_read_lock_sched().
+ *
+ * This means that all preempt_disable code sequences, including NMI and
+ * hardware-interrupt handlers, in progress on entry will have completed
+ * before this primitive returns.  However, this does not guarantee that
+ * softirq handlers will have completed, since in some kernels, these
+ * handlers can run in process context, and can block.
+ *
+ * This primitive provides the guarantees made by the (now removed)
+ * synchronize_kernel() API.  In contrast, synchronize_rcu() only
+ * guarantees that rcu_read_lock() sections will have completed.
+ * In "classic RCU", these two guarantees happen to be one and
+ * the same, but can differ in realtime RCU implementations.
+ */
+void synchronize_sched(void)
+{
+        struct rcu_synchronize rcu;
+        if (rcu_blocking_is_gp())
+                return;
+        init_completion(&rcu.completion);
+        /* Will wake me after RCU finished. */
+        call_rcu_sched(&rcu.head, wakeme_after_rcu);
+        /* Wait for it. */
+        wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(synchronize_sched);
+/**
+ * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu_bh grace
+ * period has elapsed, in other words after all currently executing rcu_bh
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
+ * and may be nested.
+ */
+void synchronize_rcu_bh(void)
+{
+        struct rcu_synchronize rcu;
+        if (rcu_blocking_is_gp())
+                return;
+        init_completion(&rcu.completion);
+        /* Will wake me after RCU finished. */
+        call_rcu_bh(&rcu.head, wakeme_after_rcu);
+        /* Wait for it. */
+        wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
 /*
 * Check to see if there is any immediate RCU-related work to be done
 * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -1360,6 +1490,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
 */
 static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 {
+        struct rcu_node *rnp = rdp->mynode;
        rdp->n_rcu_pending++;
        /* Check for CPU stalls, if enabled. */
@@ -1384,20 +1516,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
        }
        /* Has another RCU grace period completed?  */
-        if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */
+        if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
                rdp->n_rp_gp_completed++;
                return 1;
        }
        /* Has a new RCU grace period started? */
-        if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */
+        if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
                rdp->n_rp_gp_started++;
                return 1;
        }
        /* Has an RCU GP gone long enough to send resched IPIs &c? */
        if (rcu_gp_in_progress(rsp) &&
-            ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
+            ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
                rdp->n_rp_need_fqs++;
                return 1;
        }
@@ -1422,10 +1554,9 @@ static int rcu_pending(int cpu)
 /*
 * Check to see if any future RCU-related work will need to be done
 * by the current CPU, even if none need be done immediately, returning
- * 1 if so.  This function is part of the RCU implementation; it is -not-
+ * 1 if so.
- * an exported member of the RCU API.
 */
-int rcu_needs_cpu(int cpu)
+static int rcu_needs_cpu_quick_check(int cpu)
 {
        /* RCU callbacks either ready or pending? */
        return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1521,7 +1652,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
        struct rcu_node *rnp = rcu_get_root(rsp);
        /* Set up local state, ensuring consistent view of global state. */
-        spin_lock_irqsave(&rnp->lock, flags);
+        raw_spin_lock_irqsave(&rnp->lock, flags);
        rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
        rdp->nxtlist = NULL;
        for (i = 0; i < RCU_NEXT_SIZE; i++)
@@ -1531,7 +1662,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
        rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
 #endif /* #ifdef CONFIG_NO_HZ */
        rdp->cpu = cpu;
-        spin_unlock_irqrestore(&rnp->lock, flags);
+        raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }
 /*
@@ -1544,25 +1675,20 @@ static void __cpuinit
 rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
 {
        unsigned long flags;
-        long lastcomp;
        unsigned long mask;
        struct rcu_data *rdp = rsp->rda[cpu];
        struct rcu_node *rnp = rcu_get_root(rsp);
        /* Set up local state, ensuring consistent view of global state. */
-        spin_lock_irqsave(&rnp->lock, flags);
+        raw_spin_lock_irqsave(&rnp->lock, flags);
-        lastcomp = rsp->completed;
-        rdp->completed = lastcomp;
-        rdp->gpnum = lastcomp;
        rdp->passed_quiesc = 0;  /* We could be racing with new GP, */
        rdp->qs_pending = 1;     /*  so set up to respond to current GP. */
        rdp->beenonline = 1;     /* We have now been online. */
        rdp->preemptable = preemptable;
-        rdp->passed_quiesc_completed = lastcomp - 1;
        rdp->qlen_last_fqs_check = 0;
        rdp->n_force_qs_snap = rsp->n_force_qs;
        rdp->blimit = blimit;
-        spin_unlock(&rnp->lock);                /* irqs remain disabled. */
+        raw_spin_unlock(&rnp->lock);            /* irqs remain disabled. */
        /*
         * A new grace period might start here.  If so, we won't be part
@@ -1570,21 +1696,26 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
         */
        /* Exclude any attempts to start a new GP on large systems. */
-        spin_lock(&rsp->onofflock);             /* irqs already disabled. */
+        raw_spin_lock(&rsp->onofflock);         /* irqs already disabled. */
        /* Add CPU to rcu_node bitmasks. */
        rnp = rdp->mynode;
        mask = rdp->grpmask;
        do {
                /* Exclude any attempts to start a new GP on small systems. */
-                spin_lock(&rnp->lock);  /* irqs already disabled. */
+                raw_spin_lock(&rnp->lock);      /* irqs already disabled. */
                rnp->qsmaskinit |= mask;
                mask = rnp->grpmask;
-                spin_unlock(&rnp->lock); /* irqs already disabled. */
+                if (rnp == rdp->mynode) {
+                        rdp->gpnum = rnp->completed; /* if GP in progress... */
+                        rdp->completed = rnp->completed;
+                        rdp->passed_quiesc_completed = rnp->completed - 1;
+                }
+                raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
                rnp = rnp->parent;
        } while (rnp != NULL && !(rnp->qsmaskinit & mask));
-        spin_unlock_irqrestore(&rsp->onofflock, flags);
+        raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 static void __cpuinit rcu_online_cpu(int cpu)
@@ -1597,8 +1728,8 @@ static void __cpuinit rcu_online_cpu(int cpu)
 /*
 * Handle CPU online/offline notification events.
 */
-int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
-                             unsigned long action, void *hcpu)
+                                    unsigned long action, void *hcpu)
 {
        long cpu = (long)hcpu;
@@ -1668,11 +1799,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
 */
 static void __init rcu_init_one(struct rcu_state *rsp)
 {
+        static char *buf[] = { "rcu_node_level_0",
+                               "rcu_node_level_1",
+                               "rcu_node_level_2",
+                               "rcu_node_level_3" };  /* Match MAX_RCU_LVLS */
        int cpustride = 1;
        int i;
        int j;
        struct rcu_node *rnp;
+        BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */
        /* Initialize the level-tracking arrays. */
        for (i = 1; i < NUM_RCU_LVLS; i++)
@@ -1685,8 +1822,9 @@ static void __init rcu_init_one(struct rcu_state *rsp)
                cpustride *= rsp->levelspread[i];
                rnp = rsp->level[i];
                for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
-                        if (rnp != rcu_get_root(rsp))
+                        raw_spin_lock_init(&rnp->lock);
-                                spin_lock_init(&rnp->lock);
+                        lockdep_set_class_and_name(&rnp->lock,
+                                                   &rcu_node_class[i], buf[i]);
                        rnp->gpnum = 0;
                        rnp->qsmask = 0;
                        rnp->qsmaskinit = 0;
@@ -1707,9 +1845,10 @@ static void __init rcu_init_one(struct rcu_state *rsp)
                        rnp->level = i;
                        INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
                        INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
+                        INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
+                        INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
                }
        }
-        spin_lock_init(&rcu_get_root(rsp)->lock);
 }
 /*
@@ -1735,16 +1874,30 @@ do { \
        } \
 } while (0)
-void __init __rcu_init(void)
+void __init rcu_init(void)
 {
+        int cpu;
        rcu_bootup_announce();
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
        printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#if NUM_RCU_LVL_4 != 0
+        printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
+#endif /* #if NUM_RCU_LVL_4 != 0 */
        RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
        RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
        __rcu_init_preempt();
        open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+        /*
+         * We don't need protection against CPU-hotplug here because
+         * this is called early in boot, before either interrupts
+         * or the scheduler are operational.
+         */
+        cpu_notifier(rcu_cpu_notify, 0);
+        for_each_online_cpu(cpu)
+                rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
 }
 #include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1899023b0962..4a525a30e08e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -34,10 +34,11 @@
 * In practice, this has not been tested, so there is probably some
 * bug somewhere.
 */
-#define MAX_RCU_LVLS 3
+#define MAX_RCU_LVLS 4
 #define RCU_FANOUT            (CONFIG_RCU_FANOUT)
 #define RCU_FANOUT_SQ         (RCU_FANOUT * RCU_FANOUT)
 #define RCU_FANOUT_CUBE       (RCU_FANOUT_SQ * RCU_FANOUT)
+#define RCU_FANOUT_FOURTH     (RCU_FANOUT_CUBE * RCU_FANOUT)
 #if NR_CPUS <= RCU_FANOUT
 #  define NUM_RCU_LVLS        1
@@ -45,23 +46,33 @@
 #  define NUM_RCU_LVL_1       (NR_CPUS)
 #  define NUM_RCU_LVL_2       0
 #  define NUM_RCU_LVL_3       0
+#  define NUM_RCU_LVL_4       0
 #elif NR_CPUS <= RCU_FANOUT_SQ
 #  define NUM_RCU_LVLS        2
 #  define NUM_RCU_LVL_0       1
 #  define NUM_RCU_LVL_1       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
 #  define NUM_RCU_LVL_2       (NR_CPUS)
 #  define NUM_RCU_LVL_3       0
+#  define NUM_RCU_LVL_4       0
 #elif NR_CPUS <= RCU_FANOUT_CUBE
 #  define NUM_RCU_LVLS        3
 #  define NUM_RCU_LVL_0       1
 #  define NUM_RCU_LVL_1       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
 #  define NUM_RCU_LVL_2       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
 #  define NUM_RCU_LVL_3       NR_CPUS
+#  define NUM_RCU_LVL_4       0
+#elif NR_CPUS <= RCU_FANOUT_FOURTH
+#  define NUM_RCU_LVLS        4
+#  define NUM_RCU_LVL_0       1
+#  define NUM_RCU_LVL_1       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
+#  define NUM_RCU_LVL_2       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
+#  define NUM_RCU_LVL_3       DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
+#  define NUM_RCU_LVL_4       NR_CPUS
 #else
 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
 #endif /* #if (NR_CPUS) <= RCU_FANOUT */
-#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
 #define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
 /*
@@ -79,9 +90,12 @@ struct rcu_dynticks {
 * Definition for node within the RCU grace-period-detection hierarchy.
 */
 struct rcu_node {
-        spinlock_t lock;        /* Root rcu_node's lock protects some */
+        raw_spinlock_t lock;    /* Root rcu_node's lock protects some */
                                /*  rcu_state fields as well as following. */
-        long    gpnum;          /* Current grace period for this node. */
+        unsigned long gpnum;    /* Current grace period for this node. */
+                                /*  This will either be equal to or one */
+                                /*  behind the root rcu_node's gpnum. */
+        unsigned long completed; /* Last GP completed for this node. */
                                /*  This will either be equal to or one */
                                /*  behind the root rcu_node's gpnum. */
        unsigned long qsmask;   /* CPUs or groups that need to switch in */
@@ -90,8 +104,12 @@ struct rcu_node {
                                /*  an rcu_data structure, otherwise, each */
                                /*  bit corresponds to a child rcu_node */
                                /*  structure. */
+        unsigned long expmask;  /* Groups that have ->blocked_tasks[] */
+                                /*  elements that need to drain to allow the */
+                                /*  current expedited grace period to */
+                                /*  complete (only for TREE_PREEMPT_RCU). */
        unsigned long qsmaskinit;
-                                /* Per-GP initialization for qsmask. */
+                                /* Per-GP initial value for qsmask & expmask. */
        unsigned long grpmask;  /* Mask to apply to parent qsmask. */
                                /*  Only one bit will be set in this mask. */
        int     grplo;          /* lowest-numbered CPU or group here. */
@@ -99,7 +117,7 @@ struct rcu_node {
        u8      grpnum;         /* CPU/group number for next level up. */
        u8      level;          /* root is at level 0. */
        struct rcu_node *parent;
-        struct list_head blocked_tasks[2];
+        struct list_head blocked_tasks[4];
                                /* Tasks blocked in RCU read-side critsect. */
                                /*  Grace period number (->gpnum) x blocked */
                                /*  by tasks on the (x & 0x1) element of the */
@@ -114,6 +132,21 @@ struct rcu_node {
        for ((rnp) = &(rsp)->node[0]; \
             (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+/*
+ * Do a breadth-first scan of the non-leaf rcu_node structures for the
+ * specified rcu_state structure.  Note that if there is a singleton
+ * rcu_node tree with but one rcu_node structure, this loop is a no-op.
+ *
+ * The scan walks ->node[] in array order, starting at ->node[0] and
+ * stopping just before ->level[NUM_RCU_LVLS - 1], i.e. before the first
+ * rcu_node of the last level.  Both macro arguments are evaluated more
+ * than once, so they must be free of side effects.
+ */
+#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
+        for ((rnp) = &(rsp)->node[0]; \
+             (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
+/*
+ * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
+ * structure.  Note that if there is a singleton rcu_node tree with but
+ * one rcu_node structure, this loop -will- visit the rcu_node structure.
+ * It is still a leaf node, even if it is also the root node.
+ */
 #define rcu_for_each_leaf_node(rsp, rnp) \
        for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
             (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
@@ -128,11 +161,11 @@ struct rcu_node {
 /* Per-CPU data for read-copy update. */
 struct rcu_data {
        /* 1) quiescent-state and grace-period handling : */
-        long            completed;      /* Track rsp->completed gp number */
+        unsigned long   completed;      /* Track rsp->completed gp number */
                                        /*  in order to detect GP end. */
-        long            gpnum;          /* Highest gp number that this CPU */
+        unsigned long   gpnum;          /* Highest gp number that this CPU */
                                        /*  is aware of having started. */
-        long            passed_quiesc_completed;
+        unsigned long   passed_quiesc_completed;
                                        /* Value of completed at time of qs. */
        bool            passed_quiesc;  /* User-mode/idle loop etc. */
        bool            qs_pending;     /* Core waits for quiesc state. */
@@ -188,14 +221,14 @@ struct rcu_data {
        unsigned long resched_ipi;      /* Sent a resched IPI. */
        /* 5) __rcu_pending() statistics. */
-        long n_rcu_pending;             /* rcu_pending() calls since boot. */
+        unsigned long n_rcu_pending;    /* rcu_pending() calls since boot. */
-        long n_rp_qs_pending;
+        unsigned long n_rp_qs_pending;
-        long n_rp_cb_ready;
+        unsigned long n_rp_cb_ready;
-        long n_rp_cpu_needs_gp;
+        unsigned long n_rp_cpu_needs_gp;
-        long n_rp_gp_completed;
+        unsigned long n_rp_gp_completed;
-        long n_rp_gp_started;
+        unsigned long n_rp_gp_started;
-        long n_rp_need_fqs;
+        unsigned long n_rp_need_fqs;
-        long n_rp_need_nothing;
+        unsigned long n_rp_need_nothing;
        int cpu;
 };
@@ -213,15 +246,27 @@ struct rcu_data {
 #define RCU_JIFFIES_TILL_FORCE_QS        3      /* for rsp->jiffies_force_qs */
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-#define RCU_SECONDS_TILL_STALL_CHECK   (10 * HZ)  /* for rsp->jiffies_stall */
-#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ)  /* for rsp->jiffies_stall */
+#ifdef CONFIG_PROVE_RCU
-#define RCU_STALL_RAT_DELAY             2         /* Allow other CPUs time */
+#define RCU_STALL_DELAY_DELTA          (5 * HZ)
-                                                  /*  to take at least one */
+#else
-                                                  /*  scheduling clock irq */
+#define RCU_STALL_DELAY_DELTA          0
-                                                  /*  before ratting on them. */
+#endif
+#define RCU_SECONDS_TILL_STALL_CHECK   (10 * HZ + RCU_STALL_DELAY_DELTA)
+                                                /* for rsp->jiffies_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
+                                                /* for rsp->jiffies_stall */
+#define RCU_STALL_RAT_DELAY             2       /* Allow other CPUs time */
+                                                /*  to take at least one */
+                                                /*  scheduling clock irq */
+                                                /*  before ratting on them. */
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b)) /* a >= b, wrap-tolerant; assumes unsigned long operands */
+#define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b)) /* a < b, wrap-tolerant; assumes unsigned long operands */
 /*
 * RCU global state, including node hierarchy.  This hierarchy is
 * represented in "heap" form in a dense array.  The root (first level)
@@ -243,12 +288,19 @@ struct rcu_state {
        u8      signaled ____cacheline_internodealigned_in_smp;
                                                /* Force QS state. */
-        long    gpnum;                          /* Current gp number. */
+        u8      fqs_active;                     /* force_quiescent_state() */
-        long    completed;                      /* # of last completed gp. */
+                                                /*  is running. */
+        u8      fqs_need_gp;                    /* A CPU was prevented from */
+                                                /*  starting a new grace */
+                                                /*  period because */
+                                                /*  force_quiescent_state() */
+                                                /*  was running. */
+        unsigned long gpnum;                    /* Current gp number. */
+        unsigned long completed;                /* # of last completed gp. */
-        /* End  of fields guarded by root rcu_node's lock. */
+        /* End of fields guarded by root rcu_node's lock. */
-        spinlock_t onofflock;                   /* exclude on/offline and */
+        raw_spinlock_t onofflock;               /* exclude on/offline and */
                                                /*  starting new GP.  Also */
                                                /*  protects the following */
                                                /*  orphan_cbs fields. */
@@ -258,7 +310,7 @@ struct rcu_state {
                                                /*  going offline. */
        struct rcu_head **orphan_cbs_tail;      /* And tail pointer. */
        long orphan_qlen;                       /* Number of orphaned cbs. */
-        spinlock_t fqslock;                     /* Only one task forcing */
+        raw_spinlock_t fqslock;                 /* Only one task forcing */
                                                /*  quiescent states. */
        unsigned long jiffies_force_qs;         /* Time at which to invoke */
                                                /*  force_quiescent_state(). */
@@ -274,12 +326,14 @@ struct rcu_state {
        unsigned long jiffies_stall;            /* Time at which to check */
                                                /*  for CPU stalls. */
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-#ifdef CONFIG_NO_HZ
-        long dynticks_completed;                /* Value of completed @ snap. */
-#endif /* #ifdef CONFIG_NO_HZ */
 };
-#ifdef RCU_TREE_NONCORE
+/* Return values for rcu_preempt_offline_tasks(). */
+#define RCU_OFL_TASKS_NORM_GP   0x1             /* Tasks blocking normal */
+                                                /*  GP were moved to root. */
+#define RCU_OFL_TASKS_EXP_GP    0x2             /* Tasks blocking expedited */
+                                                /*  GP were moved to root. */
 /*
 * RCU implementation internal declarations:
@@ -295,14 +349,19 @@ extern struct rcu_state rcu_preempt_state;
 DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-#else /* #ifdef RCU_TREE_NONCORE */
+#ifndef RCU_TREE_NONCORE
 /* Forward declarations for rcutree_plugin.h */
-static inline void rcu_bootup_announce(void);
+static void rcu_bootup_announce(void);
 long rcu_batches_completed(void);
 static void rcu_preempt_note_context_switch(int cpu);
 static int rcu_preempted_readers(struct rcu_node *rnp);
+#ifdef CONFIG_HOTPLUG_CPU
+static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
+                                      unsigned long flags);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+static void rcu_print_detail_task_stall(struct rcu_state *rsp);
 static void rcu_print_task_stall(struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
@@ -315,10 +374,14 @@ static void rcu_preempt_offline_cpu(int cpu);
 static void rcu_preempt_check_callbacks(int cpu);
 static void rcu_preempt_process_callbacks(void);
 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
+#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
 static int rcu_preempt_pending(int cpu);
 static int rcu_preempt_needs_cpu(int cpu);
 static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
 static void rcu_preempt_send_cbs_to_orphanage(void);
 static void __init __rcu_init_preempt(void);
+static void rcu_needs_cpu_flush(void);
-#endif /* #else #ifdef RCU_TREE_NONCORE */
+#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index ef2a58c2b9d5..79b53bda8943 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -24,16 +24,19 @@
 *         Paul E. McKenney <paulmck@linux.vnet.ibm.com>
 */
+#include <linux/delay.h>
 #ifdef CONFIG_TREE_PREEMPT_RCU
 struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
+static int rcu_preempted_readers_exp(struct rcu_node *rnp);
 /*
 * Tell them what RCU they are running.
 */
-static inline void rcu_bootup_announce(void)
+static void __init rcu_bootup_announce(void)
 {
        printk(KERN_INFO
               "Experimental preemptable hierarchical RCU implementation.\n");
@@ -59,6 +62,15 @@ long rcu_batches_completed(void)
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 /*
+ * Force a quiescent state for preemptible RCU.
+ *
+ * Exported wrapper that calls force_quiescent_state() on rcu_preempt_state
+ * with a second argument of 0; the meaning of that argument is defined by
+ * force_quiescent_state() itself.
+ */
+void rcu_force_quiescent_state(void)
+{
+        force_quiescent_state(&rcu_preempt_state, 0);
+}
+EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
+/*
 * Record a preemptable-RCU quiescent state for the specified CPU.  Note
 * that this just means that the task currently running on the CPU is
 * not in a quiescent state.  There might be any number of tasks blocked
@@ -67,7 +79,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
 static void rcu_preempt_qs(int cpu)
 {
        struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
-        rdp->passed_quiesc_completed = rdp->completed;
+        rdp->passed_quiesc_completed = rdp->gpnum - 1;
        barrier();
        rdp->passed_quiesc = 1;
 }
@@ -99,7 +111,7 @@ static void rcu_preempt_note_context_switch(int cpu)
                /* Possibly blocking in an RCU read-side critical section. */
                rdp = rcu_preempt_state.rda[cpu];
                rnp = rdp->mynode;
-                spin_lock_irqsave(&rnp->lock, flags);
+                raw_spin_lock_irqsave(&rnp->lock, flags);
                t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
                t->rcu_blocked_node = rnp;
@@ -120,7 +132,7 @@ static void rcu_preempt_note_context_switch(int cpu)
                WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
                phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
                list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
-                spin_unlock_irqrestore(&rnp->lock, flags);
+                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        }
        /*
@@ -157,14 +169,58 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
 */
 static int rcu_preempted_readers(struct rcu_node *rnp)
 {
-        return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
+        int phase = rnp->gpnum & 0x1;
+        return !list_empty(&rnp->blocked_tasks[phase]) ||
+               !list_empty(&rnp->blocked_tasks[phase + 2]);
 }
+/*
+ * Record a quiescent state for all tasks that were previously queued
+ * on the specified rcu_node structure and that were blocking the current
+ * RCU grace period.  The caller must hold the specified rnp->lock with
+ * irqs disabled, and this lock is released upon return.
+ *
+ * @flags is the irq state to restore: it is used directly by
+ * raw_spin_unlock_irqrestore() on the early-exit path and is otherwise
+ * handed on to rcu_report_qs_rsp() or rcu_report_qs_rnp().
+ *
+ * If the node still has bits set in ->qsmask, or rcu_preempted_readers()
+ * still finds blocked readers, nothing is reported.  Otherwise the
+ * quiescent state is reported through rcu_report_qs_rsp() when @rnp has
+ * no parent, or through rcu_report_qs_rnp() on the parent (whose lock is
+ * acquired here) using @rnp's ->grpmask.
+ *
+ * NOTE(review): because the early-exit path restores irqs from @flags and
+ * both reporting helpers receive @flags, irqs presumably do -not- remain
+ * disabled on return; the two helper paths also rely on those helpers to
+ * drop the lock they are entered with -- confirm against their definitions.
+ */
+static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
+        __releases(rnp->lock)
+{
+        unsigned long mask;
+        struct rcu_node *rnp_p;
+        if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
+                raw_spin_unlock_irqrestore(&rnp->lock, flags);
+                return;  /* Still need more quiescent states! */
+        }
+        rnp_p = rnp->parent;
+        if (rnp_p == NULL) {
+                /*
+                 * Either there is only one rcu_node in the tree,
+                 * or tasks were kicked up to root rcu_node due to
+                 * CPUs going offline.
+                 */
+                rcu_report_qs_rsp(&rcu_preempt_state, flags);
+                return;
+        }
+        /* Report up the rest of the hierarchy. */
+        mask = rnp->grpmask;
+        raw_spin_unlock(&rnp->lock);    /* irqs remain disabled. */
+        raw_spin_lock(&rnp_p->lock);    /* irqs already disabled. */
+        rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
+}
+/*
+ * Handle special cases during rcu_read_unlock(), such as needing to
+ * notify RCU core processing or task having blocked during the RCU
+ * read-side critical section.
+ */
 static void rcu_read_unlock_special(struct task_struct *t)
 {
        int empty;
+        int empty_exp;
        unsigned long flags;
-        unsigned long mask;
        struct rcu_node *rnp;
        int special;
@@ -201,42 +257,36 @@ static void rcu_read_unlock_special(struct task_struct *t)
                 */
                for (;;) {
                        rnp = t->rcu_blocked_node;
-                        spin_lock(&rnp->lock);  /* irqs already disabled. */
+                        raw_spin_lock(&rnp->lock);  /* irqs already disabled. */
                        if (rnp == t->rcu_blocked_node)
                                break;
-                        spin_unlock(&rnp->lock);  /* irqs remain disabled. */
+                        raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
                }
                empty = !rcu_preempted_readers(rnp);
+                empty_exp = !rcu_preempted_readers_exp(rnp);
+                smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
                list_del_init(&t->rcu_node_entry);
                t->rcu_blocked_node = NULL;
                /*
                 * If this was the last task on the current list, and if
                 * we aren't waiting on any CPUs, report the quiescent state.
-                 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk()
+                 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
-                 * drop rnp->lock and restore irq.
                 */
-                if (!empty && rnp->qsmask == 0 &&
+                if (empty)
-                    !rcu_preempted_readers(rnp)) {
+                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
-                        struct rcu_node *rnp_p;
+                else
+                        rcu_report_unblock_qs_rnp(rnp, flags);
-                        if (rnp->parent == NULL) {
-                                /* Only one rcu_node in the tree. */
+                /*
-                                cpu_quiet_msk_finish(&rcu_preempt_state, flags);
+                 * If this was the last task on the expedited lists,
-                                return;
+                 * then we need to report up the rcu_node hierarchy.
-                        }
+                 */
-                        /* Report up the rest of the hierarchy. */
+                if (!empty_exp && !rcu_preempted_readers_exp(rnp))
-                        mask = rnp->grpmask;
+                        rcu_report_exp_rnp(&rcu_preempt_state, rnp);
-                        spin_unlock_irqrestore(&rnp->lock, flags);
+        } else {
-                        rnp_p = rnp->parent;
+                local_irq_restore(flags);
-                        spin_lock_irqsave(&rnp_p->lock, flags);
-                        WARN_ON_ONCE(rnp->qsmask);
-                        cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
-                        return;
-                }
-                spin_unlock(&rnp->lock);
        }
-        local_irq_restore(flags);
 }
 /*
@@ -254,29 +304,73 @@ void __rcu_read_unlock(void)
        if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
            unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
                rcu_read_unlock_special(t);
+#ifdef CONFIG_PROVE_LOCKING
+        WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0);
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
 }
 EXPORT_SYMBOL_GPL(__rcu_read_unlock);
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
+/*
+ * Dump detailed information for all tasks blocking the current RCU
+ * grace period on the specified rcu_node structure.
+ *
+ * The rcu_preempted_readers() check is made before rnp->lock is taken.
+ * The list itself is then walked with rnp->lock held and irqs disabled,
+ * calling sched_show_task() on each task found.
+ *
+ * NOTE(review): rcu_preempted_readers() also tests
+ * ->blocked_tasks[phase + 2], but only ->blocked_tasks[phase] is walked
+ * here -- confirm whether tasks on the other list should be shown too.
+ */
+static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+        unsigned long flags;
+        struct list_head *lp;
+        int phase;
+        struct task_struct *t;
+        if (rcu_preempted_readers(rnp)) {
+                raw_spin_lock_irqsave(&rnp->lock, flags);
+                phase = rnp->gpnum & 0x1;       /* List index for the current GP. */
+                lp = &rnp->blocked_tasks[phase];
+                list_for_each_entry(t, lp, rcu_node_entry)
+                        sched_show_task(t);
+                raw_spin_unlock_irqrestore(&rnp->lock, flags);
+        }
+}
+/*
+ * Dump detailed information for all tasks blocking the current RCU
+ * grace period.  The root rcu_node is dumped first, followed by every
+ * leaf rcu_node.
+ *
+ * NOTE(review): rcu_for_each_leaf_node() starts at
+ * ->level[NUM_RCU_LVLS - 1], so in a tree with a single rcu_node the root
+ * looks like it is visited a second time as a leaf -- confirm whether the
+ * duplicate dump is acceptable.
+ */
+static void rcu_print_detail_task_stall(struct rcu_state *rsp)
+{
+        struct rcu_node *rnp = rcu_get_root(rsp);
+        rcu_print_detail_task_stall_rnp(rnp);
+        rcu_for_each_leaf_node(rsp, rnp)
+                rcu_print_detail_task_stall_rnp(rnp);
+}
+#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
+static void rcu_print_detail_task_stall(struct rcu_state *rsp)
+{ /* Intentionally empty: stub for !CONFIG_RCU_CPU_STALL_VERBOSE builds. */
+}
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
 /*
 * Scan the current list of tasks blocked within RCU read-side critical
 * sections, printing out the tid of each.
 */
 static void rcu_print_task_stall(struct rcu_node *rnp)
 {
-        unsigned long flags;
        struct list_head *lp;
        int phase;
        struct task_struct *t;
        if (rcu_preempted_readers(rnp)) {
-                spin_lock_irqsave(&rnp->lock, flags);
                phase = rnp->gpnum & 0x1;
                lp = &rnp->blocked_tasks[phase];
                list_for_each_entry(t, lp, rcu_node_entry)
                        printk(" P%d", t->pid);
-                spin_unlock_irqrestore(&rnp->lock, flags);
        }
 }
@@ -303,6 +397,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 * rcu_node.  The reason for not just moving them to the immediate
 * parent is to remove the need for rcu_read_unlock_special() to
 * make more than two attempts to acquire the target rcu_node's lock.
+ * Returns a bitmask: RCU_OFL_TASKS_NORM_GP and/or RCU_OFL_TASKS_EXP_GP
+ * is set if tasks were blocking the corresponding kind of grace period.
 *
 * Returns 1 if there was previously a task blocking the current grace
 * period on the specified rcu_node structure.
@@ -316,7 +412,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
        int i;
        struct list_head *lp;
        struct list_head *lp_root;
-        int retval = rcu_preempted_readers(rnp);
+        int retval = 0;
        struct rcu_node *rnp_root = rcu_get_root(rsp);
        struct task_struct *tp;
@@ -326,7 +422,9 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
        }
        WARN_ON_ONCE(rnp != rdp->mynode &&
                     (!list_empty(&rnp->blocked_tasks[0]) ||
-                      !list_empty(&rnp->blocked_tasks[1])));
+                      !list_empty(&rnp->blocked_tasks[1]) ||
+                      !list_empty(&rnp->blocked_tasks[2]) ||
+                      !list_empty(&rnp->blocked_tasks[3])));
        /*
         * Move tasks up to root rcu_node.  Rely on the fact that the
@@ -334,19 +432,22 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
         * rcu_nodes in terms of gp_num value.  This fact allows us to
         * move the blocked_tasks[] array directly, element by element.
         */
-        for (i = 0; i < 2; i++) {
+        if (rcu_preempted_readers(rnp))
+                retval |= RCU_OFL_TASKS_NORM_GP;
+        if (rcu_preempted_readers_exp(rnp))
+                retval |= RCU_OFL_TASKS_EXP_GP;
+        for (i = 0; i < 4; i++) {
                lp = &rnp->blocked_tasks[i];
                lp_root = &rnp_root->blocked_tasks[i];
                while (!list_empty(lp)) {
                        tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
-                        spin_lock(&rnp_root->lock); /* irqs already disabled */
+                        raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
                        list_del(&tp->rcu_node_entry);
                        tp->rcu_blocked_node = rnp_root;
                        list_add(&tp->rcu_node_entry, lp_root);
-                        spin_unlock(&rnp_root->lock); /* irqs remain disabled */
+                        raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */
                }
        }
        return retval;
 }
@@ -398,14 +499,183 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 }
 EXPORT_SYMBOL_GPL(call_rcu);
+/**
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+void synchronize_rcu(void)
+{
+        struct rcu_synchronize rcu;
+        if (!rcu_scheduler_active)
+                return;
+        init_completion(&rcu.completion);
+        /* Will wake me after RCU finished. */
+        call_rcu(&rcu.head, wakeme_after_rcu);
+        /* Wait for it. */
+        wait_for_completion(&rcu.completion);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
+static long sync_rcu_preempt_exp_count;
+static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
+/*
+ * Return non-zero if there are any tasks in RCU read-side critical
+ * sections blocking the current preemptible-RCU expedited grace period.
+ * If there is no preemptible-RCU expedited grace period currently in
+ * progress, returns zero unconditionally.
+ */
+static int rcu_preempted_readers_exp(struct rcu_node *rnp)
+{
+        return !list_empty(&rnp->blocked_tasks[2]) ||
+               !list_empty(&rnp->blocked_tasks[3]);
+}
+/*
+ * Return non-zero if there is no RCU expedited grace period in progress
+ * for the specified rcu_node structure, in other words, if all CPUs and
+ * tasks covered by the specified rcu_node structure have done their bit
+ * for the current expedited grace period.  Works only for preemptible
+ * RCU -- other RCU implementations use other means.
+ *
+ * Caller must hold sync_rcu_preempt_exp_mutex.
+ */
+static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+{
+        return !rcu_preempted_readers_exp(rnp) &&
+               ACCESS_ONCE(rnp->expmask) == 0;
+}
 /*
- * Wait for an rcu-preempt grace period.  We are supposed to expedite the
+ * Report the exit from RCU read-side critical section for the last task
- * grace period, but this is the crude slow compatability hack, so just
+ * that queued itself during or before the current expedited preemptible-RCU
- * invoke synchronize_rcu().
+ * grace period.  This event is reported either to the rcu_node structure on
+ * which the task was queued or to one of that rcu_node structure's ancestors,
+ * recursively up the tree.  (Calm down, calm down, we do the recursion
+ * iteratively!)
+ *
+ * Caller must hold sync_rcu_preempt_exp_mutex.
+ */
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+        unsigned long flags;
+        unsigned long mask;
+        raw_spin_lock_irqsave(&rnp->lock, flags);
+        for (;;) {
+                if (!sync_rcu_preempt_exp_done(rnp))
+                        break;
+                if (rnp->parent == NULL) {
+                        wake_up(&sync_rcu_preempt_exp_wq);
+                        break;
+                }
+                mask = rnp->grpmask;
+                raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
+                rnp = rnp->parent;
+                raw_spin_lock(&rnp->lock); /* irqs already disabled */
+                rnp->expmask &= ~mask;
+        }
+        raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+/*
+ * Snapshot the tasks blocking the newly started preemptible-RCU expedited
+ * grace period for the specified rcu_node structure.  If there are no such
+ * tasks, report it up the rcu_node hierarchy.
+ *
+ * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock.
+ */
+static void
+sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+        int must_wait;
+        raw_spin_lock(&rnp->lock); /* irqs already disabled */
+        list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
+        list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
+        must_wait = rcu_preempted_readers_exp(rnp);
+        raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
+        if (!must_wait)
+                rcu_report_exp_rnp(rsp, rnp);
+}
+/*
+ * Wait for an rcu-preempt grace period, but expedite it.  The basic idea
+ * is to invoke synchronize_sched_expedited() to push all the tasks to
+ * the ->blocked_tasks[] lists, move all entries from the first set of
+ * ->blocked_tasks[] lists to the second set, and finally wait for this
+ * second set to drain.
 */
 void synchronize_rcu_expedited(void)
 {
-        synchronize_rcu();
+        unsigned long flags;
+        struct rcu_node *rnp;
+        struct rcu_state *rsp = &rcu_preempt_state;
+        long snap;
+        int trycount = 0;
+        smp_mb(); /* Caller's modifications seen first by other CPUs. */
+        snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
+        smp_mb(); /* Above access cannot bleed into critical section. */
+        /*
+         * Acquire lock, falling back to synchronize_rcu() if too many
+         * lock-acquisition failures.  Of course, if someone does the
+         * expedited grace period for us, just leave.
+         */
+        while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
+                if (trycount++ < 10)
+                        udelay(trycount * num_online_cpus());
+                else {
+                        synchronize_rcu();
+                        return;
+                }
+                if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
+                        goto mb_ret; /* Others did our work for us. */
+        }
+        if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
+                goto unlock_mb_ret; /* Others did our work for us. */
+        /* force all RCU readers onto blocked_tasks[]. */
+        synchronize_sched_expedited();
+        raw_spin_lock_irqsave(&rsp->onofflock, flags);
+        /* Initialize ->expmask for all non-leaf rcu_node structures. */
+        rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
+                raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+                rnp->expmask = rnp->qsmaskinit;
+                raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+        }
+        /* Snapshot current state of ->blocked_tasks[] lists. */
+        rcu_for_each_leaf_node(rsp, rnp)
+                sync_rcu_preempt_exp_init(rsp, rnp);
+        if (NUM_RCU_NODES > 1)
+                sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
+        raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+        /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
+        rnp = rcu_get_root(rsp);
+        wait_event(sync_rcu_preempt_exp_wq,
+                   sync_rcu_preempt_exp_done(rnp));
+        /* Clean up and exit. */
+        smp_mb(); /* ensure expedited GP seen before counter increment. */
+        ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
+unlock_mb_ret:
+        mutex_unlock(&sync_rcu_preempt_exp_mutex);
+mb_ret:
+        smp_mb(); /* ensure subsequent action seen after grace period. */
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
@@ -481,7 +751,7 @@ void exit_rcu(void)
 /*
 * Tell them what RCU they are running.
 */
-static inline void rcu_bootup_announce(void)
+static void __init rcu_bootup_announce(void)
 {
        printk(KERN_INFO "Hierarchical RCU implementation.\n");
 }
@@ -496,6 +766,16 @@ long rcu_batches_completed(void)
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 /*
+ * Force a quiescent state for RCU, which, because there is no preemptible
+ * RCU, becomes the same as rcu-sched.
+ */
+void rcu_force_quiescent_state(void)
+{
+        rcu_sched_force_quiescent_state();
+}
+EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
+/*
 * Because preemptable RCU does not exist, we never have to check for
 * CPUs being in quiescent states.
 */
@@ -512,12 +792,30 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
        return 0;
 }
+#ifdef CONFIG_HOTPLUG_CPU
+/* Because preemptible RCU does not exist, no quieting of tasks. */
+static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
+{
+        raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 /*
 * Because preemptable RCU does not exist, we never have to check for
 * tasks blocked within RCU read-side critical sections.
 */
+static void rcu_print_detail_task_stall(struct rcu_state *rsp)
+{
+}
+/*
+ * Because preemptable RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections.
+ */
 static void rcu_print_task_stall(struct rcu_node *rnp)
 {
 }
@@ -594,6 +892,20 @@ void synchronize_rcu_expedited(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Because preemptable RCU does not exist, there is never any need to
+ * report on tasks preempted in RCU read-side critical sections during
+ * expedited RCU grace periods.
+ */
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+        return;
+}
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 /*
 * Because preemptable RCU does not exist, it never has any work to do.
 */
@@ -643,3 +955,115 @@ static void __init __rcu_init_preempt(void)
 }
 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
+#if !defined(CONFIG_RCU_FAST_NO_HZ)
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so.  This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ *
+ * Because we have preemptible RCU, just check whether this CPU needs
+ * any flavor of RCU.  Do not chew up lots of CPU cycles with preemption
+ * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
+ */
+int rcu_needs_cpu(int cpu)
+{
+        return rcu_needs_cpu_quick_check(cpu);
+}
+/*
+ * Check to see if we need to continue a callback-flush operation to
+ * allow the last CPU to enter dyntick-idle mode.  But fast dyntick-idle
+ * entry is not configured, so we never do need to.
+ */
+static void rcu_needs_cpu_flush(void)
+{
+}
+#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
+#define RCU_NEEDS_CPU_FLUSHES 5
+static DEFINE_PER_CPU(int, rcu_dyntick_drain);
+static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so.  This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ *
+ * Because we are not supporting preemptible RCU, attempt to accelerate
+ * any current grace periods so that RCU no longer needs this CPU, but
+ * only if all other CPUs are already in dynticks-idle mode.  This will
+ * allow the CPU cores to be powered down immediately, as opposed to after
+ * waiting many milliseconds for grace periods to elapse.
+ *
+ * Because it is not legal to invoke rcu_process_callbacks() with irqs
+ * disabled, we do one pass of force_quiescent_state(), then do a
+ * raise_softirq() to cause rcu_process_callbacks() to be invoked later.
+ * The per-cpu rcu_dyntick_drain variable controls the sequencing.
+ */
+int rcu_needs_cpu(int cpu)
+{
+        int c = 0;
+        int thatcpu;
+        /* Check for being in the holdoff period. */
+        if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
+                return rcu_needs_cpu_quick_check(cpu);
+        /* Don't bother unless we are the last non-dyntick-idle CPU. */
+        for_each_cpu_not(thatcpu, nohz_cpu_mask)
+                if (thatcpu != cpu) {
+                        per_cpu(rcu_dyntick_drain, cpu) = 0;
+                        per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
+                        return rcu_needs_cpu_quick_check(cpu);
+                }
+        /* Check and update the rcu_dyntick_drain sequencing. */
+        if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
+                /* First time through, initialize the counter. */
+                per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
+        } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
+                /* We have hit the limit, so time to give up. */
+                per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
+                return rcu_needs_cpu_quick_check(cpu);
+        }
+        /* Do one step pushing remaining RCU callbacks through. */
+        if (per_cpu(rcu_sched_data, cpu).nxtlist) {
+                rcu_sched_qs(cpu);
+                force_quiescent_state(&rcu_sched_state, 0);
+                c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
+        }
+        if (per_cpu(rcu_bh_data, cpu).nxtlist) {
+                rcu_bh_qs(cpu);
+                force_quiescent_state(&rcu_bh_state, 0);
+                c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
+        }
+        /* If RCU callbacks are still pending, RCU still needs this CPU. */
+        if (c)
+                raise_softirq(RCU_SOFTIRQ);
+        return c;
+}
+/*
+ * Check to see if we need to continue a callback-flush operation to
+ * allow the last CPU to enter dyntick-idle mode.
+ */
+static void rcu_needs_cpu_flush(void)
+{
+        int cpu = smp_processor_id();
+        unsigned long flags;
+        if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
+                return;
+        local_irq_save(flags);
+        (void)rcu_needs_cpu(cpu);
+        local_irq_restore(flags);
+}
+#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4b31c779e62e..d45db2e35d27 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -50,7 +50,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
 {
        if (!rdp->beenonline)
                return;
-        seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d",
+        seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d",
                   rdp->cpu,
                   cpu_is_offline(rdp->cpu) ? '!' : ' ',
                   rdp->completed, rdp->gpnum,
@@ -105,7 +105,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
 {
        if (!rdp->beenonline)
                return;
-        seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d",
+        seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d",
                   rdp->cpu,
                   cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
                   rdp->completed, rdp->gpnum,
@@ -155,12 +155,15 @@ static const struct file_operations rcudata_csv_fops = {
 static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
 {
+        unsigned long gpnum;
        int level = 0;
+        int phase;
        struct rcu_node *rnp;
-        seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
+        gpnum = rsp->gpnum;
+        seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
                      "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
-                   rsp->completed, rsp->gpnum, rsp->signaled,
+                   rsp->completed, gpnum, rsp->signaled,
                   (long)(rsp->jiffies_force_qs - jiffies),
                   (int)(jiffies & 0xffff),
                   rsp->n_force_qs, rsp->n_force_qs_ngp,
@@ -171,8 +174,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
                        seq_puts(m, "\n");
                        level = rnp->level;
                }
-                seq_printf(m, "%lx/%lx %d:%d ^%d    ",
+                phase = gpnum & 0x1;
+                seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d    ",
                           rnp->qsmask, rnp->qsmaskinit,
+                           "T."[list_empty(&rnp->blocked_tasks[phase])],
+                           "E."[list_empty(&rnp->blocked_tasks[phase + 2])],
+                           "T."[list_empty(&rnp->blocked_tasks[!phase])],
+                           "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
                           rnp->grplo, rnp->grphi, rnp->grpnum);
        }
        seq_puts(m, "\n");
@@ -207,12 +215,12 @@ static const struct file_operations rcuhier_fops = {
 static int show_rcugp(struct seq_file *m, void *unused)
 {
 #ifdef CONFIG_TREE_PREEMPT_RCU
-        seq_printf(m, "rcu_preempt: completed=%ld  gpnum=%ld\n",
+        seq_printf(m, "rcu_preempt: completed=%lu  gpnum=%lu\n",
                   rcu_preempt_state.completed, rcu_preempt_state.gpnum);
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-        seq_printf(m, "rcu_sched: completed=%ld  gpnum=%ld\n",
+        seq_printf(m, "rcu_sched: completed=%lu  gpnum=%lu\n",
                   rcu_sched_state.completed, rcu_sched_state.gpnum);
-        seq_printf(m, "rcu_bh: completed=%ld  gpnum=%ld\n",
+        seq_printf(m, "rcu_bh: completed=%lu  gpnum=%lu\n",
                   rcu_bh_state.completed, rcu_bh_state.gpnum);
         return 0;
 }
diff --git a/kernel/relay.c b/kernel/relay.c
index 760c26209a3c..3d97f2821611 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1198,7 +1198,7 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
        relay_consume_bytes(rbuf, buf->private);
 }
-static struct pipe_buf_operations relay_pipe_buf_ops = {
+static const struct pipe_buf_operations relay_pipe_buf_ops = {
        .can_merge = 0,
        .map = generic_pipe_buf_map,
        .unmap = generic_pipe_buf_unmap,
@@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
 /*
 *      subbuf_splice_actor - splice up to one subbuf's worth of data
 */
-static int subbuf_splice_actor(struct file *in,
+static ssize_t subbuf_splice_actor(struct file *in,
                               loff_t *ppos,
                               struct pipe_inode_info *pipe,
                               size_t len,
                               unsigned int flags,
                               int *nonpad_ret)
 {
-        unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret;
+        unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
        struct rchan_buf *rbuf = in->private_data;
        unsigned int subbuf_size = rbuf->chan->subbuf_size;
        uint64_t pos = (uint64_t) *ppos;
@@ -1241,6 +1241,7 @@ static int subbuf_splice_actor(struct file *in,
                .ops = &relay_pipe_buf_ops,
                .spd_release = relay_page_release,
        };
+        ssize_t ret;
        if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
                return 0;
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bcdabf37c40b..c7eaa37a768b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -10,7 +10,6 @@
 #include <linux/types.h>
 #include <linux/parser.h>
 #include <linux/fs.h>
-#include <linux/slab.h>
 #include <linux/res_counter.h>
 #include <linux/uaccess.h>
 #include <linux/mm.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index fb11a58b9594..9c358e263534 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -188,20 +188,65 @@ static int __release_resource(struct resource *old)
        return -EINVAL;
 }
+static void __release_child_resources(struct resource *r)
+{
+        struct resource *tmp, *p;
+        resource_size_t size;
+        p = r->child;
+        r->child = NULL;
+        while (p) {
+                tmp = p;
+                p = p->sibling;
+                tmp->parent = NULL;
+                tmp->sibling = NULL;
+                __release_child_resources(tmp);
+                printk(KERN_DEBUG "release child resource %pR\n", tmp);
+                /* need to restore size, and keep flags */
+                size = resource_size(tmp);
+                tmp->start = 0;
+                tmp->end = size - 1;
+        }
+}
+void release_child_resources(struct resource *r)
+{
+        write_lock(&resource_lock);
+        __release_child_resources(r);
+        write_unlock(&resource_lock);
+}
 /**
- * request_resource - request and reserve an I/O or memory resource
+ * request_resource_conflict - request and reserve an I/O or memory resource
 * @root: root resource descriptor
 * @new: resource descriptor desired by caller
 *
- * Returns 0 for success, negative error code on error.
+ * Returns 0 for success, conflict resource on error.
 */
-int request_resource(struct resource *root, struct resource *new)
+struct resource *request_resource_conflict(struct resource *root, struct resource *new)
 {
        struct resource *conflict;
        write_lock(&resource_lock);
        conflict = __request_resource(root, new);
        write_unlock(&resource_lock);
+        return conflict;
+}
+/**
+ * request_resource - request and reserve an I/O or memory resource
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ *
+ * Returns 0 for success, negative error code on error.
+ */
+int request_resource(struct resource *root, struct resource *new)
+{
+        struct resource *conflict;
+        conflict = request_resource_conflict(root, new);
        return conflict ? -EBUSY : 0;
 }
@@ -274,7 +319,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
                void *arg, int (*func)(unsigned long, unsigned long, void *))
 {
        struct resource res;
-        unsigned long pfn, len;
+        unsigned long pfn, end_pfn;
        u64 orig_end;
        int ret = -1;
@@ -284,9 +329,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
        orig_end = res.end;
        while ((res.start < res.end) &&
                (find_next_system_ram(&res, "System RAM") >= 0)) {
-                pfn = (unsigned long)(res.start >> PAGE_SHIFT);
+                pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
-                len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT);
+                end_pfn = (res.end + 1) >> PAGE_SHIFT;
-                ret = (*func)(pfn, len, arg);
+                if (end_pfn > pfn)
+                        ret = (*func)(pfn, end_pfn - pfn, arg);
                if (ret)
                        break;
                res.start = res.end + 1;
@@ -297,46 +343,63 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
 #endif
+static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
+{
+        return 1;
+}
+/*
+ * This generic page_is_ram() returns true if specified address is
+ * registered as "System RAM" in iomem_resource list.
+ */
+int __weak page_is_ram(unsigned long pfn)
+{
+        return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
+}
 /*
 * Find empty slot in the resource tree given range and alignment.
 */
 static int find_resource(struct resource *root, struct resource *new,
                         resource_size_t size, resource_size_t min,
                         resource_size_t max, resource_size_t align,
-                         void (*alignf)(void *, struct resource *,
+                         resource_size_t (*alignf)(void *,
-                                        resource_size_t, resource_size_t),
+                                                   const struct resource *,
+                                                   resource_size_t,
+                                                   resource_size_t),
                         void *alignf_data)
 {
        struct resource *this = root->child;
+        struct resource tmp = *new;
-        new->start = root->start;
+        tmp.start = root->start;
        /*
         * Skip past an allocated resource that starts at 0, since the assignment
-         * of this->start - 1 to new->end below would cause an underflow.
+         * of this->start - 1 to tmp.end below would cause an underflow.
         */
        if (this && this->start == 0) {
-                new->start = this->end + 1;
+                tmp.start = this->end + 1;
                this = this->sibling;
        }
        for(;;) {
                if (this)
-                        new->end = this->start - 1;
+                        tmp.end = this->start - 1;
                else
-                        new->end = root->end;
+                        tmp.end = root->end;
-                if (new->start < min)
+                if (tmp.start < min)
-                        new->start = min;
+                        tmp.start = min;
-                if (new->end > max)
+                if (tmp.end > max)
-                        new->end = max;
+                        tmp.end = max;
-                new->start = ALIGN(new->start, align);
+                tmp.start = ALIGN(tmp.start, align);
                if (alignf)
-                        alignf(alignf_data, new, size, align);
+                        tmp.start = alignf(alignf_data, &tmp, size, align);
-                if (new->start < new->end && new->end - new->start >= size - 1) {
+                if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
-                        new->end = new->start + size - 1;
+                        new->start = tmp.start;
+                        new->end = tmp.start + size - 1;
                        return 0;
                }
                if (!this)
                        break;
-                new->start = this->end + 1;
+                tmp.start = this->end + 1;
                this = this->sibling;
        }
        return -EBUSY;
@@ -356,8 +419,10 @@ static int find_resource(struct resource *root, struct resource *new,
 int allocate_resource(struct resource *root, struct resource *new,
                      resource_size_t size, resource_size_t min,
                      resource_size_t max, resource_size_t align,
-                      void (*alignf)(void *, struct resource *,
+                      resource_size_t (*alignf)(void *,
-                                     resource_size_t, resource_size_t),
+                                                const struct resource *,
+                                                resource_size_t,
+                                                resource_size_t),
                      void *alignf_data)
 {
        int err;
@@ -424,25 +489,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
 }
 /**
- * insert_resource - Inserts a resource in the resource tree
+ * insert_resource_conflict - Inserts resource in the resource tree
 * @parent: parent of the new resource
 * @new: new resource to insert
 *
- * Returns 0 on success, -EBUSY if the resource can't be inserted.
+ * Returns 0 on success, conflict resource if the resource can't be inserted.
 *
- * This function is equivalent to request_resource when no conflict
+ * This function is equivalent to request_resource_conflict when no conflict
 * happens. If a conflict happens, and the conflicting resources
 * entirely fit within the range of the new resource, then the new
 * resource is inserted and the conflicting resources become children of
 * the new resource.
 */
-int insert_resource(struct resource *parent, struct resource *new)
+struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
 {
        struct resource *conflict;
        write_lock(&resource_lock);
        conflict = __insert_resource(parent, new);
        write_unlock(&resource_lock);
+        return conflict;
+}
+/**
+ * insert_resource - Inserts a resource in the resource tree
+ * @parent: parent of the new resource
+ * @new: new resource to insert
+ *
+ * Returns 0 on success, -EBUSY if the resource can't be inserted.
+ */
+int insert_resource(struct resource *parent, struct resource *new)
+{
+        struct resource *conflict;
+        conflict = insert_resource_conflict(parent, new);
        return conflict ? -EBUSY : 0;
 }
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 5fcb4fe645e2..ddabb54bb5c8 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -37,8 +37,8 @@ do {								\
        if (rt_trace_on) {                                      \
                rt_trace_on = 0;                                \
                console_verbose();                              \
-                if (spin_is_locked(&current->pi_lock))          \
+                if (raw_spin_is_locked(&current->pi_lock))      \
-                        spin_unlock(&current->pi_lock);         \
+                        raw_spin_unlock(&current->pi_lock);     \
        }                                                       \
 } while (0)
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 29bd4baf9e75..a9604815786a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -138,9 +138,9 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
 {
        unsigned long flags;
-        spin_lock_irqsave(&task->pi_lock, flags);
+        raw_spin_lock_irqsave(&task->pi_lock, flags);
        __rt_mutex_adjust_prio(task);
-        spin_unlock_irqrestore(&task->pi_lock, flags);
+        raw_spin_unlock_irqrestore(&task->pi_lock, flags);
 }
 /*
@@ -195,7 +195,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
        /*
         * Task can not go away as we did a get_task() before !
         */
-        spin_lock_irqsave(&task->pi_lock, flags);
+        raw_spin_lock_irqsave(&task->pi_lock, flags);
        waiter = task->pi_blocked_on;
        /*
@@ -231,8 +231,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
                goto out_unlock_pi;
        lock = waiter->lock;
-        if (!spin_trylock(&lock->wait_lock)) {
+        if (!raw_spin_trylock(&lock->wait_lock)) {
-                spin_unlock_irqrestore(&task->pi_lock, flags);
+                raw_spin_unlock_irqrestore(&task->pi_lock, flags);
                cpu_relax();
                goto retry;
        }
@@ -240,7 +240,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
        /* Deadlock detection */
        if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
                debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
-                spin_unlock(&lock->wait_lock);
+                raw_spin_unlock(&lock->wait_lock);
                ret = deadlock_detect ? -EDEADLK : 0;
                goto out_unlock_pi;
        }
@@ -253,13 +253,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
        plist_add(&waiter->list_entry, &lock->wait_list);
        /* Release the task */
-        spin_unlock_irqrestore(&task->pi_lock, flags);
+        raw_spin_unlock_irqrestore(&task->pi_lock, flags);
        put_task_struct(task);
        /* Grab the next task */
        task = rt_mutex_owner(lock);
        get_task_struct(task);
-        spin_lock_irqsave(&task->pi_lock, flags);
+        raw_spin_lock_irqsave(&task->pi_lock, flags);
        if (waiter == rt_mutex_top_waiter(lock)) {
                /* Boost the owner */
@@ -277,10 +277,10 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
                __rt_mutex_adjust_prio(task);
        }
-        spin_unlock_irqrestore(&task->pi_lock, flags);
+        raw_spin_unlock_irqrestore(&task->pi_lock, flags);
        top_waiter = rt_mutex_top_waiter(lock);
-        spin_unlock(&lock->wait_lock);
+        raw_spin_unlock(&lock->wait_lock);
        if (!detect_deadlock && waiter != top_waiter)
                goto out_put_task;
@@ -288,7 +288,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
        goto again;
 out_unlock_pi:
-        spin_unlock_irqrestore(&task->pi_lock, flags);
+        raw_spin_unlock_irqrestore(&task->pi_lock, flags);
 out_put_task:
        put_task_struct(task);
@@ -313,9 +313,9 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
        if (pendowner == task)
                return 1;
-        spin_lock_irqsave(&pendowner->pi_lock, flags);
+        raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
        if (task->prio >= pendowner->prio) {
-                spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+                raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
                return 0;
        }
@@ -325,7 +325,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
         * priority.
         */
        if (likely(!rt_mutex_has_waiters(lock))) {
-                spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+                raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
                return 1;
        }
@@ -333,7 +333,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
        next = rt_mutex_top_waiter(lock);
        plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
        __rt_mutex_adjust_prio(pendowner);
-        spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+        raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
        /*
         * We are going to steal the lock and a waiter was
@@ -350,10 +350,10 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
         * might be task:
         */
        if (likely(next->task != task)) {
-                spin_lock_irqsave(&task->pi_lock, flags);
+                raw_spin_lock_irqsave(&task->pi_lock, flags);
                plist_add(&next->pi_list_entry, &task->pi_waiters);
                __rt_mutex_adjust_prio(task);
-                spin_unlock_irqrestore(&task->pi_lock, flags);
+                raw_spin_unlock_irqrestore(&task->pi_lock, flags);
        }
        return 1;
 }
@@ -420,7 +420,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
        unsigned long flags;
        int chain_walk = 0, res;
-        spin_lock_irqsave(&task->pi_lock, flags);
+        raw_spin_lock_irqsave(&task->pi_lock, flags);
        __rt_mutex_adjust_prio(task);
        waiter->task = task;
        waiter->lock = lock;
@@ -434,17 +434,17 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
        task->pi_blocked_on = waiter;
-        spin_unlock_irqrestore(&task->pi_lock, flags);
+        raw_spin_unlock_irqrestore(&task->pi_lock, flags);
        if (waiter == rt_mutex_top_waiter(lock)) {
-                spin_lock_irqsave(&owner->pi_lock, flags);
+                raw_spin_lock_irqsave(&owner->pi_lock, flags);
                plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
                plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
                __rt_mutex_adjust_prio(owner);
                if (owner->pi_blocked_on)
                        chain_walk = 1;
-                spin_unlock_irqrestore(&owner->pi_lock, flags);
+                raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
        }
        else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
                chain_walk = 1;
@@ -459,12 +459,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
         */
        get_task_struct(owner);
-        spin_unlock(&lock->wait_lock);
+        raw_spin_unlock(&lock->wait_lock);
        res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
                                         task);
-        spin_lock(&lock->wait_lock);
+        raw_spin_lock(&lock->wait_lock);
        return res;
 }
@@ -483,7 +483,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
        struct task_struct *pendowner;
        unsigned long flags;
-        spin_lock_irqsave(&current->pi_lock, flags);
+        raw_spin_lock_irqsave(&current->pi_lock, flags);
        waiter = rt_mutex_top_waiter(lock);
        plist_del(&waiter->list_entry, &lock->wait_list);
@@ -500,7 +500,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
        rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
-        spin_unlock_irqrestore(&current->pi_lock, flags);
+        raw_spin_unlock_irqrestore(&current->pi_lock, flags);
        /*
         * Clear the pi_blocked_on variable and enqueue a possible
@@ -509,7 +509,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
         * waiter with higher priority than pending-owner->normal_prio
         * is blocked on the unboosted (pending) owner.
         */
-        spin_lock_irqsave(&pendowner->pi_lock, flags);
+        raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
        WARN_ON(!pendowner->pi_blocked_on);
        WARN_ON(pendowner->pi_blocked_on != waiter);
@@ -523,7 +523,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
                next = rt_mutex_top_waiter(lock);
                plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
        }
-        spin_unlock_irqrestore(&pendowner->pi_lock, flags);
+        raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
        wake_up_process(pendowner);
 }
@@ -541,15 +541,15 @@ static void remove_waiter(struct rt_mutex *lock,
        unsigned long flags;
        int chain_walk = 0;
-        spin_lock_irqsave(&current->pi_lock, flags);
+        raw_spin_lock_irqsave(&current->pi_lock, flags);
        plist_del(&waiter->list_entry, &lock->wait_list);
        waiter->task = NULL;
        current->pi_blocked_on = NULL;
-        spin_unlock_irqrestore(&current->pi_lock, flags);
+        raw_spin_unlock_irqrestore(&current->pi_lock, flags);
        if (first && owner != current) {
-                spin_lock_irqsave(&owner->pi_lock, flags);
+                raw_spin_lock_irqsave(&owner->pi_lock, flags);
                plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
@@ -564,7 +564,7 @@ static void remove_waiter(struct rt_mutex *lock,
                if (owner->pi_blocked_on)
                        chain_walk = 1;
-                spin_unlock_irqrestore(&owner->pi_lock, flags);
+                raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
        }
        WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
@@ -575,11 +575,11 @@ static void remove_waiter(struct rt_mutex *lock,
        /* gets dropped in rt_mutex_adjust_prio_chain()! */
        get_task_struct(owner);
-        spin_unlock(&lock->wait_lock);
+        raw_spin_unlock(&lock->wait_lock);
        rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
-        spin_lock(&lock->wait_lock);
+        raw_spin_lock(&lock->wait_lock);
 }
 /*
@@ -592,15 +592,15 @@ void rt_mutex_adjust_pi(struct task_struct *task)
        struct rt_mutex_waiter *waiter;
        unsigned long flags;
-        spin_lock_irqsave(&task->pi_lock, flags);
+        raw_spin_lock_irqsave(&task->pi_lock, flags);
        waiter = task->pi_blocked_on;
        if (!waiter || waiter->list_entry.prio == task->prio) {
-                spin_unlock_irqrestore(&task->pi_lock, flags);
+                raw_spin_unlock_irqrestore(&task->pi_lock, flags);
                return;
        }
-        spin_unlock_irqrestore(&task->pi_lock, flags);
+        raw_spin_unlock_irqrestore(&task->pi_lock, flags);
        /* gets dropped in rt_mutex_adjust_prio_chain()! */
        get_task_struct(task);
@@ -672,14 +672,14 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
                                break;
                }
-                spin_unlock(&lock->wait_lock);
+                raw_spin_unlock(&lock->wait_lock);
                debug_rt_mutex_print_deadlock(waiter);
                if (waiter->task)
                        schedule_rt_mutex(lock);
-                spin_lock(&lock->wait_lock);
+                raw_spin_lock(&lock->wait_lock);
                set_current_state(state);
        }
@@ -700,11 +700,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
        debug_rt_mutex_init_waiter(&waiter);
        waiter.task = NULL;
-        spin_lock(&lock->wait_lock);
+        raw_spin_lock(&lock->wait_lock);
        /* Try to acquire the lock again: */
        if (try_to_take_rt_mutex(lock)) {
-                spin_unlock(&lock->wait_lock);
+                raw_spin_unlock(&lock->wait_lock);
                return 0;
        }
@@ -731,7 +731,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
         */
        fixup_rt_mutex_waiters(lock);
-        spin_unlock(&lock->wait_lock);
+        raw_spin_unlock(&lock->wait_lock);
        /* Remove pending timer: */
        if (unlikely(timeout))
@@ -758,7 +758,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
 {
        int ret = 0;
-        spin_lock(&lock->wait_lock);
+        raw_spin_lock(&lock->wait_lock);
        if (likely(rt_mutex_owner(lock) != current)) {
@@ -770,7 +770,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
                fixup_rt_mutex_waiters(lock);
        }
-        spin_unlock(&lock->wait_lock);
+        raw_spin_unlock(&lock->wait_lock);
        return ret;
 }
@@ -781,7 +781,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
 static void __sched
 rt_mutex_slowunlock(struct rt_mutex *lock)
 {
-        spin_lock(&lock->wait_lock);
+        raw_spin_lock(&lock->wait_lock);
        debug_rt_mutex_unlock(lock);
@@ -789,13 +789,13 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
        if (!rt_mutex_has_waiters(lock)) {
                lock->owner = NULL;
-                spin_unlock(&lock->wait_lock);
+                raw_spin_unlock(&lock->wait_lock);
                return;
        }
        wakeup_next_waiter(lock);
-        spin_unlock(&lock->wait_lock);
+        raw_spin_unlock(&lock->wait_lock);
        /* Undo pi boosting if necessary: */
        rt_mutex_adjust_prio(current);
@@ -970,8 +970,8 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
 void __rt_mutex_init(struct rt_mutex *lock, const char *name)
 {
        lock->owner = NULL;
-        spin_lock_init(&lock->wait_lock);
+        raw_spin_lock_init(&lock->wait_lock);
-        plist_head_init(&lock->wait_list, &lock->wait_lock);
+        plist_head_init_raw(&lock->wait_list, &lock->wait_lock);
        debug_rt_mutex_init(lock, name);
 }
@@ -1032,7 +1032,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
 {
        int ret;
-        spin_lock(&lock->wait_lock);
+        raw_spin_lock(&lock->wait_lock);
        mark_rt_mutex_waiters(lock);
@@ -1040,7 +1040,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
                /* We got the lock for task. */
                debug_rt_mutex_lock(lock);
                rt_mutex_set_owner(lock, task, 0);
-                spin_unlock(&lock->wait_lock);
+                raw_spin_unlock(&lock->wait_lock);
                rt_mutex_deadlock_account_lock(lock, task);
                return 1;
        }
@@ -1056,7 +1056,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
                 */
                ret = 0;
        }
-        spin_unlock(&lock->wait_lock);
+        raw_spin_unlock(&lock->wait_lock);
        debug_rt_mutex_print_deadlock(waiter);
@@ -1106,7 +1106,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
 {
        int ret;
-        spin_lock(&lock->wait_lock);
+        raw_spin_lock(&lock->wait_lock);
        set_current_state(TASK_INTERRUPTIBLE);
@@ -1124,7 +1124,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
         */
        fixup_rt_mutex_waiters(lock);
-        spin_unlock(&lock->wait_lock);
+        raw_spin_unlock(&lock->wait_lock);
        /*
         * Readjust priority, when we did not get the lock. We might have been
diff --git a/kernel/sched.c b/kernel/sched.c
index adb5e923cc61..5e3c509e0efe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
+#include <linux/slab.h>
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -144,7 +145,7 @@ struct rt_prio_array {
 struct rt_bandwidth {
        /* nests inside the rq lock: */
-        spinlock_t              rt_runtime_lock;
+        raw_spinlock_t          rt_runtime_lock;
        ktime_t                 rt_period;
        u64                     rt_runtime;
        struct hrtimer          rt_period_timer;
@@ -181,7 +182,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
        rt_b->rt_period = ns_to_ktime(period);
        rt_b->rt_runtime = runtime;
-        spin_lock_init(&rt_b->rt_runtime_lock);
+        raw_spin_lock_init(&rt_b->rt_runtime_lock);
        hrtimer_init(&rt_b->rt_period_timer,
                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -203,7 +204,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
        if (hrtimer_active(&rt_b->rt_period_timer))
                return;
-        spin_lock(&rt_b->rt_runtime_lock);
+        raw_spin_lock(&rt_b->rt_runtime_lock);
        for (;;) {
                unsigned long delta;
                ktime_t soft, hard;
@@ -220,7 +221,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
                __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
                                HRTIMER_MODE_ABS_PINNED, 0);
        }
-        spin_unlock(&rt_b->rt_runtime_lock);
+        raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -236,7 +237,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
 */
 static DEFINE_MUTEX(sched_domains_mutex);
-#ifdef CONFIG_GROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
 #include <linux/cgroup.h>
@@ -246,13 +247,7 @@ static LIST_HEAD(task_groups);
 /* task group related information */
 struct task_group {
-#ifdef CONFIG_CGROUP_SCHED
        struct cgroup_subsys_state css;
-#endif
-#ifdef CONFIG_USER_SCHED
-        uid_t uid;
-#endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
        /* schedulable entities of this group on each cpu */
@@ -277,35 +272,7 @@ struct task_group {
        struct list_head children;
 };
-#ifdef CONFIG_USER_SCHED
-/* Helper function to pass uid information to create_sched_user() */
-void set_tg_uid(struct user_struct *user)
-{
-        user->tg->uid = user->uid;
-}
-/*
- * Root task group.
- *      Every UID task group (including init_task_group aka UID-0) will
- *      be a child to this group.
- */
-struct task_group root_task_group;
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* Default task group's sched entity on each cpu */
-static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
-/* Default task group's cfs_rq on each cpu */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_RT_GROUP_SCHED
-static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
-#endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
-#endif /* CONFIG_USER_SCHED */
 /* task_group_lock serializes add/remove of task groups and also changes to
 * a task group's cpu shares.
@@ -321,11 +288,7 @@ static int root_task_group_empty(void)
 }
 #endif
-#ifdef CONFIG_USER_SCHED
-# define INIT_TASK_GROUP_LOAD   (2*NICE_0_LOAD)
-#else /* !CONFIG_USER_SCHED */
 # define INIT_TASK_GROUP_LOAD   NICE_0_LOAD
-#endif /* CONFIG_USER_SCHED */
 /*
 * A weight of 0 or 1 can cause arithmetics problems.
@@ -351,11 +314,7 @@ static inline struct task_group *task_group(struct task_struct *p)
 {
        struct task_group *tg;
-#ifdef CONFIG_USER_SCHED
+#ifdef CONFIG_CGROUP_SCHED
-        rcu_read_lock();
-        tg = __task_cred(p)->user->tg;
-        rcu_read_unlock();
-#elif defined(CONFIG_CGROUP_SCHED)
        tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                                struct task_group, css);
 #else
@@ -367,6 +326,15 @@ static inline struct task_group *task_group(struct task_struct *p)
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 {
+        /*
+         * Strictly speaking, this rcu_read_lock() is not needed since the
+         * task_group is tied to the cgroup, which in turn can never go away
+         * as long as there are tasks attached to it.
+         *
+         * However, since task_group() uses task_subsys_state(), which is an
+         * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
+         */
+        rcu_read_lock();
 #ifdef CONFIG_FAIR_GROUP_SCHED
        p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
        p->se.parent = task_group(p)->se[cpu];
@@ -376,6 +344,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
        p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
        p->rt.parent = task_group(p)->rt_se[cpu];
 #endif
+        rcu_read_unlock();
 }
 #else
@@ -386,7 +355,7 @@ static inline struct task_group *task_group(struct task_struct *p)
        return NULL;
 }
-#endif  /* CONFIG_GROUP_SCHED */
+#endif  /* CONFIG_CGROUP_SCHED */
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
@@ -473,7 +442,7 @@ struct rt_rq {
        u64 rt_time;
        u64 rt_runtime;
        /* Nests inside the rq lock: */
-        spinlock_t rt_runtime_lock;
+        raw_spinlock_t rt_runtime_lock;
 #ifdef CONFIG_RT_GROUP_SCHED
        unsigned long rt_nr_boosted;
@@ -481,7 +450,6 @@ struct rt_rq {
        struct rq *rq;
        struct list_head leaf_rt_rq_list;
        struct task_group *tg;
-        struct sched_rt_entity *rt_se;
 #endif
 };
@@ -534,7 +502,7 @@ static struct root_domain def_root_domain;
 */
 struct rq {
        /* runqueue lock: */
-        spinlock_t lock;
+        raw_spinlock_t lock;
        /*
         * nr_running and cpu_load should be in the same cacheline because
@@ -544,14 +512,12 @@ struct rq {
        #define CPU_LOAD_IDX_MAX 5
        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 #ifdef CONFIG_NO_HZ
-        unsigned long last_tick_seen;
        unsigned char in_nohz_recently;
 #endif
        /* capture load from *all* tasks on this cpu: */
        struct load_weight load;
        unsigned long nr_load_updates;
        u64 nr_switches;
-        u64 nr_migrations_in;
        struct cfs_rq cfs;
        struct rt_rq rt;
@@ -601,6 +567,8 @@ struct rq {
        u64 rt_avg;
        u64 age_stamp;
+        u64 idle_stamp;
+        u64 avg_idle;
 #endif
        /* calc_load related fields */
@@ -655,6 +623,11 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
+#define rcu_dereference_check_sched_domain(p) \
+        rcu_dereference_check((p), \
+                              rcu_read_lock_sched_held() || \
+                              lockdep_is_held(&sched_domains_mutex))
 /*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
@@ -663,7 +636,7 @@ static inline int cpu_of(struct rq *rq)
 * preempt-disabled sections.
 */
 #define for_each_domain(cpu, __sd) \
-        for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
+        for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
 #define this_rq()               (&__get_cpu_var(runqueues))
@@ -695,7 +668,7 @@ inline void update_rq_clock(struct rq *rq)
 */
 int runqueue_is_locked(int cpu)
 {
-        return spin_is_locked(&cpu_rq(cpu)->lock);
+        return raw_spin_is_locked(&cpu_rq(cpu)->lock);
 }
 /*
@@ -782,7 +755,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
        if (!sched_feat_names[i])
                return -EINVAL;
-        filp->f_pos += cnt;
+        *ppos += cnt;
        return cnt;
 }
@@ -824,6 +797,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
 * default: 0.25ms
 */
 unsigned int sysctl_sched_shares_ratelimit = 250000;
+unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
 /*
 * Inject some fuzzyness into changing the per-cpu group shares
@@ -902,7 +876,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
         */
        spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-        spin_unlock_irq(&rq->lock);
+        raw_spin_unlock_irq(&rq->lock);
 }
 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
@@ -926,9 +900,9 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
        next->oncpu = 1;
 #endif
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-        spin_unlock_irq(&rq->lock);
+        raw_spin_unlock_irq(&rq->lock);
 #else
-        spin_unlock(&rq->lock);
+        raw_spin_unlock(&rq->lock);
 #endif
 }
@@ -950,18 +924,35 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 /*
+ * Check whether the task is waking. This is used to synchronize against
+ * ttwu() so that task_cpu() reports a stable number.
+ *
+ * We need to make an exception for PF_STARTING tasks because the fork
+ * path might require task_rq_lock() to work, e.g. it can call
+ * set_cpus_allowed_ptr() from the cpuset clone_ns code.
+ */
+static inline int task_is_waking(struct task_struct *p)
+{
+        return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
+}
+/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called interrupts disabled.
 */
 static inline struct rq *__task_rq_lock(struct task_struct *p)
        __acquires(rq->lock)
 {
+        struct rq *rq;
        for (;;) {
-                struct rq *rq = task_rq(p);
+                while (task_is_waking(p))
-                spin_lock(&rq->lock);
+                        cpu_relax();
-                if (likely(rq == task_rq(p)))
+                rq = task_rq(p);
+                raw_spin_lock(&rq->lock);
+                if (likely(rq == task_rq(p) && !task_is_waking(p)))
                        return rq;
-                spin_unlock(&rq->lock);
+                raw_spin_unlock(&rq->lock);
        }
 }
@@ -976,12 +967,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
        struct rq *rq;
        for (;;) {
+                while (task_is_waking(p))
+                        cpu_relax();
                local_irq_save(*flags);
                rq = task_rq(p);
-                spin_lock(&rq->lock);
+                raw_spin_lock(&rq->lock);
-                if (likely(rq == task_rq(p)))
+                if (likely(rq == task_rq(p) && !task_is_waking(p)))
                        return rq;
-                spin_unlock_irqrestore(&rq->lock, *flags);
+                raw_spin_unlock_irqrestore(&rq->lock, *flags);
        }
 }
@@ -990,19 +983,19 @@ void task_rq_unlock_wait(struct task_struct *p)
        struct rq *rq = task_rq(p);
        smp_mb(); /* spin-unlock-wait is not a full memory barrier */
-        spin_unlock_wait(&rq->lock);
+        raw_spin_unlock_wait(&rq->lock);
 }
 static void __task_rq_unlock(struct rq *rq)
        __releases(rq->lock)
 {
-        spin_unlock(&rq->lock);
+        raw_spin_unlock(&rq->lock);
 }
 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
        __releases(rq->lock)
 {
-        spin_unlock_irqrestore(&rq->lock, *flags);
+        raw_spin_unlock_irqrestore(&rq->lock, *flags);
 }
 /*
@@ -1015,7 +1008,7 @@ static struct rq *this_rq_lock(void)
        local_irq_disable();
        rq = this_rq();
-        spin_lock(&rq->lock);
+        raw_spin_lock(&rq->lock);
        return rq;
 }
@@ -1062,10 +1055,10 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
-        spin_lock(&rq->lock);
+        raw_spin_lock(&rq->lock);
        update_rq_clock(rq);
        rq->curr->sched_class->task_tick(rq, rq->curr, 1);
-        spin_unlock(&rq->lock);
+        raw_spin_unlock(&rq->lock);
        return HRTIMER_NORESTART;
 }
@@ -1078,10 +1071,10 @@ static void __hrtick_start(void *arg)
 {
        struct rq *rq = arg;
-        spin_lock(&rq->lock);
+        raw_spin_lock(&rq->lock);
        hrtimer_restart(&rq->hrtick_timer);
        rq->hrtick_csd_pending = 0;
-        spin_unlock(&rq->lock);
+        raw_spin_unlock(&rq->lock);
 }
 /*
@@ -1188,7 +1181,7 @@ static void resched_task(struct task_struct *p)
 {
        int cpu;
-        assert_spin_locked(&task_rq(p)->lock);
+        assert_raw_spin_locked(&task_rq(p)->lock);
        if (test_tsk_need_resched(p))
                return;
@@ -1210,10 +1203,10 @@ static void resched_cpu(int cpu)
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;
-        if (!spin_trylock_irqsave(&rq->lock, flags))
+        if (!raw_spin_trylock_irqsave(&rq->lock, flags))
                return;
        resched_task(cpu_curr(cpu));
-        spin_unlock_irqrestore(&rq->lock, flags);
+        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 #ifdef CONFIG_NO_HZ
@@ -1282,7 +1275,7 @@ static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 #else /* !CONFIG_SMP */
 static void resched_task(struct task_struct *p)
 {
-        assert_spin_locked(&task_rq(p)->lock);
+        assert_raw_spin_locked(&task_rq(p)->lock);
        set_tsk_need_resched(p);
 }
@@ -1399,32 +1392,6 @@ static const u32 prio_to_wmult[40] = {
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
-/*
- * runqueue iterator, to support SMP load-balancing between different
- * scheduling classes, without having to expose their internal data
- * structures to the load-balancing proper:
- */
-struct rq_iterator {
-        void *arg;
-        struct task_struct *(*start)(void *);
-        struct task_struct *(*next)(void *);
-};
-#ifdef CONFIG_SMP
-static unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-              unsigned long max_load_move, struct sched_domain *sd,
-              enum cpu_idle_type idle, int *all_pinned,
-              int *this_best_prio, struct rq_iterator *iterator);
-static int
-iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                   struct sched_domain *sd, enum cpu_idle_type idle,
-                   struct rq_iterator *iterator);
-#endif
 /* Time spent by the tasks of the cpu accounting group executing in ... */
 enum cpuacct_stat_index {
        CPUACCT_STAT_USER,      /* ... user mode */
@@ -1540,7 +1507,7 @@ static unsigned long target_load(int cpu, int type)
 static struct sched_group *group_of(int cpu)
 {
-        struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
+        struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
        if (!sd)
                return NULL;
@@ -1575,7 +1542,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static __read_mostly unsigned long *update_shares_data;
+static __read_mostly unsigned long __percpu *update_shares_data;
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
@@ -1609,11 +1576,11 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
                struct rq *rq = cpu_rq(cpu);
                unsigned long flags;
-                spin_lock_irqsave(&rq->lock, flags);
+                raw_spin_lock_irqsave(&rq->lock, flags);
                tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
                tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
                __set_se_shares(tg->se[cpu], shares);
-                spin_unlock_irqrestore(&rq->lock, flags);
+                raw_spin_unlock_irqrestore(&rq->lock, flags);
        }
 }
@@ -1624,7 +1591,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
 */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-        unsigned long weight, rq_weight = 0, shares = 0;
+        unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
        unsigned long *usd_rq_weight;
        struct sched_domain *sd = data;
        unsigned long flags;
@@ -1640,6 +1607,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
                weight = tg->cfs_rq[i]->load.weight;
                usd_rq_weight[i] = weight;
+                rq_weight += weight;
                /*
                 * If there are currently no tasks on the cpu pretend there
                 * is one of average load so that when a new task gets to
@@ -1648,10 +1616,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
                if (!weight)
                        weight = NICE_0_LOAD;
-                rq_weight += weight;
+                sum_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
+        if (!rq_weight)
+                rq_weight = sum_weight;
        if ((!shares && rq_weight) || shares > tg->shares)
                shares = tg->shares;
@@ -1706,16 +1677,6 @@ static void update_shares(struct sched_domain *sd)
        }
 }
-static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
-{
-        if (root_task_group_empty())
-                return;
-        spin_unlock(&rq->lock);
-        update_shares(sd);
-        spin_lock(&rq->lock);
-}
 static void update_h_load(long cpu)
 {
        if (root_task_group_empty())
@@ -1730,10 +1691,6 @@ static inline void update_shares(struct sched_domain *sd)
 {
 }
-static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
-{
-}
 #endif
 #ifdef CONFIG_PREEMPT
@@ -1753,7 +1710,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
        __acquires(busiest->lock)
        __acquires(this_rq->lock)
 {
-        spin_unlock(&this_rq->lock);
+        raw_spin_unlock(&this_rq->lock);
        double_rq_lock(this_rq, busiest);
        return 1;
@@ -1774,14 +1731,16 @@ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
 {
        int ret = 0;
-        if (unlikely(!spin_trylock(&busiest->lock))) {
+        if (unlikely(!raw_spin_trylock(&busiest->lock))) {
                if (busiest < this_rq) {
-                        spin_unlock(&this_rq->lock);
+                        raw_spin_unlock(&this_rq->lock);
-                        spin_lock(&busiest->lock);
+                        raw_spin_lock(&busiest->lock);
-                        spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+                        raw_spin_lock_nested(&this_rq->lock,
+                                              SINGLE_DEPTH_NESTING);
                        ret = 1;
                } else
-                        spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+                        raw_spin_lock_nested(&busiest->lock,
+                                              SINGLE_DEPTH_NESTING);
        }
        return ret;
 }
@@ -1795,7 +1754,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 {
        if (unlikely(!irqs_disabled())) {
                /* printk() doesn't work good under rq->lock */
-                spin_unlock(&this_rq->lock);
+                raw_spin_unlock(&this_rq->lock);
                BUG_ON(1);
        }
@@ -1805,9 +1764,54 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(busiest->lock)
 {
-        spin_unlock(&busiest->lock);
+        raw_spin_unlock(&busiest->lock);
        lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
 }
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+        __acquires(rq1->lock)
+        __acquires(rq2->lock)
+{
+        BUG_ON(!irqs_disabled());
+        if (rq1 == rq2) {
+                raw_spin_lock(&rq1->lock);
+                __acquire(rq2->lock);   /* Fake it out ;) */
+        } else {
+                if (rq1 < rq2) {
+                        raw_spin_lock(&rq1->lock);
+                        raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+                } else {
+                        raw_spin_lock(&rq2->lock);
+                        raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+                }
+        }
+        update_rq_clock(rq1);
+        update_rq_clock(rq2);
+}
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+        __releases(rq1->lock)
+        __releases(rq2->lock)
+{
+        raw_spin_unlock(&rq1->lock);
+        if (rq1 != rq2)
+                raw_spin_unlock(&rq2->lock);
+        else
+                __release(rq2->lock);
+}
 #endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1820,20 +1824,31 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 #endif
 static void calc_load_account_active(struct rq *this_rq);
+static void update_sysctl(void);
+static int get_update_sysctl_factor(void);
-#include "sched_stats.h"
+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
-#include "sched_idletask.c"
+{
-#include "sched_fair.c"
+        set_task_rq(p, cpu);
-#include "sched_rt.c"
+#ifdef CONFIG_SMP
-#include "../litmus/sched_litmus.c"
+        /*
-#ifdef CONFIG_SCHED_DEBUG
+         * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
-# include "sched_debug.c"
+         * successfully executed on another CPU. We must ensure that updates of
+         * per-task data have been completed by this moment.
+         */
+        smp_wmb();
+        task_thread_info(p)->cpu = cpu;
 #endif
+}
+static const struct sched_class rt_sched_class;
 #define sched_class_highest (&litmus_sched_class)
 #define for_each_class(class) \
   for (class = sched_class_highest; class; class = class->next)
+#include "sched_stats.h"
 static void inc_nr_running(struct rq *rq)
 {
        rq->nr_running++;
@@ -1871,13 +1886,14 @@ static void update_avg(u64 *avg, u64 sample)
        *avg += diff >> 3;
 }
-static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void
+enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 {
        if (wakeup)
                p->se.start_runtime = p->se.sum_exec_runtime;
        sched_info_queued(p);
-        p->sched_class->enqueue_task(rq, p, wakeup);
+        p->sched_class->enqueue_task(rq, p, wakeup, head);
        p->se.on_rq = 1;
 }
@@ -1900,6 +1916,38 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 }
 /*
+ * activate_task - move a task to the runqueue.
+ */
+static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+{
+        if (task_contributes_to_load(p))
+                rq->nr_uninterruptible--;
+        enqueue_task(rq, p, wakeup, false);
+        inc_nr_running(rq);
+}
+/*
+ * deactivate_task - remove a task from the runqueue.
+ */
+static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+{
+        if (task_contributes_to_load(p))
+                rq->nr_uninterruptible++;
+        dequeue_task(rq, p, sleep);
+        dec_nr_running(rq);
+}
+#include "sched_idletask.c"
+#include "sched_fair.c"
+#include "sched_rt.c"
+#include "../litmus/sched_litmus.c"
+#ifdef CONFIG_SCHED_DEBUG
+# include "sched_debug.c"
+#endif
+/*
 * __normal_prio - return the priority that is based on the static prio
 */
 static inline int __normal_prio(struct task_struct *p)
@@ -1945,30 +1993,6 @@ static int effective_prio(struct task_struct *p)
        return p->prio;
 }
-/*
- * activate_task - move a task to the runqueue.
- */
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
-{
-        if (task_contributes_to_load(p))
-                rq->nr_uninterruptible--;
-        enqueue_task(rq, p, wakeup);
-        inc_nr_running(rq);
-}
-/*
- * deactivate_task - remove a task from the runqueue.
- */
-static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
-{
-        if (task_contributes_to_load(p))
-                rq->nr_uninterruptible++;
-        dequeue_task(rq, p, sleep);
-        dec_nr_running(rq);
-}
 /**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
@@ -1978,20 +2002,6 @@ inline int task_curr(const struct task_struct *p)
        return cpu_curr(task_cpu(p)) == p;
 }
-static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-        set_task_rq(p, cpu);
-#ifdef CONFIG_SMP
-        /*
-         * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
-         * successfuly executed on another CPU. We must ensure that updates of
-         * per-task data have been completed by this moment.
-         */
-        smp_wmb();
-        task_thread_info(p)->cpu = cpu;
-#endif
-}
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                                       const struct sched_class *prev_class,
                                       int oldprio, int running)
@@ -2004,38 +2014,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                p->sched_class->prio_changed(rq, p, oldprio, running);
 }
-/**
- * kthread_bind - bind a just-created kthread to a cpu.
- * @p: thread created by kthread_create().
- * @cpu: cpu (might not be online, must be possible) for @k to run on.
- *
- * Description: This function is equivalent to set_cpus_allowed(),
- * except that @cpu doesn't need to be online, and the thread must be
- * stopped (i.e., just returned from kthread_create()).
- *
- * Function lives here instead of kthread.c because it messes with
- * scheduler internals which require locking.
- */
-void kthread_bind(struct task_struct *p, unsigned int cpu)
-{
-        struct rq *rq = cpu_rq(cpu);
-        unsigned long flags;
-        /* Must have done schedule() in kthread() before we set_task_cpu */
-        if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
-                WARN_ON(1);
-                return;
-        }
-        spin_lock_irqsave(&rq->lock, flags);
-        set_task_cpu(p, cpu);
-        p->cpus_allowed = cpumask_of_cpu(cpu);
-        p->rt.nr_cpus_allowed = 1;
-        p->flags |= PF_THREAD_BOUND;
-        spin_unlock_irqrestore(&rq->lock, flags);
-}
-EXPORT_SYMBOL(kthread_bind);
 #ifdef CONFIG_SMP
 /*
 * Is this task likely cache-hot:
@@ -2045,6 +2023,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 {
        s64 delta;
+        if (p->sched_class != &fair_sched_class)
+                return 0;
        /*
         * Buddy candidates are cache hot:
         */
@@ -2053,9 +2034,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
                         &p->se == cfs_rq_of(&p->se)->last))
                return 1;
-        if (p->sched_class != &fair_sched_class)
-                return 0;
        if (sysctl_sched_migration_cost == -1)
                return 1;
        if (sysctl_sched_migration_cost == 0)
@@ -2066,39 +2044,23 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
        return delta < (s64)sysctl_sched_migration_cost;
 }
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
-        int old_cpu = task_cpu(p);
+#ifdef CONFIG_SCHED_DEBUG
-        struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
+        /*
-        struct cfs_rq *old_cfsrq = task_cfs_rq(p),
+         * We should never call set_task_cpu() on a blocked task,
-                      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
+         * ttwu() will sort out the placement.
-        u64 clock_offset;
+         */
+        WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
-        clock_offset = old_rq->clock - new_rq->clock;
+                        !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+#endif
        trace_sched_migrate_task(p, new_cpu);
-#ifdef CONFIG_SCHEDSTATS
+        if (task_cpu(p) != new_cpu) {
-        if (p->se.wait_start)
-                p->se.wait_start -= clock_offset;
-        if (p->se.sleep_start)
-                p->se.sleep_start -= clock_offset;
-        if (p->se.block_start)
-                p->se.block_start -= clock_offset;
-#endif
-        if (old_cpu != new_cpu) {
                p->se.nr_migrations++;
-                new_rq->nr_migrations_in++;
+                perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
-#ifdef CONFIG_SCHEDSTATS
-                if (task_hot(p, old_rq->clock, NULL))
-                        schedstat_inc(p, se.nr_forced2_migrations);
-#endif
-                perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
-                                     1, 1, NULL, 0);
        }
-        p->se.vruntime -= old_cfsrq->min_vruntime -
-                                         new_cfsrq->min_vruntime;
        __set_task_cpu(p, new_cpu);
 }
@@ -2123,12 +2085,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
        /*
         * If the task is not on a runqueue (and not running), then
-         * it is sufficient to simply update the task's cpu field.
+         * the next wake-up will properly place the task.
         */
-        if (!p->se.on_rq && !task_running(rq, p)) {
+        if (!p->se.on_rq && !task_running(rq, p))
-                set_task_cpu(p, dest_cpu);
                return 0;
-        }
        init_completion(&req->done);
        req->task = p;
@@ -2333,6 +2293,75 @@ void task_oncpu_function_call(struct task_struct *p,
        preempt_enable();
 }
+#ifdef CONFIG_SMP
+static int select_fallback_rq(int cpu, struct task_struct *p)
+{
+        int dest_cpu;
+        const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+        /* Look for allowed, online CPU in same node. */
+        for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
+                if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+                        return dest_cpu;
+        /* Any allowed, online CPU? */
+        dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
+        if (dest_cpu < nr_cpu_ids)
+                return dest_cpu;
+        /* No more Mr. Nice Guy. */
+        if (dest_cpu >= nr_cpu_ids) {
+                rcu_read_lock();
+                cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
+                rcu_read_unlock();
+                dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
+                /*
+                 * Don't tell them about moving exiting tasks or
+                 * kernel threads (both mm NULL), since they never
+                 * leave kernel.
+                 */
+                if (p->mm && printk_ratelimit()) {
+                        printk(KERN_INFO "process %d (%s) no "
+                               "longer affine to cpu%d\n",
+                               task_pid_nr(p), p->comm, cpu);
+                }
+        }
+        return dest_cpu;
+}
+/*
+ * Gets called from 3 sites (exec, fork, wakeup). Since it is called without
+ * holding rq->lock, we need to ensure ->cpus_allowed is stable; this is done
+ * by:
+ *
+ *  exec:           is unstable, retry loop
+ *  fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
+ */
+static inline
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+{
+        int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+        /*
+         * In order not to call set_task_cpu() on a blocking task we need
+         * to rely on ttwu() to place the task on a valid ->cpus_allowed
+         * cpu.
+         *
+         * Since this is common to all placement strategies, this lives here.
+         *
+         * [ this allows ->select_task_rq() to simply return task_cpu(p) and
+         *   not worry about this generic constraint ]
+         */
+        if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
+                     !cpu_online(cpu)))
+                cpu = select_fallback_rq(task_cpu(p), p);
+        return cpu;
+}
+#endif
 /***
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
@@ -2352,7 +2381,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 {
        int cpu, orig_cpu, this_cpu, success = 0;
        unsigned long flags;
-        struct rq *rq, *orig_rq;
+        struct rq *rq;
        if (is_realtime(p))
                TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state);
@@ -2363,7 +2392,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
        this_cpu = get_cpu();
        smp_wmb();
-        rq = orig_rq = task_rq_lock(p, &flags);
+        rq = task_rq_lock(p, &flags);
        update_rq_clock(rq);
        if (!(p->state & state))
                goto out;
@@ -2387,19 +2416,34 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
        if (task_contributes_to_load(p))
                rq->nr_uninterruptible--;
        p->state = TASK_WAKING;
-        task_rq_unlock(rq, &flags);
-        cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+        if (p->sched_class->task_waking)
-        if (cpu != orig_cpu)
+                p->sched_class->task_waking(rq, p);
-                set_task_cpu(p, cpu);
-        rq = task_rq_lock(p, &flags);
+        __task_rq_unlock(rq);
-        if (rq != orig_rq)
+        cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-                update_rq_clock(rq);
+        if (cpu != orig_cpu) {
+                /*
+                 * Since we migrate the task without holding any rq->lock,
+                 * we need to be careful with task_rq_lock(), since that
+                 * might end up locking an invalid rq.
+                 */
+                set_task_cpu(p, cpu);
+        }
+        rq = cpu_rq(cpu);
+        raw_spin_lock(&rq->lock);
+        update_rq_clock(rq);
+        /*
+         * We migrated the task without holding either rq->lock, however
+         * since the task is not on the task list itself, nobody else
+         * will try and migrate the task, hence the rq should match the
+         * cpu we just moved it to.
+         */
+        WARN_ON(task_cpu(p) != cpu);
        WARN_ON(p->state != TASK_WAKING);
-        cpu = task_cpu(p);
 #ifdef CONFIG_SCHEDSTATS
        schedstat_inc(rq, ttwu_count);
@@ -2452,8 +2496,19 @@ out_running:
        p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
-        if (p->sched_class->task_wake_up)
+        if (p->sched_class->task_woken)
-                p->sched_class->task_wake_up(rq, p);
+                p->sched_class->task_woken(rq, p);
+        if (unlikely(rq->idle_stamp)) {
+                u64 delta = rq->clock - rq->idle_stamp;
+                u64 max = 2*sysctl_sched_migration_cost;
+                if (delta > max)
+                        rq->avg_idle = max;
+                else
+                        update_avg(&rq->avg_idle, delta);
+                rq->idle_stamp = 0;
+        }
 #endif
 out:
        if (is_realtime(p))
@@ -2502,7 +2557,6 @@ static void __sched_fork(struct task_struct *p)
        p->se.avg_overlap               = 0;
        p->se.start_runtime             = 0;
        p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
-        p->se.avg_running               = 0;
 #ifdef CONFIG_SCHEDSTATS
        p->se.wait_start                        = 0;
@@ -2524,7 +2578,6 @@ static void __sched_fork(struct task_struct *p)
        p->se.nr_failed_migrations_running      = 0;
        p->se.nr_failed_migrations_hot          = 0;
        p->se.nr_forced_migrations              = 0;
-        p->se.nr_forced2_migrations             = 0;
        p->se.nr_wakeups                        = 0;
        p->se.nr_wakeups_sync                   = 0;
@@ -2545,14 +2598,6 @@ static void __sched_fork(struct task_struct *p)
 #ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
-        /*
-         * We mark the process as running here, but have not actually
-         * inserted it onto the runqueue yet. This guarantees that
-         * nobody will actually run it, and a signal or other external
-         * event cannot wake it up and insert it on the runqueue either.
-         */
-        p->state = TASK_RUNNING;
 }
 /*
@@ -2563,6 +2608,12 @@ void sched_fork(struct task_struct *p, int clone_flags)
        int cpu = get_cpu();
        __sched_fork(p);
+        /*
+         * We mark the process as waking here. This guarantees that
+         * nobody will actually run it, and a signal or other external
+         * event cannot wake it up and insert it on the runqueue either.
+         */
+        p->state = TASK_WAKING;
        /*
         * Revert to default priority/policy on fork if requested.
@@ -2594,9 +2645,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
        if (!rt_prio(p->prio))
                p->sched_class = &fair_sched_class;
-#ifdef CONFIG_SMP
+        if (p->sched_class->task_fork)
-        cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
+                p->sched_class->task_fork(p);
-#endif
        set_task_cpu(p, cpu);
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2626,28 +2677,41 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 {
        unsigned long flags;
        struct rq *rq;
+        int cpu __maybe_unused = get_cpu();
-        rq = task_rq_lock(p, &flags);
+#ifdef CONFIG_SMP
-        BUG_ON(p->state != TASK_RUNNING);
+        /*
-        update_rq_clock(rq);
+         * Fork balancing, do it here and not earlier because:
+         *  - cpus_allowed can change in the fork path
+         *  - any previously selected cpu might disappear through hotplug
+         *
+         * We still have TASK_WAKING but PF_STARTING is gone now, meaning
+         * ->cpus_allowed is stable, we have preemption disabled, meaning
+         * cpu_online_mask is stable.
+         */
+        cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
+        set_task_cpu(p, cpu);
+#endif
-        if (!p->sched_class->task_new || !current->se.on_rq) {
+        /*
-                activate_task(rq, p, 0);
+         * Since the task is not on the rq and we still have TASK_WAKING set
-        } else {
+         * nobody else will migrate this task.
-                /*
+         */
-                 * Let the scheduling class do new task startup
+        rq = cpu_rq(cpu);
-                 * management (if any):
+        raw_spin_lock_irqsave(&rq->lock, flags);
-                 */
-                p->sched_class->task_new(rq, p);
+        BUG_ON(p->state != TASK_WAKING);
-                inc_nr_running(rq);
+        p->state = TASK_RUNNING;
-        }
+        update_rq_clock(rq);
+        activate_task(rq, p, 0);
        trace_sched_wakeup_new(rq, p, 1);
        check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
-        if (p->sched_class->task_wake_up)
+        if (p->sched_class->task_woken)
-                p->sched_class->task_wake_up(rq, p);
+                p->sched_class->task_woken(rq, p);
 #endif
        task_rq_unlock(rq, &flags);
+        put_cpu();
 }
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2768,7 +2832,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
        finish_arch_switch(prev);
        litmus->finish_switch(prev);
        prev->rt_param.stack_in_use = NO_CPU;
-        perf_event_task_sched_in(current, cpu_of(rq));
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+        local_irq_disable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+        perf_event_task_sched_in(current);
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+        local_irq_enable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
        finish_lock_switch(rq, prev);
        fire_sched_in_preempt_notifiers(current);
@@ -2808,10 +2878,10 @@ static inline void post_schedule(struct rq *rq)
        if (rq->post_schedule) {
                unsigned long flags;
-                spin_lock_irqsave(&rq->lock, flags);
+                raw_spin_lock_irqsave(&rq->lock, flags);
                if (rq->curr->sched_class->post_schedule)
                        rq->curr->sched_class->post_schedule(rq);
-                spin_unlock_irqrestore(&rq->lock, flags);
+                raw_spin_unlock_irqrestore(&rq->lock, flags);
                rq->post_schedule = 0;
        }
@@ -2875,14 +2945,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
         */
        arch_start_context_switch(prev);
-        if (unlikely(!mm)) {
+        if (likely(!mm)) {
                next->active_mm = oldmm;
                atomic_inc(&oldmm->mm_count);
                enter_lazy_tlb(oldmm, next);
        } else
                switch_mm(oldmm, mm, next);
-        if (unlikely(!prev->mm)) {
+        if (likely(!prev->mm)) {
                prev->active_mm = NULL;
                rq->prev_mm = oldmm;
        }
@@ -3045,15 +3115,6 @@ static void calc_load_account_active(struct rq *this_rq)
 }
 /*
- * Externally visible per-cpu scheduler statistics:
- * cpu_nr_migrations(cpu) - number of migrations into that cpu
- */
-u64 cpu_nr_migrations(int cpu)
-{
-        return cpu_rq(cpu)->nr_migrations_in;
-}
-/*
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC).
 */
@@ -3091,65 +3152,36 @@ static void update_cpu_load(struct rq *this_rq)
 #ifdef CONFIG_SMP
 /*
- * double_rq_lock - safely lock two runqueues
+ * sched_exec - execve() is a valuable balancing opportunity, because at
- *
+ * this point the task has the smallest effective memory and cache footprint.
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static void double_rq_lock(struct rq *rq1, struct rq *rq2)
-        __acquires(rq1->lock)
-        __acquires(rq2->lock)
-{
-        BUG_ON(!irqs_disabled());
-        if (rq1 == rq2) {
-                spin_lock(&rq1->lock);
-                __acquire(rq2->lock);   /* Fake it out ;) */
-        } else {
-                if (rq1 < rq2) {
-                        spin_lock(&rq1->lock);
-                        spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
-                } else {
-                        spin_lock(&rq2->lock);
-                        spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
-                }
-        }
-        update_rq_clock(rq1);
-        update_rq_clock(rq2);
-}
-/*
- * double_rq_unlock - safely unlock two runqueues
- *
- * Note this does not restore interrupts like task_rq_unlock,
- * you need to do so manually after calling.
- */
-static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
-        __releases(rq1->lock)
-        __releases(rq2->lock)
-{
-        spin_unlock(&rq1->lock);
-        if (rq1 != rq2)
-                spin_unlock(&rq2->lock);
-        else
-                __release(rq2->lock);
-}
-/*
- * If dest_cpu is allowed for this process, migrate the task to it.
- * This is accomplished by forcing the cpu_allowed mask to only
- * allow dest_cpu, which will force the cpu onto dest_cpu. Then
- * the cpu_allowed mask is restored.
 */
-static void sched_migrate_task(struct task_struct *p, int dest_cpu)
+void sched_exec(void)
 {
+        struct task_struct *p = current;
        struct migration_req req;
+        int dest_cpu, this_cpu;
        unsigned long flags;
        struct rq *rq;
+again:
+        this_cpu = get_cpu();
+        dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
+        if (dest_cpu == this_cpu) {
+                put_cpu();
+                return;
+        }
        rq = task_rq_lock(p, &flags);
+        put_cpu();
+        /*
+         * select_task_rq() can race against ->cpus_allowed
+         */
        if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
-            || unlikely(!cpu_active(dest_cpu)))
+            || unlikely(!cpu_active(dest_cpu))) {
-                goto out;
+                task_rq_unlock(rq, &flags);
+                goto again;
+        }
        /* force the process onto the specified CPU */
        if (migrate_task(p, dest_cpu, &req)) {
@@ -3164,1784 +3196,9 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
                return;
        }
-out:
        task_rq_unlock(rq, &flags);
 }
-/*
- * sched_exec - execve() is a valuable balancing opportunity, because at
- * this point the task has the smallest effective memory and cache footprint.
- */
-void sched_exec(void)
-{
-        int new_cpu, this_cpu = get_cpu();
-        new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
-        put_cpu();
-        if (new_cpu != this_cpu)
-                sched_migrate_task(current, new_cpu);
-}
-/*
- * pull_task - move a task from a remote runqueue to the local runqueue.
- * Both runqueues must be locked.
- */
-static void pull_task(struct rq *src_rq, struct task_struct *p,
-                      struct rq *this_rq, int this_cpu)
-{
-        deactivate_task(src_rq, p, 0);
-        set_task_cpu(p, this_cpu);
-        activate_task(this_rq, p, 0);
-        /*
-         * Note that idle threads have a prio of MAX_PRIO, for this test
-         * to be always true for them.
-         */
-        check_preempt_curr(this_rq, p, 0);
-}
-/*
- * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
- */
-static
-int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
-                     struct sched_domain *sd, enum cpu_idle_type idle,
-                     int *all_pinned)
-{
-        int tsk_cache_hot = 0;
-        /*
-         * We do not migrate tasks that are:
-         * 1) running (obviously), or
-         * 2) cannot be migrated to this CPU due to cpus_allowed, or
-         * 3) are cache-hot on their current CPU.
-         */
-        if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
-                schedstat_inc(p, se.nr_failed_migrations_affine);
-                return 0;
-        }
-        *all_pinned = 0;
-        if (task_running(rq, p)) {
-                schedstat_inc(p, se.nr_failed_migrations_running);
-                return 0;
-        }
-        /*
-         * Aggressive migration if:
-         * 1) task is cache cold, or
-         * 2) too many balance attempts have failed.
-         */
-        tsk_cache_hot = task_hot(p, rq->clock, sd);
-        if (!tsk_cache_hot ||
-                sd->nr_balance_failed > sd->cache_nice_tries) {
-#ifdef CONFIG_SCHEDSTATS
-                if (tsk_cache_hot) {
-                        schedstat_inc(sd, lb_hot_gained[idle]);
-                        schedstat_inc(p, se.nr_forced_migrations);
-                }
-#endif
-                return 1;
-        }
-        if (tsk_cache_hot) {
-                schedstat_inc(p, se.nr_failed_migrations_hot);
-                return 0;
-        }
-        return 1;
-}
-static unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-              unsigned long max_load_move, struct sched_domain *sd,
-              enum cpu_idle_type idle, int *all_pinned,
-              int *this_best_prio, struct rq_iterator *iterator)
-{
-        int loops = 0, pulled = 0, pinned = 0;
-        struct task_struct *p;
-        long rem_load_move = max_load_move;
-        if (max_load_move == 0)
-                goto out;
-        pinned = 1;
-        /*
-         * Start the load-balancing iterator:
-         */
-        p = iterator->start(iterator->arg);
-next:
-        if (!p || loops++ > sysctl_sched_nr_migrate)
-                goto out;
-        if ((p->se.load.weight >> 1) > rem_load_move ||
-            !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
-                p = iterator->next(iterator->arg);
-                goto next;
-        }
-        pull_task(busiest, p, this_rq, this_cpu);
-        pulled++;
-        rem_load_move -= p->se.load.weight;
-#ifdef CONFIG_PREEMPT
-        /*
-         * NEWIDLE balancing is a source of latency, so preemptible kernels
-         * will stop after the first task is pulled to minimize the critical
-         * section.
-         */
-        if (idle == CPU_NEWLY_IDLE)
-                goto out;
-#endif
-        /*
-         * We only want to steal up to the prescribed amount of weighted load.
-         */
-        if (rem_load_move > 0) {
-                if (p->prio < *this_best_prio)
-                        *this_best_prio = p->prio;
-                p = iterator->next(iterator->arg);
-                goto next;
-        }
-out:
-        /*
-         * Right now, this is one of only two places pull_task() is called,
-         * so we can safely collect pull_task() stats here rather than
-         * inside pull_task().
-         */
-        schedstat_add(sd, lb_gained[idle], pulled);
-        if (all_pinned)
-                *all_pinned = pinned;
-        return max_load_move - rem_load_move;
-}
-/*
- * move_tasks tries to move up to max_load_move weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                      unsigned long max_load_move,
-                      struct sched_domain *sd, enum cpu_idle_type idle,
-                      int *all_pinned)
-{
-        const struct sched_class *class = sched_class_highest;
-        unsigned long total_load_moved = 0;
-        int this_best_prio = this_rq->curr->prio;
-        do {
-                total_load_moved +=
-                        class->load_balance(this_rq, this_cpu, busiest,
-                                max_load_move - total_load_moved,
-                                sd, idle, all_pinned, &this_best_prio);
-                class = class->next;
-#ifdef CONFIG_PREEMPT
-                /*
-                 * NEWIDLE balancing is a source of latency, so preemptible
-                 * kernels will stop after the first task is pulled to minimize
-                 * the critical section.
-                 */
-                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
-                        break;
-#endif
-        } while (class && max_load_move > total_load_moved);
-        return total_load_moved > 0;
-}
-static int
-iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                   struct sched_domain *sd, enum cpu_idle_type idle,
-                   struct rq_iterator *iterator)
-{
-        struct task_struct *p = iterator->start(iterator->arg);
-        int pinned = 0;
-        while (p) {
-                if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
-                        pull_task(busiest, p, this_rq, this_cpu);
-                        /*
-                         * Right now, this is only the second place pull_task()
-                         * is called, so we can safely collect pull_task()
-                         * stats here rather than inside pull_task().
-                         */
-                        schedstat_inc(sd, lb_gained[idle]);
-                        return 1;
-                }
-                p = iterator->next(iterator->arg);
-        }
-        return 0;
-}
-/*
- * move_one_task tries to move exactly one task from busiest to this_rq, as
- * part of active balancing operations within "domain".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
- */
-static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                         struct sched_domain *sd, enum cpu_idle_type idle)
-{
-        const struct sched_class *class;
-        for_each_class(class) {
-                if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
-                        return 1;
-        }
-        return 0;
-}
-/********** Helpers for find_busiest_group ************************/
-/*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- *              during load balancing.
- */
-struct sd_lb_stats {
-        struct sched_group *busiest; /* Busiest group in this sd */
-        struct sched_group *this;  /* Local group in this sd */
-        unsigned long total_load;  /* Total load of all groups in sd */
-        unsigned long total_pwr;   /*   Total power of all groups in sd */
-        unsigned long avg_load;    /* Average load across all groups in sd */
-        /** Statistics of this group */
-        unsigned long this_load;
-        unsigned long this_load_per_task;
-        unsigned long this_nr_running;
-        /* Statistics of the busiest group */
-        unsigned long max_load;
-        unsigned long busiest_load_per_task;
-        unsigned long busiest_nr_running;
-        int group_imb; /* Is there imbalance in this sd */
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-        int power_savings_balance; /* Is powersave balance needed for this sd */
-        struct sched_group *group_min; /* Least loaded group in sd */
-        struct sched_group *group_leader; /* Group which relieves group_min */
-        unsigned long min_load_per_task; /* load_per_task in group_min */
-        unsigned long leader_nr_running; /* Nr running of group_leader */
-        unsigned long min_nr_running; /* Nr running of group_min */
-#endif
-};
-/*
- * sg_lb_stats - stats of a sched_group required for load_balancing
- */
-struct sg_lb_stats {
-        unsigned long avg_load; /*Avg load across the CPUs of the group */
-        unsigned long group_load; /* Total load over the CPUs of the group */
-        unsigned long sum_nr_running; /* Nr tasks running in the group */
-        unsigned long sum_weighted_load; /* Weighted load of group's tasks */
-        unsigned long group_capacity;
-        int group_imb; /* Is there an imbalance in the group ? */
-};
-/**
- * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
- * @group: The group whose first cpu is to be returned.
- */
-static inline unsigned int group_first_cpu(struct sched_group *group)
-{
-        return cpumask_first(sched_group_cpus(group));
-}
-/**
- * get_sd_load_idx - Obtain the load index for a given sched domain.
- * @sd: The sched_domain whose load_idx is to be obtained.
- * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
- */
-static inline int get_sd_load_idx(struct sched_domain *sd,
-                                        enum cpu_idle_type idle)
-{
-        int load_idx;
-        switch (idle) {
-        case CPU_NOT_IDLE:
-                load_idx = sd->busy_idx;
-                break;
-        case CPU_NEWLY_IDLE:
-                load_idx = sd->newidle_idx;
-                break;
-        default:
-                load_idx = sd->idle_idx;
-                break;
-        }
-        return load_idx;
-}
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * init_sd_power_savings_stats - Initialize power savings statistics for
- * the given sched_domain, during load balancing.
- *
- * @sd: Sched domain whose power-savings statistics are to be initialized.
- * @sds: Variable containing the statistics for sd.
- * @idle: Idle status of the CPU at which we're performing load-balancing.
- */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
-        struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
-        /*
-         * Busy processors will not participate in power savings
-         * balance.
-         */
-        if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-                sds->power_savings_balance = 0;
-        else {
-                sds->power_savings_balance = 1;
-                sds->min_nr_running = ULONG_MAX;
-                sds->leader_nr_running = 0;
-        }
-}
-/**
- * update_sd_power_savings_stats - Update the power saving stats for a
- * sched_domain while performing load balancing.
- *
- * @group: sched_group belonging to the sched_domain under consideration.
- * @sds: Variable containing the statistics of the sched_domain
- * @local_group: Does group contain the CPU for which we're performing
- *              load balancing ?
- * @sgs: Variable containing the statistics of the group.
- */
-static inline void update_sd_power_savings_stats(struct sched_group *group,
-        struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-        if (!sds->power_savings_balance)
-                return;
-        /*
-         * If the local group is idle or completely loaded
-         * no need to do power savings balance at this domain
-         */
-        if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
-                                !sds->this_nr_running))
-                sds->power_savings_balance = 0;
-        /*
-         * If a group is already running at full capacity or idle,
-         * don't include that group in power savings calculations
-         */
-        if (!sds->power_savings_balance ||
-                sgs->sum_nr_running >= sgs->group_capacity ||
-                !sgs->sum_nr_running)
-                return;
-        /*
-         * Calculate the group which has the least non-idle load.
-         * This is the group from where we need to pick up the load
-         * for saving power
-         */
-        if ((sgs->sum_nr_running < sds->min_nr_running) ||
-            (sgs->sum_nr_running == sds->min_nr_running &&
-             group_first_cpu(group) > group_first_cpu(sds->group_min))) {
-                sds->group_min = group;
-                sds->min_nr_running = sgs->sum_nr_running;
-                sds->min_load_per_task = sgs->sum_weighted_load /
-                                                sgs->sum_nr_running;
-        }
-        /*
-         * Calculate the group which is almost near its
-         * capacity but still has some space to pick up some load
-         * from other group and save more power
-         */
-        if (sgs->sum_nr_running + 1 > sgs->group_capacity)
-                return;
-        if (sgs->sum_nr_running > sds->leader_nr_running ||
-            (sgs->sum_nr_running == sds->leader_nr_running &&
-             group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
-                sds->group_leader = group;
-                sds->leader_nr_running = sgs->sum_nr_running;
-        }
-}
-/**
- * check_power_save_busiest_group - see if there is potential for some power-savings balance
- * @sds: Variable containing the statistics of the sched_domain
- *      under consideration.
- * @this_cpu: Cpu at which we're currently performing load-balancing.
- * @imbalance: Variable to store the imbalance.
- *
- * Description:
- * Check if we have potential to perform some power-savings balance.
- * If yes, set the busiest group to be the least loaded group in the
- * sched_domain, so that it's CPUs can be put to idle.
- *
- * Returns 1 if there is potential to perform power-savings balance.
- * Else returns 0.
- */
-static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
-                                        int this_cpu, unsigned long *imbalance)
-{
-        if (!sds->power_savings_balance)
-                return 0;
-        if (sds->this != sds->group_leader ||
-                        sds->group_leader == sds->group_min)
-                return 0;
-        *imbalance = sds->min_load_per_task;
-        sds->busiest = sds->group_min;
-        return 1;
-}
-#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
-        struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
-        return;
-}
-static inline void update_sd_power_savings_stats(struct sched_group *group,
-        struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-        return;
-}
-static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
-                                        int this_cpu, unsigned long *imbalance)
-{
-        return 0;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
-{
-        return SCHED_LOAD_SCALE;
-}
-unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
-{
-        return default_scale_freq_power(sd, cpu);
-}
-unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
-{
-        unsigned long weight = cpumask_weight(sched_domain_span(sd));
-        unsigned long smt_gain = sd->smt_gain;
-        smt_gain /= weight;
-        return smt_gain;
-}
-unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
-{
-        return default_scale_smt_power(sd, cpu);
-}
-unsigned long scale_rt_power(int cpu)
-{
-        struct rq *rq = cpu_rq(cpu);
-        u64 total, available;
-        sched_avg_update(rq);
-        total = sched_avg_period() + (rq->clock - rq->age_stamp);
-        available = total - rq->rt_avg;
-        if (unlikely((s64)total < SCHED_LOAD_SCALE))
-                total = SCHED_LOAD_SCALE;
-        total >>= SCHED_LOAD_SHIFT;
-        return div_u64(available, total);
-}
-static void update_cpu_power(struct sched_domain *sd, int cpu)
-{
-        unsigned long weight = cpumask_weight(sched_domain_span(sd));
-        unsigned long power = SCHED_LOAD_SCALE;
-        struct sched_group *sdg = sd->groups;
-        if (sched_feat(ARCH_POWER))
-                power *= arch_scale_freq_power(sd, cpu);
-        else
-                power *= default_scale_freq_power(sd, cpu);
-        power >>= SCHED_LOAD_SHIFT;
-        if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
-                if (sched_feat(ARCH_POWER))
-                        power *= arch_scale_smt_power(sd, cpu);
-                else
-                        power *= default_scale_smt_power(sd, cpu);
-                power >>= SCHED_LOAD_SHIFT;
-        }
-        power *= scale_rt_power(cpu);
-        power >>= SCHED_LOAD_SHIFT;
-        if (!power)
-                power = 1;
-        sdg->cpu_power = power;
-}
-static void update_group_power(struct sched_domain *sd, int cpu)
-{
-        struct sched_domain *child = sd->child;
-        struct sched_group *group, *sdg = sd->groups;
-        unsigned long power;
-        if (!child) {
-                update_cpu_power(sd, cpu);
-                return;
-        }
-        power = 0;
-        group = child->groups;
-        do {
-                power += group->cpu_power;
-                group = group->next;
-        } while (group != child->groups);
-        sdg->cpu_power = power;
-}
-/**
- * update_sg_lb_stats - Update sched_group's statistics for load balancing.
- * @sd: The sched_domain whose statistics are to be updated.
- * @group: sched_group whose statistics are to be updated.
- * @this_cpu: Cpu for which load balance is currently performed.
- * @idle: Idle status of this_cpu
- * @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @sd_idle: Idle status of the sched_domain containing group.
- * @local_group: Does group contain this_cpu.
- * @cpus: Set of cpus considered for load balancing.
- * @balance: Should we balance.
- * @sgs: variable to hold the statistics for this group.
- */
-static inline void update_sg_lb_stats(struct sched_domain *sd,
-                        struct sched_group *group, int this_cpu,
-                        enum cpu_idle_type idle, int load_idx, int *sd_idle,
-                        int local_group, const struct cpumask *cpus,
-                        int *balance, struct sg_lb_stats *sgs)
-{
-        unsigned long load, max_cpu_load, min_cpu_load;
-        int i;
-        unsigned int balance_cpu = -1, first_idle_cpu = 0;
-        unsigned long sum_avg_load_per_task;
-        unsigned long avg_load_per_task;
-        if (local_group) {
-                balance_cpu = group_first_cpu(group);
-                if (balance_cpu == this_cpu)
-                        update_group_power(sd, this_cpu);
-        }
-        /* Tally up the load of all CPUs in the group */
-        sum_avg_load_per_task = avg_load_per_task = 0;
-        max_cpu_load = 0;
-        min_cpu_load = ~0UL;
-        for_each_cpu_and(i, sched_group_cpus(group), cpus) {
-                struct rq *rq = cpu_rq(i);
-                if (*sd_idle && rq->nr_running)
-                        *sd_idle = 0;
-                /* Bias balancing toward cpus of our domain */
-                if (local_group) {
-                        if (idle_cpu(i) && !first_idle_cpu) {
-                                first_idle_cpu = 1;
-                                balance_cpu = i;
-                        }
-                        load = target_load(i, load_idx);
-                } else {
-                        load = source_load(i, load_idx);
-                        if (load > max_cpu_load)
-                                max_cpu_load = load;
-                        if (min_cpu_load > load)
-                                min_cpu_load = load;
-                }
-                sgs->group_load += load;
-                sgs->sum_nr_running += rq->nr_running;
-                sgs->sum_weighted_load += weighted_cpuload(i);
-                sum_avg_load_per_task += cpu_avg_load_per_task(i);
-        }
-        /*
-         * First idle cpu or the first cpu(busiest) in this sched group
-         * is eligible for doing load balancing at this and above
-         * domains. In the newly idle case, we will allow all the cpu's
-         * to do the newly idle load balance.
-         */
-        if (idle != CPU_NEWLY_IDLE && local_group &&
-            balance_cpu != this_cpu && balance) {
-                *balance = 0;
-                return;
-        }
-        /* Adjust by relative CPU power of the group */
-        sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
-        /*
-         * Consider the group unbalanced when the imbalance is larger
-         * than the average weight of two tasks.
-         *
-         * APZ: with cgroup the avg task weight can vary wildly and
-         *      might not be a suitable number - should we keep a
-         *      normalized nr_running number somewhere that negates
-         *      the hierarchy?
-         */
-        avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
-                group->cpu_power;
-        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
-                sgs->group_imb = 1;
-        sgs->group_capacity =
-                DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
-}
-/**
- * update_sd_lb_stats - Update sched_group's statistics for load balancing.
- * @sd: sched_domain whose statistics are to be updated.
- * @this_cpu: Cpu for which load balance is currently performed.
- * @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing group.
- * @cpus: Set of cpus considered for load balancing.
- * @balance: Should we balance.
- * @sds: variable to hold the statistics for this sched_domain.
- */
-static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
-                        enum cpu_idle_type idle, int *sd_idle,
-                        const struct cpumask *cpus, int *balance,
-                        struct sd_lb_stats *sds)
-{
-        struct sched_domain *child = sd->child;
-        struct sched_group *group = sd->groups;
-        struct sg_lb_stats sgs;
-        int load_idx, prefer_sibling = 0;
-        if (child && child->flags & SD_PREFER_SIBLING)
-                prefer_sibling = 1;
-        init_sd_power_savings_stats(sd, sds, idle);
-        load_idx = get_sd_load_idx(sd, idle);
-        do {
-                int local_group;
-                local_group = cpumask_test_cpu(this_cpu,
-                                               sched_group_cpus(group));
-                memset(&sgs, 0, sizeof(sgs));
-                update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
-                                local_group, cpus, balance, &sgs);
-                if (local_group && balance && !(*balance))
-                        return;
-                sds->total_load += sgs.group_load;
-                sds->total_pwr += group->cpu_power;
-                /*
-                 * In case the child domain prefers tasks go to siblings
-                 * first, lower the group capacity to one so that we'll try
-                 * and move all the excess tasks away.
-                 */
-                if (prefer_sibling)
-                        sgs.group_capacity = min(sgs.group_capacity, 1UL);
-                if (local_group) {
-                        sds->this_load = sgs.avg_load;
-                        sds->this = group;
-                        sds->this_nr_running = sgs.sum_nr_running;
-                        sds->this_load_per_task = sgs.sum_weighted_load;
-                } else if (sgs.avg_load > sds->max_load &&
-                           (sgs.sum_nr_running > sgs.group_capacity ||
-                                sgs.group_imb)) {
-                        sds->max_load = sgs.avg_load;
-                        sds->busiest = group;
-                        sds->busiest_nr_running = sgs.sum_nr_running;
-                        sds->busiest_load_per_task = sgs.sum_weighted_load;
-                        sds->group_imb = sgs.group_imb;
-                }
-                update_sd_power_savings_stats(group, sds, local_group, &sgs);
-                group = group->next;
-        } while (group != sd->groups);
-}
-/**
- * fix_small_imbalance - Calculate the minor imbalance that exists
- *                      amongst the groups of a sched_domain, during
- *                      load balancing.
- * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
- * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
- * @imbalance: Variable to store the imbalance.
- */
-static inline void fix_small_imbalance(struct sd_lb_stats *sds,
-                                int this_cpu, unsigned long *imbalance)
-{
-        unsigned long tmp, pwr_now = 0, pwr_move = 0;
-        unsigned int imbn = 2;
-        if (sds->this_nr_running) {
-                sds->this_load_per_task /= sds->this_nr_running;
-                if (sds->busiest_load_per_task >
-                                sds->this_load_per_task)
-                        imbn = 1;
-        } else
-                sds->this_load_per_task =
-                        cpu_avg_load_per_task(this_cpu);
-        if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
-                        sds->busiest_load_per_task * imbn) {
-                *imbalance = sds->busiest_load_per_task;
-                return;
-        }
-        /*
-         * OK, we don't have enough imbalance to justify moving tasks,
-         * however we may be able to increase total CPU power used by
-         * moving them.
-         */
-        pwr_now += sds->busiest->cpu_power *
-                        min(sds->busiest_load_per_task, sds->max_load);
-        pwr_now += sds->this->cpu_power *
-                        min(sds->this_load_per_task, sds->this_load);
-        pwr_now /= SCHED_LOAD_SCALE;
-        /* Amount of load we'd subtract */
-        tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
-                sds->busiest->cpu_power;
-        if (sds->max_load > tmp)
-                pwr_move += sds->busiest->cpu_power *
-                        min(sds->busiest_load_per_task, sds->max_load - tmp);
-        /* Amount of load we'd add */
-        if (sds->max_load * sds->busiest->cpu_power <
-                sds->busiest_load_per_task * SCHED_LOAD_SCALE)
-                tmp = (sds->max_load * sds->busiest->cpu_power) /
-                        sds->this->cpu_power;
-        else
-                tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
-                        sds->this->cpu_power;
-        pwr_move += sds->this->cpu_power *
-                        min(sds->this_load_per_task, sds->this_load + tmp);
-        pwr_move /= SCHED_LOAD_SCALE;
-        /* Move if we gain throughput */
-        if (pwr_move > pwr_now)
-                *imbalance = sds->busiest_load_per_task;
-}
-/**
- * calculate_imbalance - Calculate the amount of imbalance present within the
- *                       groups of a given sched_domain during load balance.
- * @sds: statistics of the sched_domain whose imbalance is to be calculated.
- * @this_cpu: Cpu for which currently load balance is being performed.
- * @imbalance: The variable to store the imbalance.
- */
-static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
-                unsigned long *imbalance)
-{
-        unsigned long max_pull;
-        /*
-         * In the presence of smp nice balancing, certain scenarios can have
-         * max load less than avg load(as we skip the groups at or below
-         * its cpu_power, while calculating max_load..)
-         */
-        if (sds->max_load < sds->avg_load) {
-                *imbalance = 0;
-                return fix_small_imbalance(sds, this_cpu, imbalance);
-        }
-        /* Don't want to pull so many tasks that a group would go idle */
-        max_pull = min(sds->max_load - sds->avg_load,
-                        sds->max_load - sds->busiest_load_per_task);
-        /* How much load to actually move to equalise the imbalance */
-        *imbalance = min(max_pull * sds->busiest->cpu_power,
-                (sds->avg_load - sds->this_load) * sds->this->cpu_power)
-                        / SCHED_LOAD_SCALE;
-        /*
-         * if *imbalance is less than the average load per runnable task
-         * there is no gaurantee that any tasks will be moved so we'll have
-         * a think about bumping its value to force at least one task to be
-         * moved
-         */
-        if (*imbalance < sds->busiest_load_per_task)
-                return fix_small_imbalance(sds, this_cpu, imbalance);
-}
-/******* find_busiest_group() helpers end here *********************/
-/**
- * find_busiest_group - Returns the busiest group within the sched_domain
- * if there is an imbalance. If there isn't an imbalance, and
- * the user has opted for power-savings, it returns a group whose
- * CPUs can be put to idle by rebalancing those tasks elsewhere, if
- * such a group exists.
- *
- * Also calculates the amount of weighted load which should be moved
- * to restore balance.
- *
- * @sd: The sched_domain whose busiest group is to be returned.
- * @this_cpu: The cpu for which load balancing is currently being performed.
- * @imbalance: Variable which stores amount of weighted load which should
- *              be moved to restore balance/put a group to idle.
- * @idle: The idle status of this_cpu.
- * @sd_idle: The idleness of sd
- * @cpus: The set of CPUs under consideration for load-balancing.
- * @balance: Pointer to a variable indicating if this_cpu
- *      is the appropriate cpu to perform load balancing at this_level.
- *
- * Returns:     - the busiest group if imbalance exists.
- *              - If no imbalance and user has opted for power-savings balance,
- *                 return the least loaded group whose CPUs can be
- *                 put to idle by rebalancing its tasks onto our group.
- */
-static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
-                   unsigned long *imbalance, enum cpu_idle_type idle,
-                   int *sd_idle, const struct cpumask *cpus, int *balance)
-{
-        struct sd_lb_stats sds;
-        memset(&sds, 0, sizeof(sds));
-        /*
-         * Compute the various statistics relavent for load balancing at
-         * this level.
-         */
-        update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
-                                        balance, &sds);
-        /* Cases where imbalance does not exist from POV of this_cpu */
-        /* 1) this_cpu is not the appropriate cpu to perform load balancing
-         *    at this level.
-         * 2) There is no busy sibling group to pull from.
-         * 3) This group is the busiest group.
-         * 4) This group is more busy than the avg busieness at this
-         *    sched_domain.
-         * 5) The imbalance is within the specified limit.
-         * 6) Any rebalance would lead to ping-pong
-         */
-        if (balance && !(*balance))
-                goto ret;
-        if (!sds.busiest || sds.busiest_nr_running == 0)
-                goto out_balanced;
-        if (sds.this_load >= sds.max_load)
-                goto out_balanced;
-        sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
-        if (sds.this_load >= sds.avg_load)
-                goto out_balanced;
-        if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
-                goto out_balanced;
-        sds.busiest_load_per_task /= sds.busiest_nr_running;
-        if (sds.group_imb)
-                sds.busiest_load_per_task =
-                        min(sds.busiest_load_per_task, sds.avg_load);
-        /*
-         * We're trying to get all the cpus to the average_load, so we don't
-         * want to push ourselves above the average load, nor do we wish to
-         * reduce the max loaded cpu below the average load, as either of these
-         * actions would just result in more rebalancing later, and ping-pong
-         * tasks around. Thus we look for the minimum possible imbalance.
-         * Negative imbalances (*we* are more loaded than anyone else) will
-         * be counted as no imbalance for these purposes -- we can't fix that
-         * by pulling tasks to us. Be careful of negative numbers as they'll
-         * appear as very large values with unsigned longs.
-         */
-        if (sds.max_load <= sds.busiest_load_per_task)
-                goto out_balanced;
-        /* Looks like there is an imbalance. Compute it */
-        calculate_imbalance(&sds, this_cpu, imbalance);
-        return sds.busiest;
-out_balanced:
-        /*
-         * There is no obvious imbalance. But check if we can do some balancing
-         * to save power.
-         */
-        if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
-                return sds.busiest;
-ret:
-        *imbalance = 0;
-        return NULL;
-}
-/*
- * find_busiest_queue - find the busiest runqueue among the cpus in group.
- */
-static struct rq *
-find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-                   unsigned long imbalance, const struct cpumask *cpus)
-{
-        struct rq *busiest = NULL, *rq;
-        unsigned long max_load = 0;
-        int i;
-        for_each_cpu(i, sched_group_cpus(group)) {
-                unsigned long power = power_of(i);
-                unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
-                unsigned long wl;
-                if (!cpumask_test_cpu(i, cpus))
-                        continue;
-                rq = cpu_rq(i);
-                wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
-                wl /= power;
-                if (capacity && rq->nr_running == 1 && wl > imbalance)
-                        continue;
-                if (wl > max_load) {
-                        max_load = wl;
-                        busiest = rq;
-                }
-        }
-        return busiest;
-}
-/*
- * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
- * so long as it is large enough.
- */
-#define MAX_PINNED_INTERVAL     512
-/* Working cpumask for load_balance and load_balance_newidle. */
-static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
-/*
- * Check this_cpu to ensure it is balanced within domain. Attempt to move
- * tasks if there is an imbalance.
- */
-static int load_balance(int this_cpu, struct rq *this_rq,
-                        struct sched_domain *sd, enum cpu_idle_type idle,
-                        int *balance)
-{
-        int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
-        struct sched_group *group;
-        unsigned long imbalance;
-        struct rq *busiest;
-        unsigned long flags;
-        struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
-        cpumask_setall(cpus);
-        /*
-         * When power savings policy is enabled for the parent domain, idle
-         * sibling can pick up load irrespective of busy siblings. In this case,
-         * let the state of idle sibling percolate up as CPU_IDLE, instead of
-         * portraying it as CPU_NOT_IDLE.
-         */
-        if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                sd_idle = 1;
-        schedstat_inc(sd, lb_count[idle]);
-redo:
-        update_shares(sd);
-        group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
-                                   cpus, balance);
-        if (*balance == 0)
-                goto out_balanced;
-        if (!group) {
-                schedstat_inc(sd, lb_nobusyg[idle]);
-                goto out_balanced;
-        }
-        busiest = find_busiest_queue(group, idle, imbalance, cpus);
-        if (!busiest) {
-                schedstat_inc(sd, lb_nobusyq[idle]);
-                goto out_balanced;
-        }
-        BUG_ON(busiest == this_rq);
-        schedstat_add(sd, lb_imbalance[idle], imbalance);
-        ld_moved = 0;
-        if (busiest->nr_running > 1) {
-                /*
-                 * Attempt to move tasks. If find_busiest_group has found
-                 * an imbalance but busiest->nr_running <= 1, the group is
-                 * still unbalanced. ld_moved simply stays zero, so it is
-                 * correctly treated as an imbalance.
-                 */
-                local_irq_save(flags);
-                double_rq_lock(this_rq, busiest);
-                ld_moved = move_tasks(this_rq, this_cpu, busiest,
-                                      imbalance, sd, idle, &all_pinned);
-                double_rq_unlock(this_rq, busiest);
-                local_irq_restore(flags);
-                /*
-                 * some other cpu did the load balance for us.
-                 */
-                if (ld_moved && this_cpu != smp_processor_id())
-                        resched_cpu(this_cpu);
-                /* All tasks on this runqueue were pinned by CPU affinity */
-                if (unlikely(all_pinned)) {
-                        cpumask_clear_cpu(cpu_of(busiest), cpus);
-                        if (!cpumask_empty(cpus))
-                                goto redo;
-                        goto out_balanced;
-                }
-        }
-        if (!ld_moved) {
-                schedstat_inc(sd, lb_failed[idle]);
-                sd->nr_balance_failed++;
-                if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
-                        spin_lock_irqsave(&busiest->lock, flags);
-                        /* don't kick the migration_thread, if the curr
-                         * task on busiest cpu can't be moved to this_cpu
-                         */
-                        if (!cpumask_test_cpu(this_cpu,
-                                              &busiest->curr->cpus_allowed)) {
-                                spin_unlock_irqrestore(&busiest->lock, flags);
-                                all_pinned = 1;
-                                goto out_one_pinned;
-                        }
-                        if (!busiest->active_balance) {
-                                busiest->active_balance = 1;
-                                busiest->push_cpu = this_cpu;
-                                active_balance = 1;
-                        }
-                        spin_unlock_irqrestore(&busiest->lock, flags);
-                        if (active_balance)
-                                wake_up_process(busiest->migration_thread);
-                        /*
-                         * We've kicked active balancing, reset the failure
-                         * counter.
-                         */
-                        sd->nr_balance_failed = sd->cache_nice_tries+1;
-                }
-        } else
-                sd->nr_balance_failed = 0;
-        if (likely(!active_balance)) {
-                /* We were unbalanced, so reset the balancing interval */
-                sd->balance_interval = sd->min_interval;
-        } else {
-                /*
-                 * If we've begun active balancing, start to back off. This
-                 * case may not be covered by the all_pinned logic if there
-                 * is only 1 task on the busy runqueue (because we don't call
-                 * move_tasks).
-                 */
-                if (sd->balance_interval < sd->max_interval)
-                        sd->balance_interval *= 2;
-        }
-        if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                ld_moved = -1;
-        goto out;
-out_balanced:
-        schedstat_inc(sd, lb_balanced[idle]);
-        sd->nr_balance_failed = 0;
-out_one_pinned:
-        /* tune up the balancing interval */
-        if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
-                        (sd->balance_interval < sd->max_interval))
-                sd->balance_interval *= 2;
-        if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                ld_moved = -1;
-        else
-                ld_moved = 0;
-out:
-        if (ld_moved)
-                update_shares(sd);
-        return ld_moved;
-}
-/*
- * Check this_cpu to ensure it is balanced within domain. Attempt to move
- * tasks if there is an imbalance.
- *
- * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
- * this_rq is locked.
- */
-static int
-load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
-{
-        struct sched_group *group;
-        struct rq *busiest = NULL;
-        unsigned long imbalance;
-        int ld_moved = 0;
-        int sd_idle = 0;
-        int all_pinned = 0;
-        struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
-        cpumask_setall(cpus);
-        /*
-         * When power savings policy is enabled for the parent domain, idle
-         * sibling can pick up load irrespective of busy siblings. In this case,
-         * let the state of idle sibling percolate up as IDLE, instead of
-         * portraying it as CPU_NOT_IDLE.
-         */
-        if (sd->flags & SD_SHARE_CPUPOWER &&
-            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                sd_idle = 1;
-        schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
-redo:
-        update_shares_locked(this_rq, sd);
-        group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
-                                   &sd_idle, cpus, NULL);
-        if (!group) {
-                schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
-                goto out_balanced;
-        }
-        busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
-        if (!busiest) {
-                schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
-                goto out_balanced;
-        }
-        BUG_ON(busiest == this_rq);
-        schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
-        ld_moved = 0;
-        if (busiest->nr_running > 1) {
-                /* Attempt to move tasks */
-                double_lock_balance(this_rq, busiest);
-                /* this_rq->clock is already updated */
-                update_rq_clock(busiest);
-                ld_moved = move_tasks(this_rq, this_cpu, busiest,
-                                        imbalance, sd, CPU_NEWLY_IDLE,
-                                        &all_pinned);
-                double_unlock_balance(this_rq, busiest);
-                if (unlikely(all_pinned)) {
-                        cpumask_clear_cpu(cpu_of(busiest), cpus);
-                        if (!cpumask_empty(cpus))
-                                goto redo;
-                }
-        }
-        if (!ld_moved) {
-                int active_balance = 0;
-                schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
-                if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-                    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                        return -1;
-                if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
-                        return -1;
-                if (sd->nr_balance_failed++ < 2)
-                        return -1;
-                /*
-                 * The only task running in a non-idle cpu can be moved to this
-                 * cpu in an attempt to completely freeup the other CPU
-                 * package. The same method used to move task in load_balance()
-                 * have been extended for load_balance_newidle() to speedup
-                 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
-                 *
-                 * The package power saving logic comes from
-                 * find_busiest_group().  If there are no imbalance, then
-                 * f_b_g() will return NULL.  However when sched_mc={1,2} then
-                 * f_b_g() will select a group from which a running task may be
-                 * pulled to this cpu in order to make the other package idle.
-                 * If there is no opportunity to make a package idle and if
-                 * there are no imbalance, then f_b_g() will return NULL and no
-                 * action will be taken in load_balance_newidle().
-                 *
-                 * Under normal task pull operation due to imbalance, there
-                 * will be more than one task in the source run queue and
-                 * move_tasks() will succeed.  ld_moved will be true and this
-                 * active balance code will not be triggered.
-                 */
-                /* Lock busiest in correct order while this_rq is held */
-                double_lock_balance(this_rq, busiest);
-                /*
-                 * don't kick the migration_thread, if the curr
-                 * task on busiest cpu can't be moved to this_cpu
-                 */
-                if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
-                        double_unlock_balance(this_rq, busiest);
-                        all_pinned = 1;
-                        return ld_moved;
-                }
-                if (!busiest->active_balance) {
-                        busiest->active_balance = 1;
-                        busiest->push_cpu = this_cpu;
-                        active_balance = 1;
-                }
-                double_unlock_balance(this_rq, busiest);
-                /*
-                 * Should not call ttwu while holding a rq->lock
-                 */
-                spin_unlock(&this_rq->lock);
-                if (active_balance)
-                        wake_up_process(busiest->migration_thread);
-                spin_lock(&this_rq->lock);
-        } else
-                sd->nr_balance_failed = 0;
-        update_shares_locked(this_rq, sd);
-        return ld_moved;
-out_balanced:
-        schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
-        if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                return -1;
-        sd->nr_balance_failed = 0;
-        return 0;
-}
-/*
- * idle_balance is called by schedule() if this_cpu is about to become
- * idle. Attempts to pull tasks from other CPUs.
- */
-static void idle_balance(int this_cpu, struct rq *this_rq)
-{
-        struct sched_domain *sd;
-        int pulled_task = 0;
-        unsigned long next_balance = jiffies + HZ;
-        for_each_domain(this_cpu, sd) {
-                unsigned long interval;
-                if (!(sd->flags & SD_LOAD_BALANCE))
-                        continue;
-                if (sd->flags & SD_BALANCE_NEWIDLE)
-                        /* If we've pulled tasks over stop searching: */
-                        pulled_task = load_balance_newidle(this_cpu, this_rq,
-                                                           sd);
-                interval = msecs_to_jiffies(sd->balance_interval);
-                if (time_after(next_balance, sd->last_balance + interval))
-                        next_balance = sd->last_balance + interval;
-                if (pulled_task)
-                        break;
-        }
-        if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
-                /*
-                 * We are going idle. next_balance may be set based on
-                 * a busy processor. So reset next_balance.
-                 */
-                this_rq->next_balance = next_balance;
-        }
-}
-/*
- * active_load_balance is run by migration threads. It pushes running tasks
- * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
- * running on each physical CPU where possible, and avoids physical /
- * logical imbalances.
- *
- * Called with busiest_rq locked.
- */
-static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
-{
-        int target_cpu = busiest_rq->push_cpu;
-        struct sched_domain *sd;
-        struct rq *target_rq;
-        /* Is there any task to move? */
-        if (busiest_rq->nr_running <= 1)
-                return;
-        target_rq = cpu_rq(target_cpu);
-        /*
-         * This condition is "impossible", if it occurs
-         * we need to fix it. Originally reported by
-         * Bjorn Helgaas on a 128-cpu setup.
-         */
-        BUG_ON(busiest_rq == target_rq);
-        /* move a task from busiest_rq to target_rq */
-        double_lock_balance(busiest_rq, target_rq);
-        update_rq_clock(busiest_rq);
-        update_rq_clock(target_rq);
-        /* Search for an sd spanning us and the target CPU. */
-        for_each_domain(target_cpu, sd) {
-                if ((sd->flags & SD_LOAD_BALANCE) &&
-                    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
-                                break;
-        }
-        if (likely(sd)) {
-                schedstat_inc(sd, alb_count);
-                if (move_one_task(target_rq, target_cpu, busiest_rq,
-                                  sd, CPU_IDLE))
-                        schedstat_inc(sd, alb_pushed);
-                else
-                        schedstat_inc(sd, alb_failed);
-        }
-        double_unlock_balance(busiest_rq, target_rq);
-}
-#ifdef CONFIG_NO_HZ
-static struct {
-        atomic_t load_balancer;
-        cpumask_var_t cpu_mask;
-        cpumask_var_t ilb_grp_nohz_mask;
-} nohz ____cacheline_aligned = {
-        .load_balancer = ATOMIC_INIT(-1),
-};
-int get_nohz_load_balancer(void)
-{
-        return atomic_read(&nohz.load_balancer);
-}
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * lowest_flag_domain - Return lowest sched_domain containing flag.
- * @cpu:        The cpu whose lowest level of sched domain is to
- *              be returned.
- * @flag:       The flag to check for the lowest sched_domain
- *              for the given cpu.
- *
- * Returns the lowest sched_domain of a cpu which contains the given flag.
- */
-static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
-{
-        struct sched_domain *sd;
-        for_each_domain(cpu, sd)
-                if (sd && (sd->flags & flag))
-                        break;
-        return sd;
-}
-/**
- * for_each_flag_domain - Iterates over sched_domains containing the flag.
- * @cpu:        The cpu whose domains we're iterating over.
- * @sd:         variable holding the value of the power_savings_sd
- *              for cpu.
- * @flag:       The flag to filter the sched_domains to be iterated.
- *
- * Iterates over all the scheduler domains for a given cpu that has the 'flag'
- * set, starting from the lowest sched_domain to the highest.
- */
-#define for_each_flag_domain(cpu, sd, flag) \
-        for (sd = lowest_flag_domain(cpu, flag); \
-                (sd && (sd->flags & flag)); sd = sd->parent)
-/**
- * is_semi_idle_group - Checks if the given sched_group is semi-idle.
- * @ilb_group:  group to be checked for semi-idleness
- *
- * Returns:     1 if the group is semi-idle. 0 otherwise.
- *
- * We define a sched_group to be semi idle if it has atleast one idle-CPU
- * and atleast one non-idle CPU. This helper function checks if the given
- * sched_group is semi-idle or not.
- */
-static inline int is_semi_idle_group(struct sched_group *ilb_group)
-{
-        cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
-                                        sched_group_cpus(ilb_group));
-        /*
-         * A sched_group is semi-idle when it has atleast one busy cpu
-         * and atleast one idle cpu.
-         */
-        if (cpumask_empty(nohz.ilb_grp_nohz_mask))
-                return 0;
-        if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
-                return 0;
-        return 1;
-}
-/**
- * find_new_ilb - Finds the optimum idle load balancer for nomination.
- * @cpu:        The cpu which is nominating a new idle_load_balancer.
- *
- * Returns:     Returns the id of the idle load balancer if it exists,
- *              Else, returns >= nr_cpu_ids.
- *
- * This algorithm picks the idle load balancer such that it belongs to a
- * semi-idle powersavings sched_domain. The idea is to try and avoid
- * completely idle packages/cores just for the purpose of idle load balancing
- * when there are other idle cpu's which are better suited for that job.
- */
-static int find_new_ilb(int cpu)
-{
-        struct sched_domain *sd;
-        struct sched_group *ilb_group;
-        /*
-         * Have idle load balancer selection from semi-idle packages only
-         * when power-aware load balancing is enabled
-         */
-        if (!(sched_smt_power_savings || sched_mc_power_savings))
-                goto out_done;
-        /*
-         * Optimize for the case when we have no idle CPUs or only one
-         * idle CPU. Don't walk the sched_domain hierarchy in such cases
-         */
-        if (cpumask_weight(nohz.cpu_mask) < 2)
-                goto out_done;
-        for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
-                ilb_group = sd->groups;
-                do {
-                        if (is_semi_idle_group(ilb_group))
-                                return cpumask_first(nohz.ilb_grp_nohz_mask);
-                        ilb_group = ilb_group->next;
-                } while (ilb_group != sd->groups);
-        }
-out_done:
-        return cpumask_first(nohz.cpu_mask);
-}
-#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-static inline int find_new_ilb(int call_cpu)
-{
-        return cpumask_first(nohz.cpu_mask);
-}
-#endif
-/*
- * This routine will try to nominate the ilb (idle load balancing)
- * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus. If all the cpus in the system
- * go into this tickless mode, then there will be no ilb owner (as there is
- * no need for one) and all the cpus will sleep till the next wakeup event
- * arrives...
- *
- * For the ilb owner, tick is not stopped. And this tick will be used
- * for idle load balancing. ilb owner will still be part of
- * nohz.cpu_mask..
- *
- * While stopping the tick, this cpu will become the ilb owner if there
- * is no other owner. And will be the owner till that cpu becomes busy
- * or if all cpus in the system stop their ticks at which point
- * there is no need for ilb owner.
- *
- * When the ilb owner becomes busy, it nominates another owner, during the
- * next busy scheduler_tick()
- */
-int select_nohz_load_balancer(int stop_tick)
-{
-        int cpu = smp_processor_id();
-        if (stop_tick) {
-                cpu_rq(cpu)->in_nohz_recently = 1;
-                if (!cpu_active(cpu)) {
-                        if (atomic_read(&nohz.load_balancer) != cpu)
-                                return 0;
-                        /*
-                         * If we are going offline and still the leader,
-                         * give up!
-                         */
-                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
-                                BUG();
-                        return 0;
-                }
-                cpumask_set_cpu(cpu, nohz.cpu_mask);
-                /* time for ilb owner also to sleep */
-                if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
-                        if (atomic_read(&nohz.load_balancer) == cpu)
-                                atomic_set(&nohz.load_balancer, -1);
-                        return 0;
-                }
-                if (atomic_read(&nohz.load_balancer) == -1) {
-                        /* make me the ilb owner */
-                        if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
-                                return 1;
-                } else if (atomic_read(&nohz.load_balancer) == cpu) {
-                        int new_ilb;
-                        if (!(sched_smt_power_savings ||
-                                                sched_mc_power_savings))
-                                return 1;
-                        /*
-                         * Check to see if there is a more power-efficient
-                         * ilb.
-                         */
-                        new_ilb = find_new_ilb(cpu);
-                        if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
-                                atomic_set(&nohz.load_balancer, -1);
-                                resched_cpu(new_ilb);
-                                return 0;
-                        }
-                        return 1;
-                }
-        } else {
-                if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
-                        return 0;
-                cpumask_clear_cpu(cpu, nohz.cpu_mask);
-                if (atomic_read(&nohz.load_balancer) == cpu)
-                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
-                                BUG();
-        }
-        return 0;
-}
-#endif
-static DEFINE_SPINLOCK(balancing);
-/*
- * It checks each scheduling domain to see if it is due to be balanced,
- * and initiates a balancing operation if so.
- *
- * Balancing parameters are set up in arch_init_sched_domains.
- */
-static void rebalance_domains(int cpu, enum cpu_idle_type idle)
-{
-        int balance = 1;
-        struct rq *rq = cpu_rq(cpu);
-        unsigned long interval;
-        struct sched_domain *sd;
-        /* Earliest time when we have to do rebalance again */
-        unsigned long next_balance = jiffies + 60*HZ;
-        int update_next_balance = 0;
-        int need_serialize;
-        for_each_domain(cpu, sd) {
-                if (!(sd->flags & SD_LOAD_BALANCE))
-                        continue;
-                interval = sd->balance_interval;
-                if (idle != CPU_IDLE)
-                        interval *= sd->busy_factor;
-                /* scale ms to jiffies */
-                interval = msecs_to_jiffies(interval);
-                if (unlikely(!interval))
-                        interval = 1;
-                if (interval > HZ*NR_CPUS/10)
-                        interval = HZ*NR_CPUS/10;
-                need_serialize = sd->flags & SD_SERIALIZE;
-                if (need_serialize) {
-                        if (!spin_trylock(&balancing))
-                                goto out;
-                }
-                if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                        if (load_balance(cpu, rq, sd, idle, &balance)) {
-                                /*
-                                 * We've pulled tasks over so either we're no
-                                 * longer idle, or one of our SMT siblings is
-                                 * not idle.
-                                 */
-                                idle = CPU_NOT_IDLE;
-                        }
-                        sd->last_balance = jiffies;
-                }
-                if (need_serialize)
-                        spin_unlock(&balancing);
-out:
-                if (time_after(next_balance, sd->last_balance + interval)) {
-                        next_balance = sd->last_balance + interval;
-                        update_next_balance = 1;
-                }
-                /*
-                 * Stop the load balance at this level. There is another
-                 * CPU in our sched group which is doing load balancing more
-                 * actively.
-                 */
-                if (!balance)
-                        break;
-        }
-        /*
-         * next_balance will be updated only when there is a need.
-         * When the cpu is attached to null domain for ex, it will not be
-         * updated.
-         */
-        if (likely(update_next_balance))
-                rq->next_balance = next_balance;
-}
-/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * In CONFIG_NO_HZ case, the idle load balance owner will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
- */
-static void run_rebalance_domains(struct softirq_action *h)
-{
-        int this_cpu = smp_processor_id();
-        struct rq *this_rq = cpu_rq(this_cpu);
-        enum cpu_idle_type idle = this_rq->idle_at_tick ?
-                                                CPU_IDLE : CPU_NOT_IDLE;
-        rebalance_domains(this_cpu, idle);
-#ifdef CONFIG_NO_HZ
-        /*
-         * If this cpu is the owner for idle load balancing, then do the
-         * balancing on behalf of the other idle cpus whose ticks are
-         * stopped.
-         */
-        if (this_rq->idle_at_tick &&
-            atomic_read(&nohz.load_balancer) == this_cpu) {
-                struct rq *rq;
-                int balance_cpu;
-                for_each_cpu(balance_cpu, nohz.cpu_mask) {
-                        if (balance_cpu == this_cpu)
-                                continue;
-                        /*
-                         * If this cpu gets work to do, stop the load balancing
-                         * work being done for other cpus. Next load
-                         * balancing owner will pick it up.
-                         */
-                        if (need_resched())
-                                break;
-                        rebalance_domains(balance_cpu, CPU_IDLE);
-                        rq = cpu_rq(balance_cpu);
-                        if (time_after(this_rq->next_balance, rq->next_balance))
-                                this_rq->next_balance = rq->next_balance;
-                }
-        }
-#endif
-}
-static inline int on_null_domain(int cpu)
-{
-        return !rcu_dereference(cpu_rq(cpu)->sd);
-}
-/*
- * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- *
- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- * idle load balancing owner or decide to stop the periodic load balancing,
- * if the whole system is idle.
- */
-static inline void trigger_load_balance(struct rq *rq, int cpu)
-{
-#ifdef CONFIG_NO_HZ
-        /*
-         * If we were in the nohz mode recently and busy at the current
-         * scheduler tick, then check if we need to nominate new idle
-         * load balancer.
-         */
-        if (rq->in_nohz_recently && !rq->idle_at_tick) {
-                rq->in_nohz_recently = 0;
-                if (atomic_read(&nohz.load_balancer) == cpu) {
-                        cpumask_clear_cpu(cpu, nohz.cpu_mask);
-                        atomic_set(&nohz.load_balancer, -1);
-                }
-                if (atomic_read(&nohz.load_balancer) == -1) {
-                        int ilb = find_new_ilb(cpu);
-                        if (ilb < nr_cpu_ids)
-                                resched_cpu(ilb);
-                }
-        }
-        /*
-         * If this cpu is idle and doing idle load balancing for all the
-         * cpus with ticks stopped, is it time for that to stop?
-         */
-        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-            cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
-                resched_cpu(cpu);
-                return;
-        }
-        /*
-         * If this cpu is idle and the idle load balancing is done by
-         * someone else, then no need raise the SCHED_SOFTIRQ
-         */
-        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-            cpumask_test_cpu(cpu, nohz.cpu_mask))
-                return;
-#endif
-        /* Don't need to rebalance while attached to NULL domain */
-        if (time_after_eq(jiffies, rq->next_balance) &&
-            likely(!on_null_domain(cpu)))
-                raise_softirq(SCHED_SOFTIRQ);
-}
-#else   /* CONFIG_SMP */
-/*
- * on UP we do not need to balance between CPUs:
- */
-static inline void idle_balance(int cpu, struct rq *rq)
-{
-}
 #endif
 DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -5073,8 +3330,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
        p->gtime = cputime_add(p->gtime, cputime);
        /* Add guest time to cpustat. */
-        cpustat->user = cputime64_add(cpustat->user, tmp);
+        if (TASK_NICE(p) > 0) {
-        cpustat->guest = cputime64_add(cpustat->guest, tmp);
+                cpustat->nice = cputime64_add(cpustat->nice, tmp);
+                cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
+        } else {
+                cpustat->user = cputime64_add(cpustat->user, tmp);
+                cpustat->guest = cputime64_add(cpustat->guest, tmp);
+        }
 }
 /*
@@ -5189,60 +3451,86 @@ void account_idle_ticks(unsigned long ticks)
 * Use precise platform statistics if available:
 */
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
-cputime_t task_utime(struct task_struct *p)
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
-        return p->utime;
+        *ut = p->utime;
+        *st = p->stime;
 }
-cputime_t task_stime(struct task_struct *p)
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
-        return p->stime;
+        struct task_cputime cputime;
+        thread_group_cputime(p, &cputime);
+        *ut = cputime.utime;
+        *st = cputime.stime;
 }
 #else
-cputime_t task_utime(struct task_struct *p)
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs)      nsecs_to_jiffies(__nsecs)
+#endif
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
-        clock_t utime = cputime_to_clock_t(p->utime),
+        cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
-                total = utime + cputime_to_clock_t(p->stime);
-        u64 temp;
        /*
         * Use CFS's precise accounting:
         */
-        temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+        rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
        if (total) {
-                temp *= utime;
+                u64 temp = rtime;       /* widen to u64 before multiplying */
+                temp *= utime;
                do_div(temp, total);
-        }
+                utime = (cputime_t)temp;
-        utime = (clock_t)temp;
+        } else
+                utime = rtime;
+        /*
+         * Compare with previous values, to keep monotonicity:
+         */
+        p->prev_utime = max(p->prev_utime, utime);
+        p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
-        p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+        *ut = p->prev_utime;
-        return p->prev_utime;
+        *st = p->prev_stime;
 }
-cputime_t task_stime(struct task_struct *p)
+/*
+ * Must be called with siglock held.
+ */
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
-        clock_t stime;
+        struct signal_struct *sig = p->signal;
+        struct task_cputime cputime;
+        cputime_t rtime, utime, total;
-        /*
+        thread_group_cputime(p, &cputime);
-         * Use CFS's precise accounting. (we subtract utime from
-         * the total, to make sure the total observed by userspace
-         * grows monotonically - apps rely on that):
-         */
-        stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
-                        cputime_to_clock_t(task_utime(p));
-        if (stime >= 0)
+        total = cputime_add(cputime.utime, cputime.stime);
-                p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+        rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
-        return p->prev_stime;
+        if (total) {
-}
+                u64 temp = rtime;       /* widen to u64 before multiplying */
-#endif
-inline cputime_t task_gtime(struct task_struct *p)
+                temp *= cputime.utime;
-{
+                do_div(temp, total);
-        return p->gtime;
+                utime = (cputime_t)temp;
+        } else
+                utime = rtime;
+        sig->prev_utime = max(sig->prev_utime, utime);
+        sig->prev_stime = max(sig->prev_stime,
+                              cputime_sub(rtime, sig->prev_utime));
+        *ut = sig->prev_utime;
+        *st = sig->prev_stime;
 }
+#endif
 /*
 * This function gets called by the timer code, with HZ frequency.
@@ -5261,7 +3549,7 @@ void scheduler_tick(void)
        TS_TICK_START(current);
-        spin_lock(&rq->lock);
+        raw_spin_lock(&rq->lock);
        update_rq_clock(rq);
        update_cpu_load(rq);
        curr->sched_class->task_tick(rq, curr, 0);
@@ -5269,9 +3557,9 @@ void scheduler_tick(void)
        /* litmus_tick may force current to resched */
        litmus_tick(rq, curr);
-        spin_unlock(&rq->lock);
+        raw_spin_unlock(&rq->lock);
-        perf_event_task_tick(curr, cpu);
+        perf_event_task_tick(curr);
 #ifdef CONFIG_SMP
        rq->idle_at_tick = idle_cpu(cpu);
@@ -5385,13 +3673,14 @@ static inline void schedule_debug(struct task_struct *prev)
 #endif
 }
-static void put_prev_task(struct rq *rq, struct task_struct *p)
+static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
-        u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
+        if (prev->state == TASK_RUNNING) {
+                u64 runtime = prev->se.sum_exec_runtime;
-        update_avg(&p->se.avg_running, runtime);
+                runtime -= prev->se.prev_sum_exec_runtime;
+                runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
-        if (p->state == TASK_RUNNING) {
                /*
                 * In order to avoid avg_overlap growing stale when we are
                 * indeed overlapping and hence not getting put to sleep, grow
@@ -5401,12 +3690,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
                 * correlates to the amount of cache footprint a task can
                 * build up.
                 */
-                runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+                update_avg(&prev->se.avg_overlap, runtime);
-                update_avg(&p->se.avg_overlap, runtime);
-        } else {
-                update_avg(&p->se.avg_running, 0);
        }
-        p->sched_class->put_prev_task(rq, p);
+        prev->sched_class->put_prev_task(rq, prev);
 }
 /*
@@ -5477,7 +3763,7 @@ need_resched_nonpreemptible:
        if (sched_feat(HRTICK))
                hrtick_clear(rq);
-        spin_lock_irq(&rq->lock);
+        raw_spin_lock_irq(&rq->lock);
        update_rq_clock(rq);
        clear_tsk_need_resched(prev);
@@ -5499,7 +3785,7 @@ need_resched_nonpreemptible:
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
-                perf_event_task_sched_out(prev, next, cpu);
+                perf_event_task_sched_out(prev, next);
                rq->nr_switches++;
                rq->curr = next;
@@ -5517,7 +3803,7 @@ need_resched_nonpreemptible:
                rq = cpu_rq(cpu);
        } else {
                TS_SCHED_END(prev);
-                spin_unlock_irq(&rq->lock);
+                raw_spin_unlock_irq(&rq->lock);
        }
        sched_trace_task_switch_to(current);
@@ -5525,11 +3811,12 @@ need_resched_nonpreemptible:
        post_schedule(rq);
        if (unlikely(reacquire_kernel_lock(current) < 0)) {
+                prev = rq->curr;
+                switch_count = &prev->nivcsw;
                goto need_resched_nonpreemptible;
        }
        preempt_enable_no_resched();
        if (need_resched())
                goto need_resched;
@@ -5538,7 +3825,7 @@ need_resched_nonpreemptible:
 }
 EXPORT_SYMBOL(schedule);
-#ifdef CONFIG_SMP
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 /*
 * Look out! "owner" is an entirely speculative pointer
 * access and not reliable.
@@ -5558,7 +3845,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
         * the mutex owner just released it and exited.
         */
        if (probe_kernel_address(&owner->cpu, cpu))
-                goto out;
+                return 0;
 #else
        cpu = owner->cpu;
 #endif
@@ -5568,14 +3855,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
         * the cpu field may no longer be valid.
         */
        if (cpu >= nr_cpumask_bits)
-                goto out;
+                return 0;
        /*
         * We need to validate that we can do a
         * get_cpu() and that we have the percpu area.
         */
        if (!cpu_online(cpu))
-                goto out;
+                return 0;
        rq = cpu_rq(cpu);
@@ -5594,7 +3881,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
                cpu_relax();
        }
-out:
        return 1;
 }
 #endif
@@ -5953,14 +4240,15 @@ EXPORT_SYMBOL(wait_for_completion_killable);
 */
 bool try_wait_for_completion(struct completion *x)
 {
+        unsigned long flags;
        int ret = 1;
-        spin_lock_irq(&x->wait.lock);
+        spin_lock_irqsave(&x->wait.lock, flags);
        if (!x->done)
                ret = 0;
        else
                x->done--;
-        spin_unlock_irq(&x->wait.lock);
+        spin_unlock_irqrestore(&x->wait.lock, flags);
        return ret;
 }
 EXPORT_SYMBOL(try_wait_for_completion);
@@ -5975,12 +4263,13 @@ EXPORT_SYMBOL(try_wait_for_completion);
 */
 bool completion_done(struct completion *x)
 {
+        unsigned long flags;
        int ret = 1;
-        spin_lock_irq(&x->wait.lock);
+        spin_lock_irqsave(&x->wait.lock, flags);
        if (!x->done)
                ret = 0;
-        spin_unlock_irq(&x->wait.lock);
+        spin_unlock_irqrestore(&x->wait.lock, flags);
        return ret;
 }
 EXPORT_SYMBOL(completion_done);
@@ -6048,7 +4337,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        unsigned long flags;
        int oldprio, on_rq, running;
        struct rq *rq;
-        const struct sched_class *prev_class = p->sched_class;
+        const struct sched_class *prev_class;
        BUG_ON(prio < 0 || prio > MAX_PRIO);
@@ -6056,6 +4345,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        update_rq_clock(rq);
        oldprio = p->prio;
+        prev_class = p->sched_class;
        on_rq = p->se.on_rq;
        running = task_current(rq, p);
        if (on_rq)
@@ -6073,7 +4363,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        if (running)
                p->sched_class->set_curr_task(rq);
        if (on_rq) {
-                enqueue_task(rq, p, 0);
+                enqueue_task(rq, p, 0, oldprio < prio);
                check_class_changed(rq, p, prev_class, oldprio, running);
        }
@@ -6117,7 +4407,7 @@ void set_user_nice(struct task_struct *p, long nice)
        delta = p->prio - old_prio;
        if (on_rq) {
-                enqueue_task(rq, p, 0);
+                enqueue_task(rq, p, 0, false);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@ -6140,7 +4430,7 @@ int can_nice(const struct task_struct *p, const int nice)
        /* convert nice value [19,-20] to rlimit style value [1,40] */
        int nice_rlim = 20 - nice;
-        return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
+        return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
                capable(CAP_SYS_NICE));
 }
@@ -6243,25 +4533,16 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
        BUG_ON(p->se.on_rq);
        p->policy = policy;
-        switch (p->policy) {
-        case SCHED_NORMAL:
-        case SCHED_BATCH:
-        case SCHED_IDLE:
-                p->sched_class = &fair_sched_class;
-                break;
-        case SCHED_FIFO:
-        case SCHED_RR:
-                p->sched_class = &rt_sched_class;
-                break;
-        case SCHED_LITMUS:
-                p->sched_class = &litmus_sched_class;
-                break;
-        }
        p->rt_priority = prio;
        p->normal_prio = normal_prio(p);
        /* we are holding p->pi_lock already */
        p->prio = rt_mutex_getprio(p);
+        if (p->policy == SCHED_LITMUS)
+                p->sched_class = &litmus_sched_class;
+        else if (rt_prio(p->prio))
+                p->sched_class = &rt_sched_class;
+        else
+                p->sched_class = &fair_sched_class;
        set_load_weight(p);
 }
@@ -6286,7 +4567,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
 {
        int retval, oldprio, oldpolicy = -1, on_rq, running;
        unsigned long flags;
-        const struct sched_class *prev_class = p->sched_class;
+        const struct sched_class *prev_class;
        struct rq *rq;
        int reset_on_fork;
@@ -6330,7 +4611,7 @@ recheck:
                        if (!lock_task_sighand(p, &flags))
                                return -ESRCH;
-                        rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+                        rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
                        unlock_task_sighand(p, &flags);
                        /* can't set/change the rt policy */
@@ -6384,7 +4665,7 @@ recheck:
         * make sure no PI-waiters arrive (or leave) while we are
         * changing the priority of the task:
         */
-        spin_lock_irqsave(&p->pi_lock, flags);
+        raw_spin_lock_irqsave(&p->pi_lock, flags);
        /*
         * To be able to change p->policy safely, the apropriate
         * runqueue lock must be held.
@@ -6394,7 +4675,7 @@ recheck:
        if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                policy = oldpolicy = -1;
                __task_rq_unlock(rq);
-                spin_unlock_irqrestore(&p->pi_lock, flags);
+                raw_spin_unlock_irqrestore(&p->pi_lock, flags);
                goto recheck;
        }
        update_rq_clock(rq);
@@ -6411,6 +4692,7 @@ recheck:
                litmus_exit_task(p);
        oldprio = p->prio;
+        prev_class = p->sched_class;
        __setscheduler(rq, p, policy, param->sched_priority);
        if (policy == SCHED_LITMUS) {
@@ -6427,7 +4709,7 @@ recheck:
                check_class_changed(rq, p, prev_class, oldprio, running);
        }
        __task_rq_unlock(rq);
-        spin_unlock_irqrestore(&p->pi_lock, flags);
+        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
        rt_mutex_adjust_pi(p);
@@ -6527,7 +4809,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
                return -EINVAL;
        retval = -ESRCH;
-        read_lock(&tasklist_lock);
+        rcu_read_lock();
        p = find_process_by_pid(pid);
        if (p) {
                retval = security_task_getscheduler(p);
@@ -6535,7 +4817,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
                        retval = p->policy
                                | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
        }
-        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        return retval;
 }
@@ -6553,7 +4835,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
        if (!param || pid < 0)
                return -EINVAL;
-        read_lock(&tasklist_lock);
+        rcu_read_lock();
        p = find_process_by_pid(pid);
        retval = -ESRCH;
        if (!p)
@@ -6564,7 +4846,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
                goto out_unlock;
        lp.sched_priority = p->rt_priority;
-        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        /*
         * This one might sleep, we cannot do it with a spinlock held ...
@@ -6574,7 +4856,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
        return retval;
 out_unlock:
-        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        return retval;
 }
@@ -6585,23 +4867,19 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
        int retval;
        get_online_cpus();
-        read_lock(&tasklist_lock);
+        rcu_read_lock();
        p = find_process_by_pid(pid);
        /* Don't set affinity if task not found and for LITMUS tasks */
        if (!p || is_realtime(p)) {
-                read_unlock(&tasklist_lock);
+                rcu_read_unlock();
                put_online_cpus();
                return p ? -EPERM : -ESRCH;
        }
-        /*
+        /* Prevent p going away */
-         * It is not safe to call set_cpus_allowed with the
-         * tasklist_lock held. We will bump the task_struct's
-         * usage count and then drop tasklist_lock.
-         */
        get_task_struct(p);
-        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
                retval = -ENOMEM;
@@ -6682,10 +4960,12 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
 long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
        struct task_struct *p;
+        unsigned long flags;
+        struct rq *rq;
        int retval;
        get_online_cpus();
-        read_lock(&tasklist_lock);
+        rcu_read_lock();
        retval = -ESRCH;
        p = find_process_by_pid(pid);
@@ -6696,10 +4976,12 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
        if (retval)
                goto out_unlock;
+        rq = task_rq_lock(p, &flags);
        cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+        task_rq_unlock(rq, &flags);
 out_unlock:
-        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        put_online_cpus();
        return retval;
@@ -6717,7 +4999,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
        int ret;
        cpumask_var_t mask;
-        if (len < cpumask_size())
+        if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+                return -EINVAL;
+        if (len & (sizeof(unsigned long)-1))
                return -EINVAL;
        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
@@ -6725,10 +5009,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
        ret = sched_getaffinity(pid, mask);
        if (ret == 0) {
-                if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+                size_t retlen = min_t(size_t, len, cpumask_size());
+                if (copy_to_user(user_mask_ptr, mask, retlen))
                        ret = -EFAULT;
                else
-                        ret = cpumask_size();
+                        ret = retlen;
        }
        free_cpumask_var(mask);
@@ -6754,7 +5040,7 @@ SYSCALL_DEFINE0(sched_yield)
         */
        __release(rq->lock);
        spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
-        _raw_spin_unlock(&rq->lock);
+        do_raw_spin_unlock(&rq->lock);
        preempt_enable_no_resched();
        schedule();
@@ -6934,6 +5220,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 {
        struct task_struct *p;
        unsigned int time_slice;
+        unsigned long flags;
+        struct rq *rq;
        int retval;
        struct timespec t;
@@ -6941,7 +5229,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
                return -EINVAL;
        retval = -ESRCH;
-        read_lock(&tasklist_lock);
+        rcu_read_lock();
        p = find_process_by_pid(pid);
        if (!p)
                goto out_unlock;
@@ -6950,15 +5238,17 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
        if (retval)
                goto out_unlock;
-        time_slice = p->sched_class->get_rr_interval(p);
+        rq = task_rq_lock(p, &flags);
+        time_slice = p->sched_class->get_rr_interval(rq, p);
+        task_rq_unlock(rq, &flags);
-        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        jiffies_to_timespec(time_slice, &t);
        retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
        return retval;
 out_unlock:
-        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        return retval;
 }
@@ -7024,7 +5314,7 @@ void show_state_filter(unsigned long state_filter)
        /*
         * Only show locks if all tasks are dumped:
         */
-        if (state_filter == -1)
+        if (!state_filter)
                debug_show_all_locks();
 }
@@ -7046,12 +5336,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;
-        spin_lock_irqsave(&rq->lock, flags);
+        raw_spin_lock_irqsave(&rq->lock, flags);
        __sched_fork(idle);
+        idle->state = TASK_RUNNING;
        idle->se.exec_start = sched_clock();
-        idle->prio = idle->normal_prio = MAX_PRIO;
        cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
        __set_task_cpu(idle, cpu);
@@ -7059,7 +5349,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
        idle->oncpu = 1;
 #endif
-        spin_unlock_irqrestore(&rq->lock, flags);
+        raw_spin_unlock_irqrestore(&rq->lock, flags);
        /* Set the preempt count _outside_ the spinlocks! */
 #if defined(CONFIG_PREEMPT)
@@ -7092,22 +5382,43 @@ cpumask_var_t nohz_cpu_mask;
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
-static inline void sched_init_granularity(void)
+static int get_update_sysctl_factor(void)
 {
-        unsigned int factor = 1 + ilog2(num_online_cpus());
+        unsigned int cpus = min_t(int, num_online_cpus(), 8);
-        const unsigned long limit = 200000000;
+        unsigned int factor;
+        switch (sysctl_sched_tunable_scaling) {
+        case SCHED_TUNABLESCALING_NONE:
+                factor = 1;
+                break;
+        case SCHED_TUNABLESCALING_LINEAR:
+                factor = cpus;
+                break;
+        case SCHED_TUNABLESCALING_LOG:
+        default:
+                factor = 1 + ilog2(cpus);
+                break;
+        }
-        sysctl_sched_min_granularity *= factor;
+        return factor;
-        if (sysctl_sched_min_granularity > limit)
+}
-                sysctl_sched_min_granularity = limit;
-        sysctl_sched_latency *= factor;
+static void update_sysctl(void)
-        if (sysctl_sched_latency > limit)
+{
-                sysctl_sched_latency = limit;
+        unsigned int factor = get_update_sysctl_factor();
-        sysctl_sched_wakeup_granularity *= factor;
+#define SET_SYSCTL(name) \
+        (sysctl_##name = (factor) * normalized_sysctl_##name)
+        SET_SYSCTL(sched_min_granularity);
+        SET_SYSCTL(sched_latency);
+        SET_SYSCTL(sched_wakeup_granularity);
+        SET_SYSCTL(sched_shares_ratelimit);
+#undef SET_SYSCTL
+}
-        sysctl_sched_shares_ratelimit *= factor;
+static inline void sched_init_granularity(void)
+{
+        update_sysctl();
 }
 #ifdef CONFIG_SMP
@@ -7144,7 +5455,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
        int ret = 0;
        rq = task_rq_lock(p, &flags);
-        if (!cpumask_intersects(new_mask, cpu_online_mask)) {
+        if (!cpumask_intersects(new_mask, cpu_active_mask)) {
                ret = -EINVAL;
                goto out;
        }
@@ -7166,13 +5478,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
        if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
-        if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
+        if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
                /* Need help from migration thread: drop lock and wait. */
                struct task_struct *mt = rq->migration_thread;
                get_task_struct(mt);
                task_rq_unlock(rq, &flags);
-                wake_up_process(rq->migration_thread);
+                wake_up_process(mt);
                put_task_struct(mt);
                wait_for_completion(&req.done);
                tlb_migrate_finish(p->mm);
@@ -7199,7 +5511,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
        struct rq *rq_dest, *rq_src;
-        int ret = 0, on_rq;
+        int ret = 0;
        if (unlikely(!cpu_active(dest_cpu)))
                return ret;
@@ -7215,12 +5527,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
        if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                goto fail;
-        on_rq = p->se.on_rq;
+        /*
-        if (on_rq)
+         * If we're not on a rq, the next wake-up will ensure we're
+         * placed properly.
+         */
+        if (p->se.on_rq) {
                deactivate_task(rq_src, p, 0);
+                set_task_cpu(p, dest_cpu);
-        set_task_cpu(p, dest_cpu);
-        if (on_rq) {
                activate_task(rq_dest, p, 0);
                check_preempt_curr(rq_dest, p, 0);
        }
@@ -7255,10 +5568,10 @@ static int migration_thread(void *data)
                struct migration_req *req;
                struct list_head *head;
-                spin_lock_irq(&rq->lock);
+                raw_spin_lock_irq(&rq->lock);
                if (cpu_is_offline(cpu)) {
-                        spin_unlock_irq(&rq->lock);
+                        raw_spin_unlock_irq(&rq->lock);
                        break;
                }
@@ -7270,7 +5583,7 @@ static int migration_thread(void *data)
                head = &rq->migration_queue;
                if (list_empty(head)) {
-                        spin_unlock_irq(&rq->lock);
+                        raw_spin_unlock_irq(&rq->lock);
                        schedule();
                        set_current_state(TASK_INTERRUPTIBLE);
                        continue;
@@ -7279,14 +5592,14 @@ static int migration_thread(void *data)
                list_del_init(head->next);
                if (req->task != NULL) {
-                        spin_unlock(&rq->lock);
+                        raw_spin_unlock(&rq->lock);
                        __migrate_task(req->task, cpu, req->dest_cpu);
                } else if (likely(cpu == (badcpu = smp_processor_id()))) {
                        req->dest_cpu = RCU_MIGRATION_GOT_QS;
-                        spin_unlock(&rq->lock);
+                        raw_spin_unlock(&rq->lock);
                } else {
                        req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
-                        spin_unlock(&rq->lock);
+                        raw_spin_unlock(&rq->lock);
                        WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
                }
                local_irq_enable();
@@ -7316,37 +5629,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
        int dest_cpu;
-        const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
 again:
-        /* Look for allowed, online CPU in same node. */
+        dest_cpu = select_fallback_rq(dead_cpu, p);
-        for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
-                if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
-                        goto move;
-        /* Any allowed, online CPU? */
-        dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
-        if (dest_cpu < nr_cpu_ids)
-                goto move;
-        /* No more Mr. Nice Guy. */
-        if (dest_cpu >= nr_cpu_ids) {
-                cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
-                dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
-                /*
-                 * Don't tell them about moving exiting tasks or
-                 * kernel threads (both mm NULL), since they never
-                 * leave kernel.
-                 */
-                if (p->mm && printk_ratelimit()) {
-                        printk(KERN_INFO "process %d (%s) no "
-                               "longer affine to cpu%d\n",
-                               task_pid_nr(p), p->comm, dead_cpu);
-                }
-        }
-move:
        /* It can have affinity changed while we were choosing. */
        if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
                goto again;
@@ -7361,7 +5647,7 @@ move:
 */
 static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
-        struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
+        struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
        unsigned long flags;
        local_irq_save(flags);
@@ -7409,14 +5695,14 @@ void sched_idle_next(void)
         * Strictly not necessary since rest of the CPUs are stopped by now
         * and interrupts disabled on the current cpu.
         */
-        spin_lock_irqsave(&rq->lock, flags);
+        raw_spin_lock_irqsave(&rq->lock, flags);
        __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
        update_rq_clock(rq);
        activate_task(rq, p, 0);
-        spin_unlock_irqrestore(&rq->lock, flags);
+        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 /*
@@ -7452,9 +5738,9 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
         * that's OK. No task can be added to this CPU, so iteration is
         * fine.
         */
-        spin_unlock_irq(&rq->lock);
+        raw_spin_unlock_irq(&rq->lock);
        move_task_off_dead_cpu(dead_cpu, p);
-        spin_lock_irq(&rq->lock);
+        raw_spin_lock_irq(&rq->lock);
        put_task_struct(p);
 }
@@ -7495,17 +5781,16 @@ static struct ctl_table sd_ctl_dir[] = {
                .procname       = "sched_domain",
                .mode           = 0555,
        },
-        {0, },
+        {}
 };
 static struct ctl_table sd_ctl_root[] = {
        {
-                .ctl_name       = CTL_KERN,
                .procname       = "kernel",
                .mode           = 0555,
                .child          = sd_ctl_dir,
        },
-        {0, },
+        {}
 };
 static struct ctl_table *sd_alloc_ctl_entry(int n)
@@ -7615,7 +5900,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
 static struct ctl_table_header *sd_sysctl_header;
 static void register_sched_domain_sysctl(void)
 {
-        int i, cpu_num = num_online_cpus();
+        int i, cpu_num = num_possible_cpus();
        struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
        char buf[32];
@@ -7625,7 +5910,7 @@ static void register_sched_domain_sysctl(void)
        if (entry == NULL)
                return;
-        for_each_online_cpu(i) {
+        for_each_possible_cpu(i) {
                snprintf(buf, 32, "cpu%d", i);
                entry->procname = kstrdup(buf, GFP_KERNEL);
                entry->mode = 0555;
@@ -7721,13 +6006,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                /* Update our root-domain */
                rq = cpu_rq(cpu);
-                spin_lock_irqsave(&rq->lock, flags);
+                raw_spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                        set_rq_online(rq);
                }
-                spin_unlock_irqrestore(&rq->lock, flags);
+                raw_spin_unlock_irqrestore(&rq->lock, flags);
                break;
 #ifdef CONFIG_HOTPLUG_CPU
@@ -7752,14 +6037,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                put_task_struct(rq->migration_thread);
                rq->migration_thread = NULL;
                /* Idle task back to normal (off runqueue, low prio) */
-                spin_lock_irq(&rq->lock);
+                raw_spin_lock_irq(&rq->lock);
                update_rq_clock(rq);
                deactivate_task(rq, rq->idle, 0);
-                rq->idle->static_prio = MAX_PRIO;
                __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
                rq->idle->sched_class = &idle_sched_class;
                migrate_dead_tasks(cpu);
-                spin_unlock_irq(&rq->lock);
+                raw_spin_unlock_irq(&rq->lock);
                cpuset_unlock();
                migrate_nr_uninterruptible(rq);
                BUG_ON(rq->nr_running != 0);
@@ -7769,30 +6053,30 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 * they didn't take sched_hotcpu_mutex. Just wake up
                 * the requestors.
                 */
-                spin_lock_irq(&rq->lock);
+                raw_spin_lock_irq(&rq->lock);
                while (!list_empty(&rq->migration_queue)) {
                        struct migration_req *req;
                        req = list_entry(rq->migration_queue.next,
                                         struct migration_req, list);
                        list_del_init(&req->list);
-                        spin_unlock_irq(&rq->lock);
+                        raw_spin_unlock_irq(&rq->lock);
                        complete(&req->done);
-                        spin_lock_irq(&rq->lock);
+                        raw_spin_lock_irq(&rq->lock);
                }
-                spin_unlock_irq(&rq->lock);
+                raw_spin_unlock_irq(&rq->lock);
                break;
        case CPU_DYING:
        case CPU_DYING_FROZEN:
                /* Update our root-domain */
                rq = cpu_rq(cpu);
-                spin_lock_irqsave(&rq->lock, flags);
+                raw_spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                        set_rq_offline(rq);
                }
-                spin_unlock_irqrestore(&rq->lock, flags);
+                raw_spin_unlock_irqrestore(&rq->lock, flags);
                break;
 #endif
        }
@@ -7829,6 +6113,16 @@ early_initcall(migration_init);
 #ifdef CONFIG_SCHED_DEBUG
+/* Checked by sched_domain_debug(), which returns early while this is 0. */
+static __read_mostly int sched_domain_debug_enabled;
+/*
+ * Handler for the "sched_debug" early parameter: switches the flag above
+ * on.  The parameter value passed in @str is not examined.  Always
+ * returns 0.
+ */
+static int __init sched_domain_debug_setup(char *str)
+{
+        sched_domain_debug_enabled = 1;
+        return 0;
+}
+early_param("sched_debug", sched_domain_debug_setup);
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                  struct cpumask *groupmask)
 {
@@ -7915,6 +6209,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
        cpumask_var_t groupmask;
        int level = 0;
+        if (!sched_domain_debug_enabled)
+                return;
        if (!sd) {
                printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
                return;
@@ -7994,6 +6291,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 static void free_rootdomain(struct root_domain *rd)
 {
+        synchronize_sched();
        cpupri_cleanup(&rd->cpupri);
        free_cpumask_var(rd->rto_mask);
@@ -8007,7 +6306,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
        struct root_domain *old_rd = NULL;
        unsigned long flags;
-        spin_lock_irqsave(&rq->lock, flags);
+        raw_spin_lock_irqsave(&rq->lock, flags);
        if (rq->rd) {
                old_rd = rq->rd;
@@ -8033,7 +6332,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
        if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
                set_rq_online(rq);
-        spin_unlock_irqrestore(&rq->lock, flags);
+        raw_spin_unlock_irqrestore(&rq->lock, flags);
        if (old_rd)
                free_rootdomain(old_rd);
@@ -8134,6 +6433,7 @@ static cpumask_var_t cpu_isolated_map;
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
+        alloc_bootmem_cpumask_var(&cpu_isolated_map);
        cpulist_parse(str, cpu_isolated_map);
        return 1;
 }
@@ -8318,14 +6618,14 @@ enum s_alloc {
 */
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
+static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
 static int
 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
                 struct sched_group **sg, struct cpumask *unused)
 {
        if (sg)
-                *sg = &per_cpu(sched_group_cpus, cpu).sg;
+                *sg = &per_cpu(sched_groups, cpu).sg;
        return cpu;
 }
 #endif /* CONFIG_SCHED_SMT */
@@ -8970,7 +7270,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
        return __build_sched_domains(cpu_map, NULL);
 }
-static struct cpumask *doms_cur;        /* current sched domains */
+static cpumask_var_t *doms_cur; /* current sched domains */
 static int ndoms_cur;           /* number of sched domains in 'doms_cur' */
 static struct sched_domain_attr *dattr_cur;
                                /* attribues of custom domains in 'doms_cur' */
@@ -8992,6 +7292,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
        return 0;
 }
+/*
+ * alloc_sched_domains - allocate an array of @ndoms sched-domain cpumasks.
+ *
+ * Returns the array on success.  Returns NULL if the array itself or any
+ * of its cpumasks cannot be allocated; in that case everything allocated
+ * up to the failure has already been released.  A successful result is
+ * released with free_sched_domains().
+ */
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+        unsigned int i; /* unsigned, like ndoms and free_sched_domains() */
+        cpumask_var_t *doms;
+        /* kcalloc() fails cleanly if ndoms * sizeof(*doms) would overflow */
+        doms = kcalloc(ndoms, sizeof(*doms), GFP_KERNEL);
+        if (!doms)
+                return NULL;
+        for (i = 0; i < ndoms; i++) {
+                if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+                        /* only the first i masks exist at this point */
+                        free_sched_domains(doms, i);
+                        return NULL;
+                }
+        }
+        return doms;
+}
+/*
+ * free_sched_domains - release the first @ndoms cpumasks held in @doms,
+ * then the array itself.
+ */
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+        unsigned int idx = 0;
+        /* the masks go first; the array holding them is freed last */
+        while (idx < ndoms) {
+                free_cpumask_var(doms[idx]);
+                idx++;
+        }
+        kfree(doms);
+}
 /*
 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
 * For now this just excludes isolated cpus, but could be used to
@@ -9003,12 +7328,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
        arch_update_cpu_topology();
        ndoms_cur = 1;
-        doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
+        doms_cur = alloc_sched_domains(ndoms_cur);
        if (!doms_cur)
-                doms_cur = fallback_doms;
+                doms_cur = &fallback_doms;
-        cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
+        cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
        dattr_cur = NULL;
-        err = build_sched_domains(doms_cur);
+        err = build_sched_domains(doms_cur[0]);
        register_sched_domain_sysctl();
        return err;
@@ -9058,19 +7383,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
 * doms_new[] to the current sched domain partitioning, doms_cur[].
 * It destroys each deleted domain and builds each new domain.
 *
- * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
 * The masks don't intersect (don't overlap.) We should setup one
 * sched domain for each mask. CPUs not in any of the cpumasks will
 * not be load balanced. If the same cpumask appears both in the
 * current 'doms_cur' domains and in the new 'doms_new', we can leave
 * it as it is.
 *
- * The passed in 'doms_new' should be kmalloc'd. This routine takes
+ * The passed in 'doms_new' should be allocated using
- * ownership of it and will kfree it when done with it. If the caller
+ * alloc_sched_domains.  This routine takes ownership of it and will
- * failed the kmalloc call, then it can pass in doms_new == NULL &&
+ * free_sched_domains it when done with it. If the caller failed the
- * ndoms_new == 1, and partition_sched_domains() will fallback to
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
- * the single partition 'fallback_doms', it also forces the domains
+ * and partition_sched_domains() will fall back to the single partition
- * to be rebuilt.
+ * 'fallback_doms', it also forces the domains to be rebuilt.
 *
 * If doms_new == NULL it will be replaced with cpu_online_mask.
 * ndoms_new == 0 is a special case for destroying existing domains,
@@ -9078,8 +7403,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
 *
 * Call with hotplug lock held
 */
-/* FIXME: Change to struct cpumask *doms_new[] */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                             struct sched_domain_attr *dattr_new)
 {
        int i, j, n;
@@ -9098,40 +7422,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
                for (j = 0; j < n && !new_topology; j++) {
-                        if (cpumask_equal(&doms_cur[i], &doms_new[j])
+                        if (cpumask_equal(doms_cur[i], doms_new[j])
                            && dattrs_equal(dattr_cur, i, dattr_new, j))
                                goto match1;
                }
                /* no match - a current sched domain not in new doms_new[] */
-                detach_destroy_domains(doms_cur + i);
+                detach_destroy_domains(doms_cur[i]);
 match1:
                ;
        }
        if (doms_new == NULL) {
                ndoms_cur = 0;
-                doms_new = fallback_doms;
+                doms_new = &fallback_doms;
-                cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
+                cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
                WARN_ON_ONCE(dattr_new);
        }
        /* Build new domains */
        for (i = 0; i < ndoms_new; i++) {
                for (j = 0; j < ndoms_cur && !new_topology; j++) {
-                        if (cpumask_equal(&doms_new[i], &doms_cur[j])
+                        if (cpumask_equal(doms_new[i], doms_cur[j])
                            && dattrs_equal(dattr_new, i, dattr_cur, j))
                                goto match2;
                }
                /* no match - add a new doms_new */
-                __build_sched_domains(doms_new + i,
+                __build_sched_domains(doms_new[i],
                                        dattr_new ? dattr_new + i : NULL);
 match2:
                ;
        }
        /* Remember the new sched domains */
-        if (doms_cur != fallback_doms)
+        if (doms_cur != &fallback_doms)
-                kfree(doms_cur);
+                free_sched_domains(doms_cur, ndoms_cur);
        kfree(dattr_cur);       /* kfree(NULL) is safe */
        doms_cur = doms_new;
        dattr_cur = dattr_new;
@@ -9183,11 +7507,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 #ifdef CONFIG_SCHED_MC
 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
+                                           struct sysdev_class_attribute *attr,
                                           char *page)
 {
        return sprintf(page, "%u\n", sched_mc_power_savings);
 }
 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
+                                            struct sysdev_class_attribute *attr,
                                            const char *buf, size_t count)
 {
        return sched_power_savings_store(buf, count, 0);
@@ -9199,11 +7525,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
 #ifdef CONFIG_SCHED_SMT
 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
+                                            struct sysdev_class_attribute *attr,
                                            char *page)
 {
        return sprintf(page, "%u\n", sched_smt_power_savings);
 }
 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
+                                             struct sysdev_class_attribute *attr,
                                             const char *buf, size_t count)
 {
        return sched_power_savings_store(buf, count, 1);
@@ -9242,8 +7570,10 @@ static int update_sched_domains(struct notifier_block *nfb,
        switch (action) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
-        case CPU_DEAD:
+        case CPU_DOWN_PREPARE:
-        case CPU_DEAD_FROZEN:
+        case CPU_DOWN_PREPARE_FROZEN:
+        case CPU_DOWN_FAILED:
+        case CPU_DOWN_FAILED_FROZEN:
                partition_sched_domains(1, NULL, NULL);
                return NOTIFY_OK;
@@ -9290,7 +7620,7 @@ void __init sched_init_smp(void)
 #endif
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
-        arch_init_sched_domains(cpu_online_mask);
+        arch_init_sched_domains(cpu_active_mask);
        cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
        if (cpumask_empty(non_isolated_cpus))
                cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -9363,13 +7693,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 #ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
-        plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
+        plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
 #endif
        rt_rq->rt_time = 0;
        rt_rq->rt_throttled = 0;
        rt_rq->rt_runtime = 0;
-        spin_lock_init(&rt_rq->rt_runtime_lock);
+        raw_spin_lock_init(&rt_rq->rt_runtime_lock);
 #ifdef CONFIG_RT_GROUP_SCHED
        rt_rq->rt_nr_boosted = 0;
@@ -9416,7 +7746,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
        tg->rt_rq[cpu] = rt_rq;
        init_rt_rq(rt_rq, rq);
        rt_rq->tg = tg;
-        rt_rq->rt_se = rt_se;
        rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
        if (add)
                list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9447,16 +7776,9 @@ void __init sched_init(void)
 #ifdef CONFIG_RT_GROUP_SCHED
        alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 #endif
-#ifdef CONFIG_USER_SCHED
-        alloc_size *= 2;
-#endif
 #ifdef CONFIG_CPUMASK_OFFSTACK
        alloc_size += num_possible_cpus() * cpumask_size();
 #endif
-        /*
-         * As sched_init() is called before page_alloc is setup,
-         * we use alloc_bootmem().
-         */
        if (alloc_size) {
                ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
@@ -9467,13 +7789,6 @@ void __init sched_init(void)
                init_task_group.cfs_rq = (struct cfs_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
-#ifdef CONFIG_USER_SCHED
-                root_task_group.se = (struct sched_entity **)ptr;
-                ptr += nr_cpu_ids * sizeof(void **);
-                root_task_group.cfs_rq = (struct cfs_rq **)ptr;
-                ptr += nr_cpu_ids * sizeof(void **);
-#endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
                init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9482,13 +7797,6 @@ void __init sched_init(void)
                init_task_group.rt_rq = (struct rt_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
-#ifdef CONFIG_USER_SCHED
-                root_task_group.rt_se = (struct sched_rt_entity **)ptr;
-                ptr += nr_cpu_ids * sizeof(void **);
-                root_task_group.rt_rq = (struct rt_rq **)ptr;
-                ptr += nr_cpu_ids * sizeof(void **);
-#endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_CPUMASK_OFFSTACK
                for_each_possible_cpu(i) {
@@ -9508,22 +7816,13 @@ void __init sched_init(void)
 #ifdef CONFIG_RT_GROUP_SCHED
        init_rt_bandwidth(&init_task_group.rt_bandwidth,
                        global_rt_period(), global_rt_runtime());
-#ifdef CONFIG_USER_SCHED
-        init_rt_bandwidth(&root_task_group.rt_bandwidth,
-                        global_rt_period(), RUNTIME_INF);
-#endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_RT_GROUP_SCHED */
-#ifdef CONFIG_GROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
        list_add(&init_task_group.list, &task_groups);
        INIT_LIST_HEAD(&init_task_group.children);
-#ifdef CONFIG_USER_SCHED
+#endif /* CONFIG_CGROUP_SCHED */
-        INIT_LIST_HEAD(&root_task_group.children);
-        init_task_group.parent = &root_task_group;
-        list_add(&init_task_group.siblings, &root_task_group.children);
-#endif /* CONFIG_USER_SCHED */
-#endif /* CONFIG_GROUP_SCHED */
 #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
        update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9533,7 +7832,7 @@ void __init sched_init(void)
                struct rq *rq;
                rq = cpu_rq(i);
-                spin_lock_init(&rq->lock);
+                raw_spin_lock_init(&rq->lock);
                rq->nr_running = 0;
                rq->calc_load_active = 0;
                rq->calc_load_update = jiffies + LOAD_FREQ;
@@ -9563,25 +7862,6 @@ void __init sched_init(void)
                 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
                 */
                init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
-#elif defined CONFIG_USER_SCHED
-                root_task_group.shares = NICE_0_LOAD;
-                init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
-                /*
-                 * In case of task-groups formed thr' the user id of tasks,
-                 * init_task_group represents tasks belonging to root user.
-                 * Hence it forms a sibling of all subsequent groups formed.
-                 * In this case, init_task_group gets only a fraction of overall
-                 * system cpu resource, based on the weight assigned to root
-                 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
-                 * by letting tasks of init_task_group sit in a separate cfs_rq
-                 * (init_tg_cfs_rq) and having one entity represent this group of
-                 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
-                 */
-                init_tg_cfs_entry(&init_task_group,
-                                &per_cpu(init_tg_cfs_rq, i),
-                                &per_cpu(init_sched_entity, i), i, 1,
-                                root_task_group.se[i]);
 #endif
 #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -9590,12 +7870,6 @@ void __init sched_init(void)
                INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
 #ifdef CONFIG_CGROUP_SCHED
                init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
-#elif defined CONFIG_USER_SCHED
-                init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
-                init_tg_rt_entry(&init_task_group,
-                                &per_cpu(init_rt_rq, i),
-                                &per_cpu(init_sched_rt_entity, i), i, 1,
-                                root_task_group.rt_se[i]);
 #endif
 #endif
@@ -9611,6 +7885,8 @@ void __init sched_init(void)
                rq->cpu = i;
                rq->online = 0;
                rq->migration_thread = NULL;
+                rq->idle_stamp = 0;
+                rq->avg_idle = 2*sysctl_sched_migration_cost;
                INIT_LIST_HEAD(&rq->migration_queue);
                rq_attach_root(rq, &def_root_domain);
 #endif
@@ -9629,7 +7905,7 @@ void __init sched_init(void)
 #endif
 #ifdef CONFIG_RT_MUTEXES
-        plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
+        plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
 #endif
        /*
@@ -9660,7 +7936,9 @@ void __init sched_init(void)
        zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
        alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
 #endif
-        zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+        /* May be allocated at isolcpus cmdline parse time */
+        if (cpu_isolated_map == NULL)
+                zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
        perf_event_init();
@@ -9671,12 +7949,12 @@ void __init sched_init(void)
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-        int nested = preempt_count() & ~PREEMPT_ACTIVE;
+        int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
        return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
 }
-void __might_sleep(char *file, int line, int preempt_offset)
+void __might_sleep(const char *file, int line, int preempt_offset)
 {
 #ifdef in_atomic
        static unsigned long prev_jiffy;        /* ratelimiting */
@@ -9752,13 +8030,13 @@ void normalize_rt_tasks(void)
                        continue;
                }
-                spin_lock(&p->pi_lock);
+                raw_spin_lock(&p->pi_lock);
                rq = __task_rq_lock(p);
                normalize_task(rq, p);
                __task_rq_unlock(rq);
-                spin_unlock(&p->pi_lock);
+                raw_spin_unlock(&p->pi_lock);
        } while_each_thread(g, p);
        read_unlock_irqrestore(&tasklist_lock, flags);
@@ -9854,13 +8132,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                se = kzalloc_node(sizeof(struct sched_entity),
                                  GFP_KERNEL, cpu_to_node(i));
                if (!se)
-                        goto err;
+                        goto err_free_rq;
                init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
        }
        return 1;
+ err_free_rq:
+        kfree(cfs_rq);
 err:
        return 0;
 }
@@ -9942,13 +8222,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
                                     GFP_KERNEL, cpu_to_node(i));
                if (!rt_se)
-                        goto err;
+                        goto err_free_rq;
                init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
        }
        return 1;
+ err_free_rq:
+        kfree(rt_rq);
 err:
        return 0;
 }
@@ -9983,7 +8265,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
-#ifdef CONFIG_GROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
 static void free_sched_group(struct task_group *tg)
 {
        free_fair_sched_group(tg);
@@ -10082,17 +8364,17 @@ void sched_move_task(struct task_struct *tsk)
 #ifdef CONFIG_FAIR_GROUP_SCHED
        if (tsk->sched_class->moved_group)
-                tsk->sched_class->moved_group(tsk);
+                tsk->sched_class->moved_group(tsk, on_rq);
 #endif
        if (unlikely(running))
                tsk->sched_class->set_curr_task(rq);
        if (on_rq)
-                enqueue_task(rq, tsk, 0);
+                enqueue_task(rq, tsk, 0, false);
        task_rq_unlock(rq, &flags);
 }
-#endif /* CONFIG_GROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10117,9 +8399,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
        struct rq *rq = cfs_rq->rq;
        unsigned long flags;
-        spin_lock_irqsave(&rq->lock, flags);
+        raw_spin_lock_irqsave(&rq->lock, flags);
        __set_se_shares(se, shares);
-        spin_unlock_irqrestore(&rq->lock, flags);
+        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 static DEFINE_MUTEX(shares_mutex);
@@ -10234,13 +8516,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
                runtime = d->rt_runtime;
        }
-#ifdef CONFIG_USER_SCHED
-        if (tg == &root_task_group) {
-                period = global_rt_period();
-                runtime = global_rt_runtime();
-        }
-#endif
        /*
         * Cannot have more runtime than the period.
         */
@@ -10304,18 +8579,18 @@ static int tg_set_bandwidth(struct task_group *tg,
        if (err)
                goto unlock;
-        spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+        raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
        tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
        tg->rt_bandwidth.rt_runtime = rt_runtime;
        for_each_possible_cpu(i) {
                struct rt_rq *rt_rq = tg->rt_rq[i];
-                spin_lock(&rt_rq->rt_runtime_lock);
+                raw_spin_lock(&rt_rq->rt_runtime_lock);
                rt_rq->rt_runtime = rt_runtime;
-                spin_unlock(&rt_rq->rt_runtime_lock);
+                raw_spin_unlock(&rt_rq->rt_runtime_lock);
        }
-        spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+        raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 unlock:
        read_unlock(&tasklist_lock);
        mutex_unlock(&rt_constraints_mutex);
@@ -10420,15 +8695,15 @@ static int sched_rt_global_constraints(void)
        if (sysctl_sched_rt_runtime == 0)
                return -EBUSY;
-        spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
+        raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
        for_each_possible_cpu(i) {
                struct rt_rq *rt_rq = &cpu_rq(i)->rt;
-                spin_lock(&rt_rq->rt_runtime_lock);
+                raw_spin_lock(&rt_rq->rt_runtime_lock);
                rt_rq->rt_runtime = global_rt_runtime();
-                spin_unlock(&rt_rq->rt_runtime_lock);
+                raw_spin_unlock(&rt_rq->rt_runtime_lock);
        }
-        spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
+        raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
        return 0;
 }
@@ -10643,7 +8918,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 struct cpuacct {
        struct cgroup_subsys_state css;
        /* cpuusage holds pointer to a u64-type object on every cpu */
-        u64 *cpuusage;
+        u64 __percpu *cpuusage;
        struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
        struct cpuacct *parent;
 };
@@ -10719,9 +8994,9 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
        /*
         * Take rq->lock to make 64-bit read safe on 32-bit platforms.
         */
-        spin_lock_irq(&cpu_rq(cpu)->lock);
+        raw_spin_lock_irq(&cpu_rq(cpu)->lock);
        data = *cpuusage;
-        spin_unlock_irq(&cpu_rq(cpu)->lock);
+        raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
 #else
        data = *cpuusage;
 #endif
@@ -10737,9 +9012,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
        /*
         * Take rq->lock to make 64-bit write safe on 32-bit platforms.
         */
-        spin_lock_irq(&cpu_rq(cpu)->lock);
+        raw_spin_lock_irq(&cpu_rq(cpu)->lock);
        *cpuusage = val;
-        spin_unlock_irq(&cpu_rq(cpu)->lock);
+        raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
 #else
        *cpuusage = val;
 #endif
@@ -10860,12 +9135,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 }
 /*
+ * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
+ * in cputime_t units. As a result, cpuacct_update_stats calls
+ * percpu_counter_add with values large enough to always overflow the
+ * per cpu batch limit causing bad SMP scalability.
+ *
+ * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
+ * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
+ * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
+ */
+#ifdef CONFIG_SMP
+#define CPUACCT_BATCH   \
+        min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
+#else
+#define CPUACCT_BATCH   0
+#endif
+/*
 * Charge the system/user time to the task's accounting group.
 */
 static void cpuacct_update_stats(struct task_struct *tsk,
                enum cpuacct_stat_index idx, cputime_t val)
 {
        struct cpuacct *ca;
+        int batch = CPUACCT_BATCH;
        if (unlikely(!cpuacct_subsys.active))
                return;
@@ -10874,7 +9167,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
        ca = task_ca(tsk);
        do {
-                percpu_counter_add(&ca->cpustat[idx], val);
+                __percpu_counter_add(&ca->cpustat[idx], val, batch);
                ca = ca->parent;
        } while (ca);
        rcu_read_unlock();
@@ -10973,9 +9266,9 @@ void synchronize_sched_expedited(void)
                init_completion(&req->done);
                req->task = NULL;
                req->dest_cpu = RCU_MIGRATION_NEED_QS;
-                spin_lock_irqsave(&rq->lock, flags);
+                raw_spin_lock_irqsave(&rq->lock, flags);
                list_add(&req->list, &rq->migration_queue);
-                spin_unlock_irqrestore(&rq->lock, flags);
+                raw_spin_unlock_irqrestore(&rq->lock, flags);
                wake_up_process(rq->migration_thread);
        }
        for_each_online_cpu(cpu) {
@@ -10983,13 +9276,14 @@ void synchronize_sched_expedited(void)
                req = &per_cpu(rcu_migration_req, cpu);
                rq = cpu_rq(cpu);
                wait_for_completion(&req->done);
-                spin_lock_irqsave(&rq->lock, flags);
+                raw_spin_lock_irqsave(&rq->lock, flags);
                if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
                        need_full_sync = 1;
                req->dest_cpu = RCU_MIGRATION_IDLE;
-                spin_unlock_irqrestore(&rq->lock, flags);
+                raw_spin_unlock_irqrestore(&rq->lock, flags);
        }
        rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+        synchronize_sched_expedited_count++;
        mutex_unlock(&rcu_sched_expedited_mutex);
        put_online_cpus();
        if (need_full_sync)
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 479ce5682d7c..5b496132c28a 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -236,6 +236,18 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
+/*
+ * cpu_clock - return sched_clock_cpu() for @cpu, sampled with local
+ * interrupts disabled and the previous interrupt state restored afterwards.
+ */
+unsigned long long cpu_clock(int cpu)
+{
+        unsigned long irq_state;
+        unsigned long long now;
+        local_irq_save(irq_state);
+        now = sched_clock_cpu(cpu);
+        local_irq_restore(irq_state);
+        return now;
+}
 #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 void sched_clock_init(void)
@@ -251,17 +263,12 @@ u64 sched_clock_cpu(int cpu)
        return sched_clock();
 }
-#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 unsigned long long cpu_clock(int cpu)
 {
-        unsigned long long clock;
+        return sched_clock_cpu(cpu);
-        unsigned long flags;
+}
-        local_irq_save(flags);
+#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
-        clock = sched_clock_cpu(cpu);
-        local_irq_restore(flags);
-        return clock;
-}
 EXPORT_SYMBOL_GPL(cpu_clock);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 0f052fc674d5..e6871cb3fc83 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -27,6 +27,7 @@
 *  of the License.
 */
+#include <linux/gfp.h>
 #include "sched_cpupri.h"
 /* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -47,9 +48,7 @@ static int convert_prio(int prio)
 }
 #define for_each_cpupri_active(array, idx)                    \
-  for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES);     \
+        for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
-       idx < CPUPRI_NR_PRIORITIES;                            \
-       idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
 /**
 * cpupri_find - find the best (lowest-pri) CPU in the system
@@ -58,7 +57,7 @@ static int convert_prio(int prio)
 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
 *
 * Note: This function returns the recommended CPUs as calculated during the
- * current invokation.  By the time the call returns, the CPUs may have in
+ * current invocation.  By the time the call returns, the CPUs may have in
 * fact changed priorities any number of times.  While not ideal, it is not
 * an issue of correctness since the normal rebalancer logic will correct
 * any discrepancies created by racing against the uncertainty of the current
@@ -135,26 +134,26 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
        if (likely(newpri != CPUPRI_INVALID)) {
                struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
-                spin_lock_irqsave(&vec->lock, flags);
+                raw_spin_lock_irqsave(&vec->lock, flags);
                cpumask_set_cpu(cpu, vec->mask);
                vec->count++;
                if (vec->count == 1)
                        set_bit(newpri, cp->pri_active);
-                spin_unlock_irqrestore(&vec->lock, flags);
+                raw_spin_unlock_irqrestore(&vec->lock, flags);
        }
        if (likely(oldpri != CPUPRI_INVALID)) {
                struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
-                spin_lock_irqsave(&vec->lock, flags);
+                raw_spin_lock_irqsave(&vec->lock, flags);
                vec->count--;
                if (!vec->count)
                        clear_bit(oldpri, cp->pri_active);
                cpumask_clear_cpu(cpu, vec->mask);
-                spin_unlock_irqrestore(&vec->lock, flags);
+                raw_spin_unlock_irqrestore(&vec->lock, flags);
        }
        *currpri = newpri;
@@ -180,7 +179,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
        for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
                struct cpupri_vec *vec = &cp->pri_to_cpu[i];
-                spin_lock_init(&vec->lock);
+                raw_spin_lock_init(&vec->lock);
                vec->count = 0;
                if (!zalloc_cpumask_var(&vec->mask, gfp))
                        goto cleanup;
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9a7e859b8fbf..7cb5bb6b95be 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -12,7 +12,7 @@
 /* values 2-101 are RT priorities 0-99 */
 struct cpupri_vec {
-        spinlock_t lock;
+        raw_spinlock_t lock;
        int        count;
        cpumask_var_t mask;
 };
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index efb84409bc43..19be00ba6123 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
        {
                char path[64];
+                rcu_read_lock();
                cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
+                rcu_read_unlock();
                SEQ_printf(m, " %s", path);
        }
 #endif
@@ -184,7 +186,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
        SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
                        SPLIT_NS(cfs_rq->exec_clock));
-        spin_lock_irqsave(&rq->lock, flags);
+        raw_spin_lock_irqsave(&rq->lock, flags);
        if (cfs_rq->rb_leftmost)
                MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
        last = __pick_last_entity(cfs_rq);
@@ -192,7 +194,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
                max_vruntime = last->vruntime;
        min_vruntime = cfs_rq->min_vruntime;
        rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
-        spin_unlock_irqrestore(&rq->lock, flags);
+        raw_spin_unlock_irqrestore(&rq->lock, flags);
        SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "MIN_vruntime",
                        SPLIT_NS(MIN_vruntime));
        SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "min_vruntime",
@@ -285,12 +287,16 @@ static void print_cpu(struct seq_file *m, int cpu)
 #ifdef CONFIG_SCHEDSTATS
 #define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
+#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
        P(yld_count);
        P(sched_switch);
        P(sched_count);
        P(sched_goidle);
+#ifdef CONFIG_SMP
+        P64(avg_idle);
+#endif
        P(ttwu_count);
        P(ttwu_local);
@@ -305,6 +311,12 @@ static void print_cpu(struct seq_file *m, int cpu)
        print_rq(m, rq, cpu);
 }
+static const char *sched_tunable_scaling_names[] = {
+        "none",         /* indexed by sysctl_sched_tunable_scaling */
+        "logarithmic",
+        "linear"
+};
 static int sched_debug_show(struct seq_file *m, void *v)
 {
        u64 now = ktime_to_ns(ktime_get());
@@ -330,6 +342,10 @@ static int sched_debug_show(struct seq_file *m, void *v)
 #undef PN
 #undef P
+        SEQ_printf(m, "  .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+                sysctl_sched_tunable_scaling,
+                sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
        for_each_online_cpu(cpu)
                print_cpu(m, cpu);
@@ -395,7 +411,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
        PN(se.sum_exec_runtime);
        PN(se.avg_overlap);
        PN(se.avg_wakeup);
-        PN(se.avg_running);
        nr_switches = p->nvcsw + p->nivcsw;
@@ -419,7 +434,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
        P(se.nr_failed_migrations_running);
        P(se.nr_failed_migrations_hot);
        P(se.nr_forced_migrations);
-        P(se.nr_forced2_migrations);
        P(se.nr_wakeups);
        P(se.nr_wakeups_sync);
        P(se.nr_wakeups_migrate);
@@ -495,7 +509,6 @@ void proc_sched_set_task(struct task_struct *p)
        p->se.nr_failed_migrations_running      = 0;
        p->se.nr_failed_migrations_hot          = 0;
        p->se.nr_forced_migrations              = 0;
-        p->se.nr_forced2_migrations             = 0;
        p->se.nr_wakeups                        = 0;
        p->se.nr_wakeups_sync                   = 0;
        p->se.nr_wakeups_migrate                = 0;
@@ -507,8 +520,4 @@ void proc_sched_set_task(struct task_struct *p)
        p->se.nr_wakeups_idle                   = 0;
        p->sched_info.bkl_count                 = 0;
 #endif
-        p->se.sum_exec_runtime                  = 0;
-        p->se.prev_sum_exec_runtime             = 0;
-        p->nvcsw                                = 0;
-        p->nivcsw                               = 0;
 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ef43ff95999d..b1af6d42c024 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
 */
 #include <linux/latencytop.h>
+#include <linux/sched.h>
 /*
 * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
 *  run vmstat and monitor the context-switches (cs) field)
 */
 unsigned int sysctl_sched_latency = 5000000ULL;
+unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+/*
+ * The initial- and re-scaling of tunables is configurable
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
+ *
+ * Options are:
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
+ */
+enum sched_tunable_scaling sysctl_sched_tunable_scaling
+        = SCHED_TUNABLESCALING_LOG;
 /*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
 unsigned int sysctl_sched_min_granularity = 1000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
 /*
 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 * have immediate wakeup/sleep latencies.
 */
 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 */
 #ifdef CONFIG_SCHED_DEBUG
-int sched_nr_latency_handler(struct ctl_table *table, int write,
+int sched_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+        int factor = get_update_sysctl_factor();
        if (ret || !write)
                return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
        sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
                                        sysctl_sched_min_granularity);
+#define WRT_SYSCTL(name) \
+        (normalized_sysctl_##name = sysctl_##name / (factor))
+        WRT_SYSCTL(sched_min_granularity);
+        WRT_SYSCTL(sched_latency);
+        WRT_SYSCTL(sched_wakeup_granularity);
+        WRT_SYSCTL(sched_shares_ratelimit);
+#undef WRT_SYSCTL
        return 0;
 }
 #endif
@@ -485,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
        curr->sum_exec_runtime += delta_exec;
        schedstat_add(cfs_rq, exec_clock, delta_exec);
        delta_exec_weighted = calc_delta_fair(delta_exec, curr);
        curr->vruntime += delta_exec_weighted;
        update_min_vruntime(cfs_rq);
 }
@@ -740,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
        se->vruntime = vruntime;
 }
+#define ENQUEUE_WAKEUP  1
+#define ENQUEUE_MIGRATE 2
 static void
-enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
+enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
        /*
+         * Update the normalized vruntime before updating min_vruntime
+         * through calling update_curr().
+         */
+        if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
+                se->vruntime += cfs_rq->min_vruntime;
+        /*
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);
        account_entity_enqueue(cfs_rq, se);
-        if (wakeup) {
+        if (flags & ENQUEUE_WAKEUP) {
                place_entity(cfs_rq, se, 0);
                enqueue_sleeper(cfs_rq, se);
        }
@@ -803,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
                __dequeue_entity(cfs_rq, se);
        account_entity_dequeue(cfs_rq, se);
        update_min_vruntime(cfs_rq);
+        /*
+         * Normalize the entity after updating the min_vruntime because the
+         * update can refer to the ->curr item and we need to reflect this
+         * movement in our normalized position.
+         */
+        if (!sleep)
+                se->vruntime -= cfs_rq->min_vruntime;
 }
 /*
@@ -1009,17 +1053,24 @@ static inline void hrtick_update(struct rq *rq)
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
-static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
+static void
+enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
+        int flags = 0;
+        if (wakeup)
+                flags |= ENQUEUE_WAKEUP;
+        if (p->state == TASK_WAKING)
+                flags |= ENQUEUE_MIGRATE;
        for_each_sched_entity(se) {
                if (se->on_rq)
                        break;
                cfs_rq = cfs_rq_of(se);
-                enqueue_entity(cfs_rq, se, wakeup);
+                enqueue_entity(cfs_rq, se, flags);
-                wakeup = 1;
+                flags = ENQUEUE_WAKEUP;
        }
        hrtick_update(rq);
@@ -1095,6 +1146,14 @@ static void yield_task_fair(struct rq *rq)
 #ifdef CONFIG_SMP
+static void task_waking_fair(struct rq *rq, struct task_struct *p)
+{
+        struct sched_entity *se = &p->se;
+        struct cfs_rq *cfs_rq = cfs_rq_of(se);
+        se->vruntime -= cfs_rq->min_vruntime;
+}
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
 * effective_load() calculates the load change as seen from the root_task_group
@@ -1345,6 +1404,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 }
 /*
+ * Try and locate an idle CPU in the sched_domain.
+ */
+static int
+select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+{
+        int cpu = smp_processor_id();
+        int prev_cpu = task_cpu(p);
+        int i;
+        /*
+         * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
+         * test in select_task_rq_fair) and the prev_cpu is idle then that's
+         * always a better target than the current cpu.
+         */
+        if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+                return prev_cpu;
+        /*
+         * Otherwise, iterate the domain and find an eligible idle cpu.
+         */
+        for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+                if (!cpu_rq(i)->cfs.nr_running) {
+                        target = i;
+                        break;
+                }
+        }
+        return target;
+}
+/*
 * sched_balance_self: balance the current task (running on cpu) in domains
 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
 * SD_BALANCE_EXEC.
@@ -1372,8 +1462,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                new_cpu = prev_cpu;
        }
-        rcu_read_lock();
        for_each_domain(cpu, tmp) {
+                if (!(tmp->flags & SD_LOAD_BALANCE))
+                        continue;
                /*
                 * If power savings logic is enabled for a domain, see if we
                 * are not overloaded, if so, don't balance wider.
@@ -1398,11 +1490,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                                want_sd = 0;
                }
-                if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+                /*
-                    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+                 * While iterating the domains looking for a spanning
+                 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
+                 * in cache sharing domains along the way.
+                 */
+                if (want_affine) {
+                        int target = -1;
+                        /*
+                         * If both cpu and prev_cpu are part of this domain,
+                         * cpu is a valid SD_WAKE_AFFINE target.
+                         */
+                        if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
+                                target = cpu;
-                        affine_sd = tmp;
+                        /*
-                        want_affine = 0;
+                         * If there's an idle sibling in this domain, make that
+                         * the wake_affine target instead of the current cpu.
+                         */
+                        if (tmp->flags & SD_SHARE_PKG_RESOURCES)
+                                target = select_idle_sibling(p, tmp, target);
+                        if (target >= 0) {
+                                if (tmp->flags & SD_WAKE_AFFINE) {
+                                        affine_sd = tmp;
+                                        want_affine = 0;
+                                }
+                                cpu = target;
+                        }
                }
                if (!want_sd && !want_affine)
@@ -1429,10 +1545,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                        update_shares(tmp);
        }
-        if (affine_sd && wake_affine(affine_sd, p, sync)) {
+        if (affine_sd && wake_affine(affine_sd, p, sync))
-                new_cpu = cpu;
+                return cpu;
-                goto out;
-        }
        while (sd) {
                int load_idx = sd->forkexec_idx;
@@ -1473,8 +1587,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                /* while loop will break here if sd == NULL */
        }
-out:
-        rcu_read_unlock();
        return new_cpu;
 }
 #endif /* CONFIG_SMP */
@@ -1596,12 +1708,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
        int sync = wake_flags & WF_SYNC;
        int scale = cfs_rq->nr_running >= sched_nr_latency;
-        update_curr(cfs_rq);
+        if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS)
+                goto preempt;
-        if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS) {
-                resched_task(curr);
-                return;
-        }
        if (unlikely(p->sched_class != &fair_sched_class))
                return;
@@ -1627,50 +1735,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
                return;
        /* Idle tasks are by definition preempted by everybody. */
-        if (unlikely(curr->policy == SCHED_IDLE)) {
+        if (unlikely(curr->policy == SCHED_IDLE))
-                resched_task(curr);
+                goto preempt;
-                return;
-        }
-        if ((sched_feat(WAKEUP_SYNC) && sync) ||
+        if (sched_feat(WAKEUP_SYNC) && sync)
-            (sched_feat(WAKEUP_OVERLAP) &&
+                goto preempt;
-             (se->avg_overlap < sysctl_sched_migration_cost &&
-              pse->avg_overlap < sysctl_sched_migration_cost))) {
-                resched_task(curr);
-                return;
-        }
-        if (sched_feat(WAKEUP_RUNNING)) {
+        if (sched_feat(WAKEUP_OVERLAP) &&
-                if (pse->avg_running < se->avg_running) {
+                        se->avg_overlap < sysctl_sched_migration_cost &&
-                        set_next_buddy(pse);
+                        pse->avg_overlap < sysctl_sched_migration_cost)
-                        resched_task(curr);
+                goto preempt;
-                        return;
-                }
-        }
        if (!sched_feat(WAKEUP_PREEMPT))
                return;
+        update_curr(cfs_rq);
        find_matching_se(&se, &pse);
        BUG_ON(!pse);
+        if (wakeup_preempt_entity(se, pse) == 1)
+                goto preempt;
-        if (wakeup_preempt_entity(se, pse) == 1) {
+        return;
-                resched_task(curr);
-                /*
+preempt:
-                 * Only set the backward buddy when the current task is still
+        resched_task(curr);
-                 * on the rq. This can happen when a wakeup gets interleaved
+        /*
-                 * with schedule on the ->pre_schedule() or idle_balance()
+         * Only set the backward buddy when the current task is still
-                 * point, either of which can * drop the rq lock.
+         * on the rq. This can happen when a wakeup gets interleaved
-                 *
+         * with schedule on the ->pre_schedule() or idle_balance()
-                 * Also, during early boot the idle thread is in the fair class,
+         * point, either of which can drop the rq lock.
-                 * for obvious reasons its a bad idea to schedule back to it.
+         *
-                 */
+         * Also, during early boot the idle thread is in the fair class,
-                if (unlikely(!se->on_rq || curr == rq->idle))
+         * for obvious reasons it's a bad idea to schedule back to it.
-                        return;
+         */
-                if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+        if (unlikely(!se->on_rq || curr == rq->idle))
-                        set_last_buddy(se);
+                return;
-        }
+        if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+                set_last_buddy(se);
 }
 static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1679,7 +1781,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
        struct cfs_rq *cfs_rq = &rq->cfs;
        struct sched_entity *se;
-        if (unlikely(!cfs_rq->nr_running))
+        if (!cfs_rq->nr_running)
                return NULL;
        do {
@@ -1714,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 */
 /*
- * Load-balancing iterator. Note: while the runqueue stays locked
+ * pull_task - move a task from a remote runqueue to the local runqueue.
- * during the whole iteration, the current task might be
+ * Both runqueues must be locked.
- * dequeued so the iterator has to be dequeue-safe. Here we
- * achieve that by always pre-iterating before returning
- * the current task:
 */
-static struct task_struct *
+static void pull_task(struct rq *src_rq, struct task_struct *p,
-__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
+                      struct rq *this_rq, int this_cpu)
 {
-        struct task_struct *p = NULL;
+        deactivate_task(src_rq, p, 0);
-        struct sched_entity *se;
+        set_task_cpu(p, this_cpu);
+        activate_task(this_rq, p, 0);
+        check_preempt_curr(this_rq, p, 0);
+}
-        if (next == &cfs_rq->tasks)
+/*
-                return NULL;
+ * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ */
+static
+int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
+                     struct sched_domain *sd, enum cpu_idle_type idle,
+                     int *all_pinned)
+{
+        int tsk_cache_hot = 0;
+        /*
+         * We do not migrate tasks that are:
+         * 1) running (obviously), or
+         * 2) cannot be migrated to this CPU due to cpus_allowed, or
+         * 3) are cache-hot on their current CPU.
+         */
+        if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
+                schedstat_inc(p, se.nr_failed_migrations_affine);
+                return 0;
+        }
+        *all_pinned = 0;
-        se = list_entry(next, struct sched_entity, group_node);
+        if (task_running(rq, p)) {
-        p = task_of(se);
+                schedstat_inc(p, se.nr_failed_migrations_running);
-        cfs_rq->balance_iterator = next->next;
+                return 0;
+        }
-        return p;
+        /*
-}
+         * Aggressive migration if:
+         * 1) task is cache cold, or
+         * 2) too many balance attempts have failed.
+         */
-static struct task_struct *load_balance_start_fair(void *arg)
+        tsk_cache_hot = task_hot(p, rq->clock, sd);
-{
+        if (!tsk_cache_hot ||
-        struct cfs_rq *cfs_rq = arg;
+                sd->nr_balance_failed > sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+                if (tsk_cache_hot) {
+                        schedstat_inc(sd, lb_hot_gained[idle]);
+                        schedstat_inc(p, se.nr_forced_migrations);
+                }
+#endif
+                return 1;
+        }
-        return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next);
+        if (tsk_cache_hot) {
+                schedstat_inc(p, se.nr_failed_migrations_hot);
+                return 0;
+        }
+        return 1;
 }
-static struct task_struct *load_balance_next_fair(void *arg)
+/*
+ * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * part of active balancing operations within "domain".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int
+move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+              struct sched_domain *sd, enum cpu_idle_type idle)
 {
-        struct cfs_rq *cfs_rq = arg;
+        struct task_struct *p, *n;
+        struct cfs_rq *cfs_rq;
+        int pinned = 0;
-        return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
+        for_each_leaf_cfs_rq(busiest, cfs_rq) {
+                list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
+                        if (!can_migrate_task(p, busiest, this_cpu,
+                                                sd, idle, &pinned))
+                                continue;
+                        pull_task(busiest, p, this_rq, this_cpu);
+                        /*
+                         * Right now, this is only the second place pull_task()
+                         * is called, so we can safely collect pull_task()
+                         * stats here rather than inside pull_task().
+                         */
+                        schedstat_inc(sd, lb_gained[idle]);
+                        return 1;
+                }
+        }
+        return 0;
 }
 static unsigned long
-__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                unsigned long max_load_move, struct sched_domain *sd,
+              unsigned long max_load_move, struct sched_domain *sd,
-                enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+              enum cpu_idle_type idle, int *all_pinned,
-                struct cfs_rq *cfs_rq)
+              int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
 {
-        struct rq_iterator cfs_rq_iterator;
+        int loops = 0, pulled = 0, pinned = 0;
+        long rem_load_move = max_load_move;
+        struct task_struct *p, *n;
-        cfs_rq_iterator.start = load_balance_start_fair;
+        if (max_load_move == 0)
-        cfs_rq_iterator.next = load_balance_next_fair;
+                goto out;
-        cfs_rq_iterator.arg = cfs_rq;
-        return balance_tasks(this_rq, this_cpu, busiest,
+        pinned = 1;
-                        max_load_move, sd, idle, all_pinned,
-                        this_best_prio, &cfs_rq_iterator);
+        list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
+                if (loops++ > sysctl_sched_nr_migrate)
+                        break;
+                if ((p->se.load.weight >> 1) > rem_load_move ||
+                    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
+                        continue;
+                pull_task(busiest, p, this_rq, this_cpu);
+                pulled++;
+                rem_load_move -= p->se.load.weight;
+#ifdef CONFIG_PREEMPT
+                /*
+                 * NEWIDLE balancing is a source of latency, so preemptible
+                 * kernels will stop after the first task is pulled to minimize
+                 * the critical section.
+                 */
+                if (idle == CPU_NEWLY_IDLE)
+                        break;
+#endif
+                /*
+                 * We only want to steal up to the prescribed amount of
+                 * weighted load.
+                 */
+                if (rem_load_move <= 0)
+                        break;
+                if (p->prio < *this_best_prio)
+                        *this_best_prio = p->prio;
+        }
+out:
+        /*
+         * Right now, this is one of only two places pull_task() is called,
+         * so we can safely collect pull_task() stats here rather than
+         * inside pull_task().
+         */
+        schedstat_add(sd, lb_gained[idle], pulled);
+        if (all_pinned)
+                *all_pinned = pinned;
+        return max_load_move - rem_load_move;
 }
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1796,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                rem_load = (u64)rem_load_move * busiest_weight;
                rem_load = div_u64(rem_load, busiest_h_load + 1);
-                moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+                moved_load = balance_tasks(this_rq, this_cpu, busiest,
                                rem_load, sd, idle, all_pinned, this_best_prio,
-                                tg->cfs_rq[busiest_cpu]);
+                                busiest_cfs_rq);
                if (!moved_load)
                        continue;
@@ -1821,35 +2030,1529 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  struct sched_domain *sd, enum cpu_idle_type idle,
                  int *all_pinned, int *this_best_prio)
 {
-        return __load_balance_fair(this_rq, this_cpu, busiest,
+        return balance_tasks(this_rq, this_cpu, busiest,
                        max_load_move, sd, idle, all_pinned,
                        this_best_prio, &busiest->cfs);
 }
 #endif
-static int
+/*
-move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ * move_tasks tries to move up to max_load_move weighted load from busiest to
-                   struct sched_domain *sd, enum cpu_idle_type idle)
+ * this_rq, as part of a balancing operation within domain "sd".
+ * Returns 1 if any load was moved and 0 otherwise.
+ *
+ * load_balance_fair() is called repeatedly until a pass moves nothing or
+ * max_load_move has been reached. With CONFIG_PREEMPT the loop also stops
+ * early when a newly idle this_rq has gained a runnable task, or when either
+ * runqueue lock is contended.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                      unsigned long max_load_move,
+                      struct sched_domain *sd, enum cpu_idle_type idle,
+                      int *all_pinned)
 {
-        struct cfs_rq *busy_cfs_rq;
+        unsigned long total_load_moved = 0, load_moved;
-        struct rq_iterator cfs_rq_iterator;
+        int this_best_prio = this_rq->curr->prio;
-        cfs_rq_iterator.start = load_balance_start_fair;
+        do {
-        cfs_rq_iterator.next = load_balance_next_fair;
+                load_moved = load_balance_fair(this_rq, this_cpu, busiest,
+                                max_load_move - total_load_moved,
+                                sd, idle, all_pinned, &this_best_prio);
-        for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+                total_load_moved += load_moved;
+#ifdef CONFIG_PREEMPT
                 /*
-                 * pass busy_cfs_rq argument into
+                 * NEWIDLE balancing is a source of latency, so preemptible
-                 * load_balance_[start|next]_fair iterators
+                 * kernels will stop after the first task is pulled to minimize
+                 * the critical section.
                  */
-                cfs_rq_iterator.arg = busy_cfs_rq;
+                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
-                if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
+                        break;
-                                       &cfs_rq_iterator))
-                    return 1;
+                if (raw_spin_is_contended(&this_rq->lock) ||
+                                raw_spin_is_contended(&busiest->lock))
+                        break;
+#endif
+        } while (load_moved && max_load_move > total_load_moved);
+        return total_load_moved > 0;
+}
+/********** Helpers for find_busiest_group ************************/
+/*
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ *              during load balancing.
+ *
+ * Filled in by update_sd_lb_stats(); find_busiest_group() zeroes it first
+ * and computes avg_load afterwards.
+ */
+struct sd_lb_stats {
+        struct sched_group *busiest; /* Busiest group in this sd */
+        struct sched_group *this;  /* Local group in this sd */
+        unsigned long total_load;  /* Total load of all groups in sd */
+        unsigned long total_pwr;   /* Total power of all groups in sd */
+        unsigned long avg_load;    /* Average load across all groups in sd */
+        /* Statistics of this (the local) group */
+        unsigned long this_load; /* avg_load of the local group */
+        unsigned long this_load_per_task; /* weighted load sum; divided by this_nr_running in fix_small_imbalance() */
+        unsigned long this_nr_running; /* sum_nr_running of the local group */
+        /* Statistics of the busiest group */
+        unsigned long max_load; /* avg_load of the busiest group */
+        unsigned long busiest_load_per_task; /* weighted load sum; divided by busiest_nr_running in calculate_imbalance() */
+        unsigned long busiest_nr_running; /* sum_nr_running of the busiest group */
+        unsigned long busiest_group_capacity; /* group_capacity of the busiest group */
+        int group_imb; /* group_imb flag copied from the busiest group */
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+        int power_savings_balance; /* Is powersave balance needed for this sd */
+        struct sched_group *group_min; /* Least loaded group in sd */
+        struct sched_group *group_leader; /* Group which relieves group_min */
+        unsigned long min_load_per_task; /* load_per_task in group_min */
+        unsigned long leader_nr_running; /* Nr running of group_leader */
+        unsigned long min_nr_running; /* Nr running of group_min */
+#endif
+};
+/*
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ *
+ * Filled in by update_sg_lb_stats(), which accumulates into the counters,
+ * so the structure must be zeroed before each group is processed.
+ */
+struct sg_lb_stats {
+        unsigned long avg_load; /* group_load scaled by the group's cpu_power */
+        unsigned long group_load; /* Total load over the CPUs of the group */
+        unsigned long sum_nr_running; /* Nr tasks running in the group */
+        unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+        unsigned long group_capacity; /* cpu_power / SCHED_LOAD_SCALE, rounded to closest */
+        int group_imb; /* Is there an imbalance in the group ? */
+};
+/**
+ * group_first_cpu - Look up the first cpu in a sched_group's cpumask.
+ * @group: The sched_group to inspect.
+ *
+ * Returns whatever cpumask_first() reports for the group's cpu span.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+        const struct cpumask *span = sched_group_cpus(group);
+        return cpumask_first(span);
+}
+/**
+ * get_sd_load_idx - Obtain the load index for a given sched domain.
+ * @sd: The sched_domain whose load_idx is to be obtained.
+ * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
+ *
+ * Returns sd->busy_idx for CPU_NOT_IDLE, sd->newidle_idx for CPU_NEWLY_IDLE
+ * and sd->idle_idx for every other idle state.
+ */
+static inline int get_sd_load_idx(struct sched_domain *sd,
+                                        enum cpu_idle_type idle)
+{
+        int load_idx;
+        switch (idle) {
+        case CPU_NOT_IDLE:
+                load_idx = sd->busy_idx;
+                break;
+        case CPU_NEWLY_IDLE:
+                load_idx = sd->newidle_idx;
+                break;
+        default:
+                load_idx = sd->idle_idx;
+                break;
         }
+        return load_idx;
+}
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * init_sd_power_savings_stats - Reset the power-savings part of the
+ * sched_domain statistics before a load-balancing pass.
+ *
+ * @sd: Sched domain whose power-savings statistics are being set up.
+ * @sds: Statistics container for sd.
+ * @idle: Idle status of the CPU on whose behalf we are balancing.
+ */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+        struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+        /*
+         * A busy processor never takes part in power-savings balancing,
+         * and neither does a domain that lacks SD_POWERSAVINGS_BALANCE.
+         */
+        if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) {
+                sds->power_savings_balance = 0;
+                return;
+        }
+        sds->power_savings_balance = 1;
+        sds->min_nr_running = ULONG_MAX;
+        sds->leader_nr_running = 0;
+}
+/**
+ * update_sd_power_savings_stats - Update the power saving stats for a
+ * sched_domain while performing load balancing.
+ *
+ * @group: sched_group belonging to the sched_domain under consideration.
+ * @sds: Variable containing the statistics of the sched_domain
+ * @local_group: Does group contain the CPU for which we're performing
+ *              load balancing ?
+ * @sgs: Variable containing the statistics of the group.
+ *
+ * Does nothing once sds->power_savings_balance is clear. Otherwise it may
+ * clear that flag (local group idle or at capacity), and for groups that are
+ * neither idle nor at capacity it tracks two candidates in @sds: group_min,
+ * the group with the fewest running tasks, and group_leader, the group with
+ * the most running tasks that still has room for at least one more.
+ */
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+        struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+        if (!sds->power_savings_balance)
+                return;
+        /*
+         * If the local group is idle or completely loaded
+         * no need to do power savings balance at this domain
+         */
+        if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
+                                !sds->this_nr_running))
+                sds->power_savings_balance = 0;
+        /*
+         * If a group is already running at full capacity or idle,
+         * don't include that group in power savings calculations
+         */
+        if (!sds->power_savings_balance ||
+                sgs->sum_nr_running >= sgs->group_capacity ||
+                !sgs->sum_nr_running)
+                return;
+        /*
+         * Calculate the group which has the least non-idle load.
+         * This is the group from where we need to pick up the load
+         * for saving power
+         *
+         * On a tie the group with the higher first cpu wins. The
+         * group_min dereference is only reached when the first test
+         * fails, i.e. after min_nr_running has been lowered from its
+         * ULONG_MAX initial value.
+         */
+        if ((sgs->sum_nr_running < sds->min_nr_running) ||
+            (sgs->sum_nr_running == sds->min_nr_running &&
+             group_first_cpu(group) > group_first_cpu(sds->group_min))) {
+                sds->group_min = group;
+                sds->min_nr_running = sgs->sum_nr_running;
+                sds->min_load_per_task = sgs->sum_weighted_load /
+                                                sgs->sum_nr_running;
+        }
+        /*
+         * Calculate the group which is almost near its
+         * capacity but still has some space to pick up some load
+         * from other group and save more power
+         *
+         * On a tie the group with the lower first cpu wins.
+         */
+        if (sgs->sum_nr_running + 1 > sgs->group_capacity)
+                return;
+        if (sgs->sum_nr_running > sds->leader_nr_running ||
+            (sgs->sum_nr_running == sds->leader_nr_running &&
+             group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
+                sds->group_leader = group;
+                sds->leader_nr_running = sgs->sum_nr_running;
+        }
+}
+/**
+ * check_power_save_busiest_group - decide whether a power-savings pull is possible
+ * @sds: Statistics of the sched_domain under consideration.
+ * @this_cpu: Cpu at which we're currently performing load-balancing.
+ * @imbalance: Variable to store the imbalance.
+ *
+ * Description:
+ * When power-savings balancing is active and the local group is the
+ * group leader (and is not also the least loaded group), nominate the
+ * least loaded group as the busiest one so that its CPUs can be put to
+ * idle, and report one task's worth of its load as the imbalance.
+ *
+ * Returns 1 if there is potential to perform power-savings balance.
+ * Else returns 0.
+ */
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+                                        int this_cpu, unsigned long *imbalance)
+{
+        if (!sds->power_savings_balance)
+                return 0;
+        /* Only the leader group pulls, and never from itself. */
+        if (sds->group_leader != sds->this ||
+            sds->group_min == sds->group_leader)
+                return 0;
+        sds->busiest = sds->group_min;
+        *imbalance = sds->min_load_per_task;
+        return 1;
+}
+#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+/* Power-savings balancing is compiled out: there is nothing to initialize. */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+        struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+}
+/* Power-savings balancing is compiled out: there is nothing to update. */
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+        struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+}
+/* Power-savings balancing is compiled out: always reports no opportunity (0). */
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+                                        int this_cpu, unsigned long *imbalance)
+{
         return 0;
 }
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+/*
+ * default_scale_freq_power - generic frequency scaling factor.
+ * Ignores both arguments and always reports SCHED_LOAD_SCALE.
+ */
+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+        const unsigned long unscaled = SCHED_LOAD_SCALE;
+        return unscaled;
+}
+/*
+ * arch_scale_freq_power - weak hook for the frequency scaling factor.
+ * This fallback simply forwards to default_scale_freq_power().
+ */
+unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+        unsigned long scale = default_scale_freq_power(sd, cpu);
+        return scale;
+}
+/*
+ * default_scale_smt_power - generic SMT scaling factor.
+ * Splits sd->smt_gain evenly across the cpus spanned by the domain.
+ */
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+        unsigned long nr_siblings = cpumask_weight(sched_domain_span(sd));
+        unsigned long gain = sd->smt_gain;
+        return gain / nr_siblings;
+}
+/*
+ * arch_scale_smt_power - weak hook for the SMT scaling factor.
+ * This fallback simply forwards to default_scale_smt_power().
+ */
+unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+        unsigned long scale = default_scale_smt_power(sd, cpu);
+        return scale;
+}
+/*
+ * scale_rt_power - scaling factor for the share of a cpu's averaging
+ * window that is not accounted to rq->rt_avg.
+ * @cpu: cpu whose runqueue is inspected.
+ *
+ * The window is sched_avg_period() plus the time elapsed since
+ * rq->age_stamp. The result is (window - rt_avg) divided by the window
+ * scaled down by SCHED_LOAD_SHIFT, i.e. a factor relative to
+ * SCHED_LOAD_SCALE.
+ * NOTE(review): rt_avg presumably tracks time consumed by RT tasks -
+ * confirm against sched_rt_avg_update().
+ */
+unsigned long scale_rt_power(int cpu)
+{
+        struct rq *rq = cpu_rq(cpu);
+        u64 total, available, avg;
+        sched_avg_update(rq);
+        total = sched_avg_period() + (rq->clock - rq->age_stamp);
+        avg = rq->rt_avg;
+        /*
+         * Both values are unsigned: if rt_avg exceeds the window the
+         * subtraction would wrap around to a huge "available" time and
+         * inflate the cpu power, so clamp it to zero instead.
+         */
+        if (unlikely(total < avg))
+                available = 0;
+        else
+                available = total - avg;
+        /* Keep the divisor at least 1 after the shift below. */
+        if (unlikely((s64)total < SCHED_LOAD_SCALE))
+                total = SCHED_LOAD_SCALE;
+        total >>= SCHED_LOAD_SHIFT;
+        return div_u64(available, total);
+}
+/*
+ * update_cpu_power - recompute cpu_power of sd's first group.
+ *
+ * Starts from SCHED_LOAD_SCALE and applies, in order, the frequency
+ * factor, the SMT factor (only for SD_SHARE_CPUPOWER domains spanning
+ * more than one cpu) and the scale_rt_power() factor. Each factor is
+ * relative to SCHED_LOAD_SCALE, hence the shift after every multiply.
+ * The arch_* hooks are used when the ARCH_POWER feature is set, the
+ * default_* helpers otherwise. The result is never stored as 0.
+ */
+static void update_cpu_power(struct sched_domain *sd, int cpu)
+{
+        unsigned long nr_cpus = cpumask_weight(sched_domain_span(sd));
+        struct sched_group *sdg = sd->groups;
+        unsigned long power = SCHED_LOAD_SCALE;
+        unsigned long factor;
+        factor = sched_feat(ARCH_POWER) ? arch_scale_freq_power(sd, cpu) :
+                                          default_scale_freq_power(sd, cpu);
+        power = (power * factor) >> SCHED_LOAD_SHIFT;
+        if ((sd->flags & SD_SHARE_CPUPOWER) && nr_cpus > 1) {
+                factor = sched_feat(ARCH_POWER) ?
+                                arch_scale_smt_power(sd, cpu) :
+                                default_scale_smt_power(sd, cpu);
+                power = (power * factor) >> SCHED_LOAD_SHIFT;
+        }
+        power = (power * scale_rt_power(cpu)) >> SCHED_LOAD_SHIFT;
+        sdg->cpu_power = power ? power : 1;
+}
+/*
+ * update_group_power - refresh cpu_power of sd's first group.
+ *
+ * A domain without a child gets its power from update_cpu_power().
+ * Otherwise the power is the sum of cpu_power over every group of the
+ * child domain.
+ */
+static void update_group_power(struct sched_domain *sd, int cpu)
+{
+        struct sched_domain *child = sd->child;
+        struct sched_group *sdg = sd->groups;
+        struct sched_group *pos;
+        unsigned long sum = 0;
+        if (!child) {
+                update_cpu_power(sd, cpu);
+                return;
+        }
+        /* The child's groups form a ring; walk it exactly once. */
+        pos = child->groups;
+        do {
+                sum += pos->cpu_power;
+                pos = pos->next;
+        } while (pos != child->groups);
+        sdg->cpu_power = sum;
+}
+/**
+ * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @sd: The sched_domain whose statistics are to be updated.
+ * @group: sched_group whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @local_group: Does group contain this_cpu.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sgs: variable to hold the statistics for this group.
+ *
+ * @sgs is accumulated into with "+=", so it is expected to arrive zeroed.
+ * When this_cpu is not the cpu chosen to balance the local group (and
+ * @idle is not CPU_NEWLY_IDLE), *@balance is cleared and the function
+ * returns before avg_load, group_imb and group_capacity are filled in.
+ * *@sd_idle is cleared as soon as any visited cpu has a running task.
+ */
+static inline void update_sg_lb_stats(struct sched_domain *sd,
+                        struct sched_group *group, int this_cpu,
+                        enum cpu_idle_type idle, int load_idx, int *sd_idle,
+                        int local_group, const struct cpumask *cpus,
+                        int *balance, struct sg_lb_stats *sgs)
+{
+        unsigned long load, max_cpu_load, min_cpu_load;
+        int i;
+        unsigned int balance_cpu = -1, first_idle_cpu = 0; /* first_idle_cpu is a "found one" flag */
+        unsigned long avg_load_per_task = 0;
+        if (local_group)
+                balance_cpu = group_first_cpu(group);
+        /* Tally up the load of all CPUs in the group */
+        max_cpu_load = 0;
+        min_cpu_load = ~0UL;
+        for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+                struct rq *rq = cpu_rq(i);
+                if (*sd_idle && rq->nr_running)
+                        *sd_idle = 0;
+                /* Bias balancing toward cpus of our domain */
+                if (local_group) {
+                        if (idle_cpu(i) && !first_idle_cpu) {
+                                first_idle_cpu = 1;
+                                balance_cpu = i;
+                        }
+                        load = target_load(i, load_idx);
+                } else {
+                        load = source_load(i, load_idx);
+                        if (load > max_cpu_load)
+                                max_cpu_load = load;
+                        if (min_cpu_load > load)
+                                min_cpu_load = load;
+                }
+                sgs->group_load += load;
+                sgs->sum_nr_running += rq->nr_running;
+                sgs->sum_weighted_load += weighted_cpuload(i);
+        }
+        /*
+         * First idle cpu or the first cpu(busiest) in this sched group
+         * is eligible for doing load balancing at this and above
+         * domains. In the newly idle case, we will allow all the cpu's
+         * to do the newly idle load balance.
+         */
+        if (idle != CPU_NEWLY_IDLE && local_group &&
+            balance_cpu != this_cpu) {
+                *balance = 0;
+                return;
+        }
+        update_group_power(sd, this_cpu);
+        /* Adjust by relative CPU power of the group */
+        sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
+        /*
+         * Consider the group unbalanced when the imbalance is larger
+         * than the average weight of two tasks.
+         *
+         * APZ: with cgroup the avg task weight can vary wildly and
+         *      might not be a suitable number - should we keep a
+         *      normalized nr_running number somewhere that negates
+         *      the hierarchy?
+         *
+         * max_cpu_load/min_cpu_load are only tracked for non-local
+         * groups; for the local group they keep their 0 / ~0UL initial
+         * values here.
+         */
+        if (sgs->sum_nr_running)
+                avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+                sgs->group_imb = 1;
+        sgs->group_capacity =
+                DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+}
+/**
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
+ * @sd: sched_domain whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sds: variable to hold the statistics for this sched_domain.
+ *
+ * Walks every group of @sd once, gathers its statistics with
+ * update_sg_lb_stats() and folds them into @sds: totals, the local group's
+ * figures, the busiest candidate and the power-savings candidates.
+ * Returns early, leaving @sds only partially filled, when the local group
+ * reports *@balance == 0.
+ */
+static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+                        enum cpu_idle_type idle, int *sd_idle,
+                        const struct cpumask *cpus, int *balance,
+                        struct sd_lb_stats *sds)
+{
+        struct sched_domain *child = sd->child;
+        struct sched_group *group = sd->groups;
+        struct sg_lb_stats sgs;
+        int load_idx, prefer_sibling = 0;
+        if (child && child->flags & SD_PREFER_SIBLING)
+                prefer_sibling = 1;
+        init_sd_power_savings_stats(sd, sds, idle);
+        load_idx = get_sd_load_idx(sd, idle);
+        do {
+                int local_group;
+                local_group = cpumask_test_cpu(this_cpu,
+                                               sched_group_cpus(group));
+                memset(&sgs, 0, sizeof(sgs)); /* update_sg_lb_stats() accumulates */
+                update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+                                local_group, cpus, balance, &sgs);
+                if (local_group && !(*balance))
+                        return;
+                sds->total_load += sgs.group_load;
+                sds->total_pwr += group->cpu_power;
+                /*
+                 * In case the child domain prefers tasks go to siblings
+                 * first, lower the group capacity to one so that we'll try
+                 * and move all the excess tasks away.
+                 */
+                if (prefer_sibling)
+                        sgs.group_capacity = min(sgs.group_capacity, 1UL);
+                if (local_group) {
+                        sds->this_load = sgs.avg_load;
+                        sds->this = group;
+                        sds->this_nr_running = sgs.sum_nr_running;
+                        sds->this_load_per_task = sgs.sum_weighted_load; /* not yet per task */
+                } else if (sgs.avg_load > sds->max_load &&
+                           (sgs.sum_nr_running > sgs.group_capacity ||
+                                sgs.group_imb)) {
+                        sds->max_load = sgs.avg_load;
+                        sds->busiest = group;
+                        sds->busiest_nr_running = sgs.sum_nr_running;
+                        sds->busiest_group_capacity = sgs.group_capacity;
+                        sds->busiest_load_per_task = sgs.sum_weighted_load; /* not yet per task */
+                        sds->group_imb = sgs.group_imb;
+                }
+                update_sd_power_savings_stats(group, sds, local_group, &sgs);
+                group = group->next;
+        } while (group != sd->groups);
+}
+/**
+ * fix_small_imbalance - Calculate the minor imbalance that exists
+ *                      amongst the groups of a sched_domain, during
+ *                      load balancing.
+ * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: Variable to store the imbalance.
+ *
+ * Sets *@imbalance to one busiest-group task's worth of load when either
+ * the load gap is large enough or moving a task would raise the total CPU
+ * power in use; otherwise *@imbalance is left untouched. As a side effect
+ * sds->this_load_per_task is converted to a per-task value.
+ */
+static inline void fix_small_imbalance(struct sd_lb_stats *sds,
+                                int this_cpu, unsigned long *imbalance)
+{
+        unsigned long tmp, pwr_now = 0, pwr_move = 0;
+        unsigned int imbn = 2;
+        unsigned long scaled_busy_load_per_task;
+        if (sds->this_nr_running) {
+                sds->this_load_per_task /= sds->this_nr_running;
+                if (sds->busiest_load_per_task >
+                                sds->this_load_per_task)
+                        imbn = 1;
+        } else
+                sds->this_load_per_task =
+                        cpu_avg_load_per_task(this_cpu);
+        scaled_busy_load_per_task = sds->busiest_load_per_task
+                                                 * SCHED_LOAD_SCALE;
+        scaled_busy_load_per_task /= sds->busiest->cpu_power;
+        if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
+                        (scaled_busy_load_per_task * imbn)) {
+                *imbalance = sds->busiest_load_per_task;
+                return;
+        }
+        /*
+         * OK, we don't have enough imbalance to justify moving tasks,
+         * however we may be able to increase total CPU power used by
+         * moving them.
+         */
+        pwr_now += sds->busiest->cpu_power *
+                        min(sds->busiest_load_per_task, sds->max_load);
+        pwr_now += sds->this->cpu_power *
+                        min(sds->this_load_per_task, sds->this_load);
+        pwr_now /= SCHED_LOAD_SCALE;
+        /* Amount of load we'd subtract */
+        tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+                sds->busiest->cpu_power;
+        if (sds->max_load > tmp)
+                pwr_move += sds->busiest->cpu_power *
+                        min(sds->busiest_load_per_task, sds->max_load - tmp);
+        /* Amount of load we'd add */
+        if (sds->max_load * sds->busiest->cpu_power <
+                sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+                tmp = (sds->max_load * sds->busiest->cpu_power) /
+                        sds->this->cpu_power;
+        else
+                tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+                        sds->this->cpu_power;
+        pwr_move += sds->this->cpu_power *
+                        min(sds->this_load_per_task, sds->this_load + tmp);
+        pwr_move /= SCHED_LOAD_SCALE;
+        /* Move if we gain throughput */
+        if (pwr_move > pwr_now)
+                *imbalance = sds->busiest_load_per_task;
+}
+/**
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ *                       groups of a given sched_domain during load balance.
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: Cpu for which currently load balance is being performed.
+ * @imbalance: The variable to store the imbalance.
+ *
+ * Converts sds->busiest_load_per_task to a per-task value by dividing by
+ * sds->busiest_nr_running, so the caller must make sure that count is
+ * non-zero. Falls back to fix_small_imbalance() when the busiest group is
+ * below the domain average or when the computed imbalance is smaller than
+ * one task's load.
+ */
+static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
+                unsigned long *imbalance)
+{
+        unsigned long max_pull, load_above_capacity = ~0UL;
+        sds->busiest_load_per_task /= sds->busiest_nr_running;
+        if (sds->group_imb) {
+                sds->busiest_load_per_task =
+                        min(sds->busiest_load_per_task, sds->avg_load);
+        }
+        /*
+         * In the presence of smp nice balancing, certain scenarios can have
+         * max load less than avg load(as we skip the groups at or below
+         * its cpu_power, while calculating max_load..)
+         */
+        if (sds->max_load < sds->avg_load) {
+                *imbalance = 0;
+                /* void callee: call it, then return without a value */
+                fix_small_imbalance(sds, this_cpu, imbalance);
+                return;
+        }
+        if (!sds->group_imb) {
+                /*
+                 * Don't want to pull so many tasks that a group would go idle.
+                 */
+                load_above_capacity = (sds->busiest_nr_running -
+                                                sds->busiest_group_capacity);
+                load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
+                load_above_capacity /= sds->busiest->cpu_power;
+        }
+        /*
+         * We're trying to get all the cpus to the average_load, so we don't
+         * want to push ourselves above the average load, nor do we wish to
+         * reduce the max loaded cpu below the average load. At the same time,
+         * we also don't want to reduce the group load below the group capacity
+         * (so that we can implement power-savings policies etc). Thus we look
+         * for the minimum possible imbalance.
+         * Be careful of negative numbers as they'll appear as very large values
+         * with unsigned longs.
+         */
+        max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
+        /* How much load to actually move to equalise the imbalance */
+        *imbalance = min(max_pull * sds->busiest->cpu_power,
+                (sds->avg_load - sds->this_load) * sds->this->cpu_power)
+                        / SCHED_LOAD_SCALE;
+        /*
+         * if *imbalance is less than the average load per runnable task
+         * there is no guarantee that any tasks will be moved so we'll have
+         * a think about bumping its value to force at least one task to be
+         * moved
+         */
+        if (*imbalance < sds->busiest_load_per_task)
+                fix_small_imbalance(sds, this_cpu, imbalance);
+}
+/******* find_busiest_group() helpers end here *********************/
+/**
+ * find_busiest_group - Returns the busiest group within the sched_domain
+ * if there is an imbalance. If there isn't an imbalance, and
+ * the user has opted for power-savings, it returns a group whose
+ * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+ * such a group exists.
+ *
+ * Also calculates the amount of weighted load which should be moved
+ * to restore balance.
+ *
+ * @sd: The sched_domain whose busiest group is to be returned.
+ * @this_cpu: The cpu for which load balancing is currently being performed.
+ * @imbalance: Variable which stores amount of weighted load which should
+ *              be moved to restore balance/put a group to idle.
+ * @idle: The idle status of this_cpu.
+ * @sd_idle: The idleness of sd
+ * @cpus: The set of CPUs under consideration for load-balancing.
+ * @balance: Pointer to a variable indicating if this_cpu
+ *      is the appropriate cpu to perform load balancing at this_level.
+ *
+ * Returns:     - the busiest group if imbalance exists.
+ *              - If no imbalance and user has opted for power-savings balance,
+ *                 return the least loaded group whose CPUs can be
+ *                 put to idle by rebalancing its tasks onto our group.
+ *              - NULL, with *@imbalance set to 0, in every other case.
+ */
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+                   unsigned long *imbalance, enum cpu_idle_type idle,
+                   int *sd_idle, const struct cpumask *cpus, int *balance)
+{
+        struct sd_lb_stats sds;
+        memset(&sds, 0, sizeof(sds));
+        /*
+         * Compute the various statistics relevant for load balancing at
+         * this level.
+         */
+        update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
+                                        balance, &sds);
+        /* Cases where imbalance does not exist from POV of this_cpu */
+        /* 1) this_cpu is not the appropriate cpu to perform load balancing
+         *    at this level.
+         * 2) There is no busy sibling group to pull from.
+         * 3) This group is the busiest group.
+         * 4) This group is more busy than the avg busyness at this
+         *    sched_domain.
+         * 5) The imbalance is within the specified limit.
+         */
+        if (!(*balance))
+                goto ret;
+        if (!sds.busiest || sds.busiest_nr_running == 0)
+                goto out_balanced;
+        if (sds.this_load >= sds.max_load)
+                goto out_balanced;
+        sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+        if (sds.this_load >= sds.avg_load)
+                goto out_balanced;
+        if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+                goto out_balanced;
+        /* Looks like there is an imbalance. Compute it */
+        calculate_imbalance(&sds, this_cpu, imbalance);
+        return sds.busiest;
+out_balanced:
+        /*
+         * There is no obvious imbalance. But check if we can do some balancing
+         * to save power.
+         */
+        if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
+                return sds.busiest;
+ret:
+        *imbalance = 0;
+        return NULL;
+}
+/*
+ * find_busiest_queue - pick the most loaded runqueue among the cpus of
+ * group that are also set in cpus. Returns NULL when no cpu qualifies.
+ */
+static struct rq *
+find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
+                   unsigned long imbalance, const struct cpumask *cpus)
+{
+        struct rq *busiest = NULL;
+        unsigned long max_load = 0;
+        int cpu;
+        for_each_cpu(cpu, sched_group_cpus(group)) {
+                unsigned long power = power_of(cpu);
+                unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+                unsigned long raw_load, scaled_load;
+                struct rq *rq;
+                if (!cpumask_test_cpu(cpu, cpus))
+                        continue;
+                rq = cpu_rq(cpu);
+                raw_load = weighted_cpuload(cpu);
+                /*
+                 * The imbalance is expressed in unscaled load, so the
+                 * "single task heavier than the imbalance" test uses the
+                 * raw weighted_cpuload() value.
+                 */
+                if (capacity && rq->nr_running == 1 && raw_load > imbalance)
+                        continue;
+                /*
+                 * Cpus are ranked against each other by load scaled with
+                 * cpu power, so that load is moved away from a cpu that
+                 * is potentially running at a lower capacity.
+                 */
+                scaled_load = (raw_load * SCHED_LOAD_SCALE) / power;
+                if (scaled_load <= max_load)
+                        continue;
+                max_load = scaled_load;
+                busiest = rq;
+        }
+        return busiest;
+}
+/*
+ * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
+ * so long as it is large enough.
+ */
+#define MAX_PINNED_INTERVAL     512
+/* Working cpumask for load_balance and load_balance_newidle. */
+static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+/*
+ * need_active_balance - should the caller fall back to active balancing?
+ *
+ * For a newly idle cpu the answer is 0 unless the power-savings conditions
+ * checked below hold. Otherwise it is non-zero once sd->nr_balance_failed
+ * exceeds sd->cache_nice_tries + 2.
+ */
+static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+{
+        if (idle == CPU_NEWLY_IDLE) {
+                /*
+                 * The only task running in a non-idle cpu can be moved to this
+                 * cpu in an attempt to completely freeup the other CPU
+                 * package.
+                 *
+                 * The package power saving logic comes from
+                 * find_busiest_group(). If there are no imbalance, then
+                 * f_b_g() will return NULL. However when sched_mc={1,2} then
+                 * f_b_g() will select a group from which a running task may be
+                 * pulled to this cpu in order to make the other package idle.
+                 * If there is no opportunity to make a package idle and if
+                 * there are no imbalance, then f_b_g() will return NULL and no
+                 * action will be taken in load_balance_newidle().
+                 *
+                 * Under normal task pull operation due to imbalance, there
+                 * will be more than one task in the source run queue and
+                 * move_tasks() will succeed.  ld_moved will be true and this
+                 * active balance code will not be triggered.
+                 */
+                if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+                    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
+                        return 0;
+                if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
+                        return 0;
+        }
+        return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
+}
+/*
+ * Check this_cpu to ensure it is balanced within domain sd. Attempt to pull
+ * tasks from the busiest runqueue if there is an imbalance.
+ */
+static int load_balance(int this_cpu, struct rq *this_rq,
+                        struct sched_domain *sd, enum cpu_idle_type idle,
+                        int *balance)
+{
+        int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
+        struct sched_group *group;
+        unsigned long imbalance;
+        struct rq *busiest;
+        unsigned long flags;
+        struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
+        cpumask_copy(cpus, cpu_active_mask);
+        /*
+         * When power savings policy is enabled for the parent domain, idle
+         * sibling can pick up load irrespective of busy siblings. In this case,
+         * let the state of idle sibling percolate up as CPU_IDLE, instead of
+         * portraying it as CPU_NOT_IDLE.
+         */
+        if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
+                sd_idle = 1;
+        schedstat_inc(sd, lb_count[idle]);
+redo:
+        update_shares(sd);
+        group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+                                   cpus, balance);
+        if (*balance == 0)
+                goto out_balanced;
+        if (!group) {
+                schedstat_inc(sd, lb_nobusyg[idle]);
+                goto out_balanced;
+        }
+        busiest = find_busiest_queue(group, idle, imbalance, cpus);
+        if (!busiest) {
+                schedstat_inc(sd, lb_nobusyq[idle]);
+                goto out_balanced;
+        }
+        BUG_ON(busiest == this_rq);
+        schedstat_add(sd, lb_imbalance[idle], imbalance);
+        ld_moved = 0;
+        if (busiest->nr_running > 1) {
+                /*
+                 * Attempt to move tasks. If find_busiest_group has found
+                 * an imbalance but busiest->nr_running <= 1, the group is
+                 * still unbalanced. ld_moved simply stays zero, so it is
+                 * correctly treated as an imbalance.
+                 */
+                local_irq_save(flags);
+                double_rq_lock(this_rq, busiest);
+                ld_moved = move_tasks(this_rq, this_cpu, busiest,
+                                      imbalance, sd, idle, &all_pinned);
+                double_rq_unlock(this_rq, busiest);
+                local_irq_restore(flags);
+                /*
+                 * Some other cpu did the load balance for us.
+                 */
+                if (ld_moved && this_cpu != smp_processor_id())
+                        resched_cpu(this_cpu);
+                /* All tasks on this runqueue were pinned by CPU affinity */
+                if (unlikely(all_pinned)) {
+                        cpumask_clear_cpu(cpu_of(busiest), cpus);
+                        if (!cpumask_empty(cpus))
+                                goto redo;
+                        goto out_balanced;
+                }
+        }
+        if (!ld_moved) {
+                schedstat_inc(sd, lb_failed[idle]);
+                sd->nr_balance_failed++;
+                if (need_active_balance(sd, sd_idle, idle)) {
+                        raw_spin_lock_irqsave(&busiest->lock, flags);
+                        /* Don't kick the migration_thread if the curr
+                         * task on busiest cpu can't be moved to this_cpu.
+                         */
+                        if (!cpumask_test_cpu(this_cpu,
+                                              &busiest->curr->cpus_allowed)) {
+                                raw_spin_unlock_irqrestore(&busiest->lock,
+                                                            flags);
+                                all_pinned = 1;
+                                goto out_one_pinned;
+                        }
+                        if (!busiest->active_balance) {
+                                busiest->active_balance = 1;
+                                busiest->push_cpu = this_cpu;
+                                active_balance = 1;
+                        }
+                        raw_spin_unlock_irqrestore(&busiest->lock, flags);
+                        if (active_balance)
+                                wake_up_process(busiest->migration_thread);
+                        /*
+                         * We've kicked active balancing, reset the failure
+                         * counter.
+                         */
+                        sd->nr_balance_failed = sd->cache_nice_tries+1;
+                }
+        } else
+                sd->nr_balance_failed = 0;
+        if (likely(!active_balance)) {
+                /* We were unbalanced, so reset the balancing interval */
+                sd->balance_interval = sd->min_interval;
+        } else {
+                /*
+                 * If we've begun active balancing, start to back off. This
+                 * case may not be covered by the all_pinned logic if there
+                 * is only 1 task on the busy runqueue (because we don't call
+                 * move_tasks).
+                 */
+                if (sd->balance_interval < sd->max_interval)
+                        sd->balance_interval *= 2;
+        }
+        if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
+                ld_moved = -1;
+        goto out;
+out_balanced:
+        schedstat_inc(sd, lb_balanced[idle]);
+        sd->nr_balance_failed = 0;
+out_one_pinned:
+        /* Back off: double the balancing interval */
+        if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
+                        (sd->balance_interval < sd->max_interval))
+                sd->balance_interval *= 2;
+        if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
+                ld_moved = -1;
+        else
+                ld_moved = 0;
+out:
+        if (ld_moved)
+                update_shares(sd);
+        return ld_moved;
+}
+/*
+ * idle_balance is called by schedule() if this_cpu is about to become idle.
+ * Attempts to pull tasks from other CPUs, dropping this_rq->lock meanwhile.
+ */
+static void idle_balance(int this_cpu, struct rq *this_rq)
+{
+        struct sched_domain *sd;
+        int pulled_task = 0;
+        unsigned long next_balance = jiffies + HZ;
+        this_rq->idle_stamp = this_rq->clock;
+        if (this_rq->avg_idle < sysctl_sched_migration_cost)
+                return;
+        /*
+         * Drop the rq->lock, but keep IRQ/preempt disabled.
+         */
+        raw_spin_unlock(&this_rq->lock);
+        for_each_domain(this_cpu, sd) {
+                unsigned long interval;
+                int balance = 1;
+                if (!(sd->flags & SD_LOAD_BALANCE))
+                        continue;
+                if (sd->flags & SD_BALANCE_NEWIDLE) {
+                        /* If we've pulled tasks over, stop searching: */
+                        pulled_task = load_balance(this_cpu, this_rq,
+                                                   sd, CPU_NEWLY_IDLE, &balance);
+                }
+                interval = msecs_to_jiffies(sd->balance_interval);
+                if (time_after(next_balance, sd->last_balance + interval))
+                        next_balance = sd->last_balance + interval;
+                if (pulled_task) {
+                        this_rq->idle_stamp = 0;
+                        break;
+                }
+        }
+        raw_spin_lock(&this_rq->lock);
+        if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
+                /*
+                 * We are going idle. next_balance may be set based on
+                 * a busy processor. So reset next_balance.
+                 */
+                this_rq->next_balance = next_balance;
+        }
+}
+/*
+ * active_load_balance is run by migration threads. It pushes one task off
+ * the busiest CPU onto busiest_rq->push_cpu. It requires at least 1 task to
+ * be running on each physical CPU where possible, and avoids physical /
+ * logical imbalances.
+ *
+ * Called with busiest_rq locked.
+ */
+static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
+{
+        int target_cpu = busiest_rq->push_cpu;
+        struct sched_domain *sd;
+        struct rq *target_rq;
+        /* Is there any task to move? */
+        if (busiest_rq->nr_running <= 1)
+                return;
+        target_rq = cpu_rq(target_cpu);
+        /*
+         * This condition is "impossible"; if it occurs
+         * we need to fix it. Originally reported by
+         * Bjorn Helgaas on a 128-cpu setup.
+         */
+        BUG_ON(busiest_rq == target_rq);
+        /* move a task from busiest_rq to target_rq */
+        double_lock_balance(busiest_rq, target_rq);
+        update_rq_clock(busiest_rq);
+        update_rq_clock(target_rq);
+        /* Search for an sd spanning us and the target CPU. */
+        for_each_domain(target_cpu, sd) {
+                if ((sd->flags & SD_LOAD_BALANCE) &&
+                    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+                                break;
+        }
+        if (likely(sd)) {
+                schedstat_inc(sd, alb_count);
+                if (move_one_task(target_rq, target_cpu, busiest_rq,
+                                  sd, CPU_IDLE))
+                        schedstat_inc(sd, alb_pushed);
+                else
+                        schedstat_inc(sd, alb_failed);
+        }
+        double_unlock_balance(busiest_rq, target_rq);
+}
+#ifdef CONFIG_NO_HZ
+static struct {
+        atomic_t load_balancer;         /* cpu owning the ilb role, -1 if none */
+        cpumask_var_t cpu_mask;         /* cpus whose ticks are stopped */
+        cpumask_var_t ilb_grp_nohz_mask; /* scratch for is_semi_idle_group() */
+} nohz ____cacheline_aligned = {
+        .load_balancer = ATOMIC_INIT(-1),
+};
+int get_nohz_load_balancer(void)
+{
+        return atomic_read(&nohz.load_balancer);        /* -1: no ilb owner */
+}
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:        The cpu whose lowest level of sched domain is to
+ *              be returned.
+ * @flag:       The flag (SD_* bit mask) to look for in the
+ *              sched_domains of the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which has the given flag set.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+        struct sched_domain *sd;
+        for_each_domain(cpu, sd)
+                if (sd && (sd->flags & flag))
+                        break;
+        return sd;
+}
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:        The cpu whose domains we're iterating over.
+ * @sd:         struct sched_domain pointer used as the iteration
+ *              cursor.
+ * @flag:       The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates upwards from the lowest sched_domain of a given cpu that has 'flag'
+ * set, stopping at the first parent domain where the flag is clear.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+        for (sd = lowest_flag_domain(cpu, flag); \
+                (sd && (sd->flags & flag)); sd = sd->parent)
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group:  group to be checked for semi-idleness
+ *
+ * Returns:     1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has at least one idle-CPU
+ * and at least one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+        cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+                                        sched_group_cpus(ilb_group));
+        /*
+         * A sched_group is semi-idle when it has at least one busy cpu
+         * and at least one idle cpu.
+         */
+        if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+                return 0;
+        if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+                return 0;
+        return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:        The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:     The id of the idle load balancer if it exists,
+ *              else a value >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpus which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+        struct sched_domain *sd;
+        struct sched_group *ilb_group;
+        /*
+         * Select the idle load balancer from semi-idle packages only
+         * when power-aware load balancing is enabled.
+         */
+        if (!(sched_smt_power_savings || sched_mc_power_savings))
+                goto out_done;
+        /*
+         * Optimize for the case when we have no idle CPUs or only one
+         * idle CPU. Don't walk the sched_domain hierarchy in such cases.
+         */
+        if (cpumask_weight(nohz.cpu_mask) < 2)
+                goto out_done;
+        for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+                ilb_group = sd->groups;
+                do {
+                        if (is_semi_idle_group(ilb_group))
+                                return cpumask_first(nohz.ilb_grp_nohz_mask);
+                        ilb_group = ilb_group->next;
+                } while (ilb_group != sd->groups);
+        }
+out_done:
+        return cpumask_first(nohz.cpu_mask);
+}
+#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+        return cpumask_first(nohz.cpu_mask);    /* call_cpu is not used */
+}
+#endif
+/*
+ * This routine will try to nominate the ilb (idle load balancing)
+ * owner among the cpus whose ticks are stopped. ilb owner will do the idle
+ * load balancing on behalf of all those cpus. If all the cpus in the system
+ * go into this tickless mode, then there will be no ilb owner (as there is
+ * no need for one) and all the cpus will sleep till the next wakeup event
+ * arrives.
+ *
+ * For the ilb owner, tick is not stopped. And this tick will be used
+ * for idle load balancing. ilb owner will still be part of
+ * nohz.cpu_mask. Returns 1 if this cpu is the ilb owner, 0 otherwise.
+ *
+ * While stopping the tick, this cpu will become the ilb owner if there
+ * is no other owner. And will be the owner till that cpu becomes busy
+ * or if all cpus in the system stop their ticks at which point
+ * there is no need for ilb owner.
+ *
+ * When the ilb owner becomes busy, it nominates another owner, during the
+ * next busy scheduler_tick().
+ */
+int select_nohz_load_balancer(int stop_tick)
+{
+        int cpu = smp_processor_id();
+        if (stop_tick) {
+                cpu_rq(cpu)->in_nohz_recently = 1;
+                if (!cpu_active(cpu)) {
+                        if (atomic_read(&nohz.load_balancer) != cpu)
+                                return 0;
+                        /*
+                         * If we are going offline and still the leader,
+                         * give up!
+                         */
+                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+                                BUG();
+                        return 0;
+                }
+                cpumask_set_cpu(cpu, nohz.cpu_mask);
+                /* time for ilb owner also to sleep */
+                if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
+                        if (atomic_read(&nohz.load_balancer) == cpu)
+                                atomic_set(&nohz.load_balancer, -1);
+                        return 0;
+                }
+                if (atomic_read(&nohz.load_balancer) == -1) {
+                        /* make me the ilb owner */
+                        if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
+                                return 1;
+                } else if (atomic_read(&nohz.load_balancer) == cpu) {
+                        int new_ilb;
+                        if (!(sched_smt_power_savings ||
+                                                sched_mc_power_savings))
+                                return 1;
+                        /*
+                         * Check to see if there is a more power-efficient
+                         * ilb.
+                         */
+                        new_ilb = find_new_ilb(cpu);
+                        if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+                                atomic_set(&nohz.load_balancer, -1);
+                                resched_cpu(new_ilb);
+                                return 0;
+                        }
+                        return 1;
+                }
+        } else {
+                if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
+                        return 0;
+                cpumask_clear_cpu(cpu, nohz.cpu_mask);
+                if (atomic_read(&nohz.load_balancer) == cpu)
+                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+                                BUG();
+        }
+        return 0;
+}
+#endif
+static DEFINE_SPINLOCK(balancing); /* serializes SD_SERIALIZE domains */
+/*
+ * Checks each scheduling domain of cpu to see if it is due to be balanced,
+ * and initiates a balancing operation if so. May update rq->next_balance.
+ *
+ * Balancing parameters are set up in arch_init_sched_domains.
+ */
+static void rebalance_domains(int cpu, enum cpu_idle_type idle)
+{
+        int balance = 1;
+        struct rq *rq = cpu_rq(cpu);
+        unsigned long interval;
+        struct sched_domain *sd;
+        /* Earliest time when we have to do rebalance again */
+        unsigned long next_balance = jiffies + 60*HZ;
+        int update_next_balance = 0;
+        int need_serialize;
+        for_each_domain(cpu, sd) {
+                if (!(sd->flags & SD_LOAD_BALANCE))
+                        continue;
+                interval = sd->balance_interval;
+                if (idle != CPU_IDLE)
+                        interval *= sd->busy_factor;
+                /* scale ms to jiffies */
+                interval = msecs_to_jiffies(interval);
+                if (unlikely(!interval))
+                        interval = 1;
+                if (interval > HZ*NR_CPUS/10)
+                        interval = HZ*NR_CPUS/10;
+                need_serialize = sd->flags & SD_SERIALIZE;
+                if (need_serialize) {
+                        if (!spin_trylock(&balancing))
+                                goto out;
+                }
+                if (time_after_eq(jiffies, sd->last_balance + interval)) {
+                        if (load_balance(cpu, rq, sd, idle, &balance)) {
+                                /*
+                                 * We've pulled tasks over so either we're no
+                                 * longer idle, or one of our SMT siblings is
+                                 * not idle.
+                                 */
+                                idle = CPU_NOT_IDLE;
+                        }
+                        sd->last_balance = jiffies;
+                }
+                if (need_serialize)
+                        spin_unlock(&balancing);
+out:
+                if (time_after(next_balance, sd->last_balance + interval)) {
+                        next_balance = sd->last_balance + interval;
+                        update_next_balance = 1;
+                }
+                /*
+                 * Stop the load balance at this level. There is another
+                 * CPU in our sched group which is doing load balancing more
+                 * actively.
+                 */
+                if (!balance)
+                        break;
+        }
+        /*
+         * next_balance will be updated only when there is a need.
+         * For example, when the cpu is attached to the null domain, it
+         * will not be updated.
+         */
+        if (likely(update_next_balance))
+                rq->next_balance = next_balance;
+}
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * In the CONFIG_NO_HZ case, the idle load balance owner will also do the
+ * rebalancing for all the cpus whose scheduler ticks are stopped.
+ */
+static void run_rebalance_domains(struct softirq_action *h)
+{
+        int this_cpu = smp_processor_id();
+        struct rq *this_rq = cpu_rq(this_cpu);
+        enum cpu_idle_type idle = this_rq->idle_at_tick ?
+                                                CPU_IDLE : CPU_NOT_IDLE;
+        rebalance_domains(this_cpu, idle);
+#ifdef CONFIG_NO_HZ
+        /*
+         * If this cpu is the owner for idle load balancing, then do the
+         * balancing on behalf of the other idle cpus whose ticks are
+         * stopped.
+         */
+        if (this_rq->idle_at_tick &&
+            atomic_read(&nohz.load_balancer) == this_cpu) {
+                struct rq *rq;
+                int balance_cpu;
+                for_each_cpu(balance_cpu, nohz.cpu_mask) {
+                        if (balance_cpu == this_cpu)
+                                continue;
+                        /*
+                         * If this cpu gets work to do, stop the load balancing
+                         * work being done for other cpus. Next load
+                         * balancing owner will pick it up.
+                         */
+                        if (need_resched())
+                                break;
+                        rebalance_domains(balance_cpu, CPU_IDLE);
+                        rq = cpu_rq(balance_cpu);
+                        if (time_after(this_rq->next_balance, rq->next_balance))
+                                this_rq->next_balance = rq->next_balance;
+                }
+        }
+#endif
+}
+static inline int on_null_domain(int cpu)
+{
+        return !rcu_dereference_sched(cpu_rq(cpu)->sd); /* 1 if rq has no sd */
+}
+/*
+ * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+ *
+ * In case of CONFIG_NO_HZ, this is the place where we nominate a new
+ * idle load balancing owner or decide to stop the periodic load balancing,
+ * if the whole system is idle.
+ */
+static inline void trigger_load_balance(struct rq *rq, int cpu)
+{
+#ifdef CONFIG_NO_HZ
+        /*
+         * If we were in the nohz mode recently and busy at the current
+         * scheduler tick, then check if we need to nominate a new idle
+         * load balancer.
+         */
+        if (rq->in_nohz_recently && !rq->idle_at_tick) {
+                rq->in_nohz_recently = 0;
+                if (atomic_read(&nohz.load_balancer) == cpu) {
+                        cpumask_clear_cpu(cpu, nohz.cpu_mask);
+                        atomic_set(&nohz.load_balancer, -1);
+                }
+                if (atomic_read(&nohz.load_balancer) == -1) {
+                        int ilb = find_new_ilb(cpu);
+                        if (ilb < nr_cpu_ids)
+                                resched_cpu(ilb);
+                }
+        }
+        /*
+         * If this cpu is idle and doing idle load balancing for all the
+         * cpus with ticks stopped, is it time for that to stop?
+         */
+        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
+            cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
+                resched_cpu(cpu);
+                return;
+        }
+        /*
+         * If this cpu is idle and the idle load balancing is done by
+         * someone else, then there is no need to raise the SCHED_SOFTIRQ.
+         */
+        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
+            cpumask_test_cpu(cpu, nohz.cpu_mask))
+                return;
+#endif
+        /* Don't need to rebalance while attached to NULL domain */
+        if (time_after_eq(jiffies, rq->next_balance) &&
+            likely(!on_null_domain(cpu)))
+                raise_softirq(SCHED_SOFTIRQ);
+}
+static void rq_online_fair(struct rq *rq)
+{
+        update_sysctl();        /* rq is not used */
+}
+static void rq_offline_fair(struct rq *rq)
+{
+        update_sysctl();        /* rq is not used */
+}
+#else   /* CONFIG_SMP */
+/*
+ * On UP we do not need to balance between CPUs, so this is a no-op:
+ */
+static inline void idle_balance(int cpu, struct rq *rq)
+{
+}
 #endif /* CONFIG_SMP */
 /*
@@ -1867,28 +3570,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 }
 /*
- * Share the fairness runtime between parent and child, thus the
+ * called on fork with the child task as argument from the parent's context
- * total amount of pressure for CPU stays equal - new tasks
+ *  - child not yet on the tasklist
- * get a chance to run but frequent forkers are not allowed to
+ *  - preemption disabled
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
 */
-static void task_new_fair(struct rq *rq, struct task_struct *p)
+static void task_fork_fair(struct task_struct *p)
 {
-        struct cfs_rq *cfs_rq = task_cfs_rq(p);
+        struct cfs_rq *cfs_rq = task_cfs_rq(current);
        struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
        int this_cpu = smp_processor_id();
+        struct rq *rq = this_rq();
+        unsigned long flags;
-        sched_info_queued(p);
+        raw_spin_lock_irqsave(&rq->lock, flags);
+        if (unlikely(task_cpu(p) != this_cpu))
+                __set_task_cpu(p, this_cpu);
        update_curr(cfs_rq);
        if (curr)
                se->vruntime = curr->vruntime;
        place_entity(cfs_rq, se, 1);
-        /* 'curr' will be NULL if the child belongs to a different group */
+        if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
-        if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
-                        curr && entity_before(curr, se)) {
                /*
                 * Upon rescheduling, sched_class::put_prev_task() will place
                 * 'current' within the tree based on its new key value.
@@ -1897,7 +3602,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
                resched_task(rq->curr);
        }
-        enqueue_task_fair(rq, p, 0);
+        se->vruntime -= cfs_rq->min_vruntime;
+        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 /*
@@ -1950,30 +3657,27 @@ static void set_curr_task_fair(struct rq *rq)
 }
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p)
+static void moved_group_fair(struct task_struct *p, int on_rq)
 {
        struct cfs_rq *cfs_rq = task_cfs_rq(p);
        update_curr(cfs_rq);
-        place_entity(cfs_rq, &p->se, 1);
+        if (!on_rq)
+                place_entity(cfs_rq, &p->se, 1);
 }
 #endif
-unsigned int get_rr_interval_fair(struct task_struct *task)
+static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
 {
        struct sched_entity *se = &task->se;
-        unsigned long flags;
-        struct rq *rq;
        unsigned int rr_interval = 0;
        /*
         * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
         * idle runqueue:
         */
-        rq = task_rq_lock(task, &flags);
        if (rq->cfs.load.weight)
                rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
-        task_rq_unlock(rq, &flags);
        return rr_interval;
 }
@@ -1995,13 +3699,15 @@ static const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_fair,
-        .load_balance           = load_balance_fair,
+        .rq_online              = rq_online_fair,
-        .move_one_task          = move_one_task_fair,
+        .rq_offline             = rq_offline_fair,
+        .task_waking            = task_waking_fair,
 #endif
        .set_curr_task          = set_curr_task_fair,
        .task_tick              = task_tick_fair,
-        .task_new               = task_new_fair,
+        .task_fork              = task_fork_fair,
        .prio_changed           = prio_changed_fair,
        .switched_to            = switched_to_fair,
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 0d94083582c7..d5059fd761d9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -54,11 +54,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0)
 SCHED_FEAT(WAKEUP_OVERLAP, 0)
 /*
- * Wakeup preemption towards tasks that run short
- */
-SCHED_FEAT(WAKEUP_RUNNING, 0)
-/*
 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
 * the remote end is likely to consume the data we just wrote, and
 * therefore has cache benefit from being placed on the same cpu, see
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index b133a28fcde3..a8a6d8a50947 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -34,34 +34,16 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
 static void
 dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
 {
-        spin_unlock_irq(&rq->lock);
+        raw_spin_unlock_irq(&rq->lock);
        printk(KERN_ERR "bad: scheduling from the idle thread!\n");
        dump_stack();
-        spin_lock_irq(&rq->lock);
+        raw_spin_lock_irq(&rq->lock);
 }
 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
 {
 }
-#ifdef CONFIG_SMP
-static unsigned long
-load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                  unsigned long max_load_move,
-                  struct sched_domain *sd, enum cpu_idle_type idle,
-                  int *all_pinned, int *this_best_prio)
-{
-        return 0;
-}
-static int
-move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                   struct sched_domain *sd, enum cpu_idle_type idle)
-{
-        return 0;
-}
-#endif
 static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
 {
 }
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
                check_preempt_curr(rq, p, 0);
 }
-unsigned int get_rr_interval_idle(struct task_struct *task)
+static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
 {
        return 0;
 }
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
 #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_idle,
-        .load_balance           = load_balance_idle,
-        .move_one_task          = move_one_task_idle,
 #endif
        .set_curr_task          = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f622880e918f..c2fbb02c1b54 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
        return rt_se->my_q;
 }
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
 static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
+        int cpu = cpu_of(rq_of_rt_rq(rt_rq)); /* owner CPU: the period timer passes remote rt_rqs */
         struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
-        struct sched_rt_entity *rt_se = rt_rq->rt_se;
+        struct sched_rt_entity *rt_se;
+        rt_se = rt_rq->tg->rt_se[cpu]; /* same entity the old rt_rq->rt_se referred to */
         if (rt_rq->rt_nr_running) {
                 if (rt_se && !on_rt_rq(rt_se))
-                        enqueue_rt_entity(rt_se);
+                        enqueue_rt_entity(rt_se, false);
                 if (rt_rq->highest_prio.curr < curr->prio)
                         resched_task(curr);
         }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 {
-        struct sched_rt_entity *rt_se = rt_rq->rt_se;
+        int cpu = cpu_of(rq_of_rt_rq(rt_rq)); /* index by the rt_rq's own CPU, not the executing one */
+        struct sched_rt_entity *rt_se;
+        rt_se = rt_rq->tg->rt_se[cpu]; /* same entity the old rt_rq->rt_se referred to */
         if (rt_se && on_rt_rq(rt_se))
                 dequeue_rt_entity(rt_se);
@@ -327,7 +333,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
        weight = cpumask_weight(rd->span);
-        spin_lock(&rt_b->rt_runtime_lock);
+        raw_spin_lock(&rt_b->rt_runtime_lock);
        rt_period = ktime_to_ns(rt_b->rt_period);
        for_each_cpu(i, rd->span) {
                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
@@ -336,7 +342,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
                if (iter == rt_rq)
                        continue;
-                spin_lock(&iter->rt_runtime_lock);
+                raw_spin_lock(&iter->rt_runtime_lock);
                /*
                 * Either all rqs have inf runtime and there's nothing to steal
                 * or __disable_runtime() below sets a specific rq to inf to
@@ -358,14 +364,14 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
                        rt_rq->rt_runtime += diff;
                        more = 1;
                        if (rt_rq->rt_runtime == rt_period) {
-                                spin_unlock(&iter->rt_runtime_lock);
+                                raw_spin_unlock(&iter->rt_runtime_lock);
                                break;
                        }
                }
 next:
-                spin_unlock(&iter->rt_runtime_lock);
+                raw_spin_unlock(&iter->rt_runtime_lock);
        }
-        spin_unlock(&rt_b->rt_runtime_lock);
+        raw_spin_unlock(&rt_b->rt_runtime_lock);
        return more;
 }
@@ -386,8 +392,8 @@ static void __disable_runtime(struct rq *rq)
                s64 want;
                int i;
-                spin_lock(&rt_b->rt_runtime_lock);
+                raw_spin_lock(&rt_b->rt_runtime_lock);
-                spin_lock(&rt_rq->rt_runtime_lock);
+                raw_spin_lock(&rt_rq->rt_runtime_lock);
                /*
                 * Either we're all inf and nobody needs to borrow, or we're
                 * already disabled and thus have nothing to do, or we have
@@ -396,7 +402,7 @@ static void __disable_runtime(struct rq *rq)
                if (rt_rq->rt_runtime == RUNTIME_INF ||
                                rt_rq->rt_runtime == rt_b->rt_runtime)
                        goto balanced;
-                spin_unlock(&rt_rq->rt_runtime_lock);
+                raw_spin_unlock(&rt_rq->rt_runtime_lock);
                /*
                 * Calculate the difference between what we started out with
@@ -418,7 +424,7 @@ static void __disable_runtime(struct rq *rq)
                        if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
                                continue;
-                        spin_lock(&iter->rt_runtime_lock);
+                        raw_spin_lock(&iter->rt_runtime_lock);
                        if (want > 0) {
                                diff = min_t(s64, iter->rt_runtime, want);
                                iter->rt_runtime -= diff;
@@ -427,13 +433,13 @@ static void __disable_runtime(struct rq *rq)
                                iter->rt_runtime -= want;
                                want -= want;
                        }
-                        spin_unlock(&iter->rt_runtime_lock);
+                        raw_spin_unlock(&iter->rt_runtime_lock);
                        if (!want)
                                break;
                }
-                spin_lock(&rt_rq->rt_runtime_lock);
+                raw_spin_lock(&rt_rq->rt_runtime_lock);
                /*
                 * We cannot be left wanting - that would mean some runtime
                 * leaked out of the system.
@@ -445,8 +451,8 @@ balanced:
                 * runtime - in which case borrowing doesn't make sense.
                 */
                rt_rq->rt_runtime = RUNTIME_INF;
-                spin_unlock(&rt_rq->rt_runtime_lock);
+                raw_spin_unlock(&rt_rq->rt_runtime_lock);
-                spin_unlock(&rt_b->rt_runtime_lock);
+                raw_spin_unlock(&rt_b->rt_runtime_lock);
        }
 }
@@ -454,9 +460,9 @@ static void disable_runtime(struct rq *rq)
 {
        unsigned long flags;
-        spin_lock_irqsave(&rq->lock, flags);
+        raw_spin_lock_irqsave(&rq->lock, flags);
        __disable_runtime(rq);
-        spin_unlock_irqrestore(&rq->lock, flags);
+        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 static void __enable_runtime(struct rq *rq)
@@ -472,13 +478,13 @@ static void __enable_runtime(struct rq *rq)
        for_each_leaf_rt_rq(rt_rq, rq) {
                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
-                spin_lock(&rt_b->rt_runtime_lock);
+                raw_spin_lock(&rt_b->rt_runtime_lock);
-                spin_lock(&rt_rq->rt_runtime_lock);
+                raw_spin_lock(&rt_rq->rt_runtime_lock);
                rt_rq->rt_runtime = rt_b->rt_runtime;
                rt_rq->rt_time = 0;
                rt_rq->rt_throttled = 0;
-                spin_unlock(&rt_rq->rt_runtime_lock);
+                raw_spin_unlock(&rt_rq->rt_runtime_lock);
-                spin_unlock(&rt_b->rt_runtime_lock);
+                raw_spin_unlock(&rt_b->rt_runtime_lock);
        }
 }
@@ -486,9 +492,9 @@ static void enable_runtime(struct rq *rq)
 {
        unsigned long flags;
-        spin_lock_irqsave(&rq->lock, flags);
+        raw_spin_lock_irqsave(&rq->lock, flags);
        __enable_runtime(rq);
-        spin_unlock_irqrestore(&rq->lock, flags);
+        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 static int balance_runtime(struct rt_rq *rt_rq)
@@ -496,9 +502,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
        int more = 0;
        if (rt_rq->rt_time > rt_rq->rt_runtime) {
-                spin_unlock(&rt_rq->rt_runtime_lock);
+                raw_spin_unlock(&rt_rq->rt_runtime_lock);
                more = do_balance_runtime(rt_rq);
-                spin_lock(&rt_rq->rt_runtime_lock);
+                raw_spin_lock(&rt_rq->rt_runtime_lock);
        }
        return more;
@@ -524,11 +530,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
                struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
                struct rq *rq = rq_of_rt_rq(rt_rq);
-                spin_lock(&rq->lock);
+                raw_spin_lock(&rq->lock);
                if (rt_rq->rt_time) {
                        u64 runtime;
-                        spin_lock(&rt_rq->rt_runtime_lock);
+                        raw_spin_lock(&rt_rq->rt_runtime_lock);
                        if (rt_rq->rt_throttled)
                                balance_runtime(rt_rq);
                        runtime = rt_rq->rt_runtime;
@@ -539,13 +545,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
                        }
                        if (rt_rq->rt_time || rt_rq->rt_nr_running)
                                idle = 0;
-                        spin_unlock(&rt_rq->rt_runtime_lock);
+                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
                } else if (rt_rq->rt_nr_running)
                        idle = 0;
                if (enqueue)
                        sched_rt_rq_enqueue(rt_rq);
-                spin_unlock(&rq->lock);
+                raw_spin_unlock(&rq->lock);
        }
        return idle;
@@ -624,11 +630,11 @@ static void update_curr_rt(struct rq *rq)
                rt_rq = rt_rq_of_se(rt_se);
                if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
-                        spin_lock(&rt_rq->rt_runtime_lock);
+                        raw_spin_lock(&rt_rq->rt_runtime_lock);
                        rt_rq->rt_time += delta_exec;
                        if (sched_rt_runtime_exceeded(rt_rq))
                                resched_task(curr);
-                        spin_unlock(&rt_rq->rt_runtime_lock);
+                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
                }
        }
 }
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
        dec_rt_group(rt_se, rt_rq);
 }
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 {
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
                return;
-        list_add_tail(&rt_se->run_list, queue);
+        if (head)
+                list_add(&rt_se->run_list, queue);
+        else
+                list_add_tail(&rt_se->run_list, queue);
        __set_bit(rt_se_prio(rt_se), array->bitmap);
        inc_rt_tasks(rt_se, rt_rq);
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
        }
 }
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 {
        dequeue_rt_stack(rt_se);
        for_each_sched_rt_entity(rt_se)
-                __enqueue_rt_entity(rt_se);
+                __enqueue_rt_entity(rt_se, head);
 }
 static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
                struct rt_rq *rt_rq = group_rt_rq(rt_se);
                if (rt_rq && rt_rq->rt_nr_running)
-                        __enqueue_rt_entity(rt_se);
+                        __enqueue_rt_entity(rt_se, false);
        }
 }
 /*
 * Adding/removing a task to/from a priority array:
 */
-static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
+static void
+enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 {
        struct sched_rt_entity *rt_se = &p->rt;
        if (wakeup)
                rt_se->timeout = 0;
-        enqueue_rt_entity(rt_se);
+        enqueue_rt_entity(rt_se, head);
        if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
                enqueue_pushable_task(rq, p);
@@ -1136,7 +1146,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
                if (next && next->prio < idx)
                        continue;
                list_for_each_entry(rt_se, array->queue + idx, run_list) {
-                        struct task_struct *p = rt_task_of(rt_se);
+                        struct task_struct *p;
+                        if (!rt_entity_is_task(rt_se))
+                                continue;
+                        p = rt_task_of(rt_se);
                        if (pick_rt_task(rq, p, cpu)) {
                                next = p;
                                break;
@@ -1153,29 +1168,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
-static inline int pick_optimal_cpu(int this_cpu,
-                                   const struct cpumask *mask)
-{
-        int first;
-        /* "this_cpu" is cheaper to preempt than a remote processor */
-        if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
-                return this_cpu;
-        first = cpumask_first(mask);
-        if (first < nr_cpu_ids)
-                return first;
-        return -1;
-}
 static int find_lowest_rq(struct task_struct *task)
 {
        struct sched_domain *sd;
        struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
        int this_cpu = smp_processor_id();
        int cpu      = task_cpu(task);
-        cpumask_var_t domain_mask;
        if (task->rt.nr_cpus_allowed == 1)
                return -1; /* No other targets possible */
@@ -1198,28 +1196,26 @@ static int find_lowest_rq(struct task_struct *task)
         * Otherwise, we consult the sched_domains span maps to figure
         * out which cpu is logically closest to our hot cache data.
         */
-        if (this_cpu == cpu)
+        if (!cpumask_test_cpu(this_cpu, lowest_mask))
-                this_cpu = -1; /* Skip this_cpu opt if the same */
+                this_cpu = -1; /* Skip this_cpu opt if not among lowest */
-        if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
-                for_each_domain(cpu, sd) {
-                        if (sd->flags & SD_WAKE_AFFINE) {
-                                int best_cpu;
-                                cpumask_and(domain_mask,
-                                            sched_domain_span(sd),
-                                            lowest_mask);
-                                best_cpu = pick_optimal_cpu(this_cpu,
+        for_each_domain(cpu, sd) {
-                                                            domain_mask);
+                if (sd->flags & SD_WAKE_AFFINE) {
+                        int best_cpu;
-                                if (best_cpu != -1) {
+                        /*
-                                        free_cpumask_var(domain_mask);
+                         * "this_cpu" is cheaper to preempt than a
-                                        return best_cpu;
+                         * remote processor.
-                                }
+                         */
-                        }
+                        if (this_cpu != -1 &&
+                            cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
+                                return this_cpu;
+                        best_cpu = cpumask_first_and(lowest_mask,
+                                                     sched_domain_span(sd));
+                        if (best_cpu < nr_cpu_ids)
+                                return best_cpu;
                }
-                free_cpumask_var(domain_mask);
        }
        /*
@@ -1227,7 +1223,13 @@ static int find_lowest_rq(struct task_struct *task)
         * just give the caller *something* to work with from the compatible
         * locations.
         */
-        return pick_optimal_cpu(this_cpu, lowest_mask);
+        if (this_cpu != -1)
+                return this_cpu;
+        cpu = cpumask_any(lowest_mask);
+        if (cpu < nr_cpu_ids)
+                return cpu;
+        return -1;
 }
 /* Will lock the rq it finds */
@@ -1259,7 +1261,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
                                     task_running(rq, task) ||
                                     !task->se.on_rq)) {
-                                spin_unlock(&lowest_rq->lock);
+                                raw_spin_unlock(&lowest_rq->lock);
                                lowest_rq = NULL;
                                break;
                        }
@@ -1485,7 +1487,7 @@ static void post_schedule_rt(struct rq *rq)
 * If we are not running and we are not going to reschedule soon, we should
 * try to push tasks away now
 */
-static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
+static void task_woken_rt(struct rq *rq, struct task_struct *p)
 {
        if (!task_running(rq, p) &&
            !test_tsk_need_resched(rq->curr) &&
@@ -1494,24 +1496,6 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
                push_rt_tasks(rq);
 }
-static unsigned long
-load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                unsigned long max_load_move,
-                struct sched_domain *sd, enum cpu_idle_type idle,
-                int *all_pinned, int *this_best_prio)
-{
-        /* don't touch RT tasks */
-        return 0;
-}
-static int
-move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                 struct sched_domain *sd, enum cpu_idle_type idle)
-{
-        /* don't touch RT tasks */
-        return 0;
-}
 static void set_cpus_allowed_rt(struct task_struct *p,
                                const struct cpumask *new_mask)
 {
@@ -1683,8 +1667,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
        if (!p->signal)
                return;
-        soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
+        /* max may change after cur was read, this will be fixed next tick */
-        hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;
+        soft = task_rlimit(p, RLIMIT_RTTIME);
+        hard = task_rlimit_max(p, RLIMIT_RTTIME);
        if (soft != RLIM_INFINITY) {
                unsigned long next;
@@ -1734,7 +1719,7 @@ static void set_curr_task_rt(struct rq *rq)
        dequeue_pushable_task(rq, p);
 }
-unsigned int get_rr_interval_rt(struct task_struct *task)
+static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
 {
        /*
         * Time slice is 0 for SCHED_FIFO tasks
@@ -1759,14 +1744,12 @@ static const struct sched_class rt_sched_class = {
 #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_rt,
-        .load_balance           = load_balance_rt,
-        .move_one_task          = move_one_task_rt,
        .set_cpus_allowed       = set_cpus_allowed_rt,
        .rq_online              = rq_online_rt,
        .rq_offline             = rq_offline_rt,
        .pre_schedule           = pre_schedule_rt,
        .post_schedule          = post_schedule_rt,
-        .task_wake_up           = task_wake_up_rt,
+        .task_woken             = task_woken_rt,
        .switched_from          = switched_from_rt,
 #endif
diff --git a/kernel/signal.c b/kernel/signal.c
index 6705320784fd..dbd7fe073c55 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,12 +22,14 @@
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/signalfd.h>
+#include <linux/ratelimit.h>
 #include <linux/tracehook.h>
 #include <linux/capability.h>
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
 #include <linux/nsproxy.h>
-#include <trace/events/sched.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/signal.h>
 #include <asm/param.h>
 #include <asm/uaccess.h>
@@ -41,6 +43,8 @@
 static struct kmem_cache *sigqueue_cachep;
+int print_fatal_signals __read_mostly;
 static void __user *sig_handler(struct task_struct *t, int sig)
 {
        return t->sighand->action[sig - 1].sa.sa_handler;
@@ -155,62 +159,98 @@ void recalc_sigpending(void)
 /* Given the mask, find the first available signal that should be serviced. */
+#define SYNCHRONOUS_MASK \
+        (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
+         sigmask(SIGTRAP) | sigmask(SIGFPE))
 int next_signal(struct sigpending *pending, sigset_t *mask)
 {
        unsigned long i, *s, *m, x;
        int sig = 0;
-        
        s = pending->signal.sig;
        m = mask->sig;
+        /*
+         * Handle the first word specially: it contains the
+         * synchronous signals that need to be dequeued first.
+         */
+        x = *s &~ *m;
+        if (x) {
+                if (x & SYNCHRONOUS_MASK)
+                        x &= SYNCHRONOUS_MASK;
+                sig = ffz(~x) + 1;
+                return sig;
+        }
        switch (_NSIG_WORDS) {
        default:
-                for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m)
+                for (i = 1; i < _NSIG_WORDS; ++i) {
-                        if ((x = *s &~ *m) != 0) {
+                        x = *++s &~ *++m;
-                                sig = ffz(~x) + i*_NSIG_BPW + 1;
+                        if (!x)
-                                break;
+                                continue;
-                        }
+                        sig = ffz(~x) + i*_NSIG_BPW + 1;
+                        break;
+                }
                break;
-        case 2: if ((x = s[0] &~ m[0]) != 0)
+        case 2:
-                        sig = 1;
+                x = s[1] &~ m[1];
-                else if ((x = s[1] &~ m[1]) != 0)
+                if (!x)
-                        sig = _NSIG_BPW + 1;
-                else
                        break;
-                sig += ffz(~x);
+                sig = ffz(~x) + _NSIG_BPW + 1;
                break;
-        case 1: if ((x = *s &~ *m) != 0)
+        case 1:
-                        sig = ffz(~x) + 1;
+                /* Nothing to do */
                break;
        }
-        
        return sig;
 }
+static inline void print_dropped_signal(int sig)
+{
+        static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+        if (!print_fatal_signals)
+                return;
+        if (!__ratelimit(&ratelimit_state))
+                return;
+        printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
+                                current->comm, current->pid, sig);
+}
 /*
 * allocate a new signal queue record
 * - this may be called without locks if and only if t == current, otherwise an
 *   appopriate lock must be held to stop the target task from exiting
 */
-static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
+static struct sigqueue *
-                                         int override_rlimit)
+__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
 {
        struct sigqueue *q = NULL;
        struct user_struct *user;
        /*
-         * We won't get problems with the target's UID changing under us
+         * Protect access to @t credentials. This can go away when all
-         * because changing it requires RCU be used, and if t != current, the
+         * callers hold rcu read lock.
-         * caller must be holding the RCU readlock (by way of a spinlock) and
-         * we use RCU protection here
         */
+        rcu_read_lock();
        user = get_uid(__task_cred(t)->user);
        atomic_inc(&user->sigpending);
+        rcu_read_unlock();
        if (override_rlimit ||
            atomic_read(&user->sigpending) <=
-                        t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
+                        task_rlimit(t, RLIMIT_SIGPENDING)) {
                q = kmem_cache_alloc(sigqueue_cachep, flags);
+        } else {
+                print_dropped_signal(sig);
+        }
        if (unlikely(q == NULL)) {
                atomic_dec(&user->sigpending);
                free_uid(user);
@@ -400,7 +440,7 @@ still_pending:
                 */
                info->si_signo = sig;
                info->si_errno = 0;
-                info->si_code = 0;
+                info->si_code = SI_USER;
                info->si_pid = 0;
                info->si_uid = 0;
        }
@@ -584,6 +624,17 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
        return 1;
 }
+static inline int is_si_special(const struct siginfo *info)
+{
+        return info <= SEND_SIG_FORCED;
+}
+static inline bool si_fromuser(const struct siginfo *info)
+{
+        return info == SEND_SIG_NOINFO ||
+                (!is_si_special(info) && SI_FROMUSER(info));
+}
 /*
 * Bad permissions for sending the signal
 * - the caller must hold at least the RCU read lock
@@ -598,7 +649,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
        if (!valid_signal(sig))
                return -EINVAL;
-        if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info)))
+        if (!si_fromuser(info))
                return 0;
        error = audit_signal_info(sig, t); /* Let audit system see the signal */
@@ -834,7 +885,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
        struct sigqueue *q;
        int override_rlimit;
-        trace_sched_signal_send(sig, t);
+        trace_signal_generate(sig, info, t);
        assert_spin_locked(&t->sighand->siglock);
@@ -869,7 +920,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
        else
                override_rlimit = 0;
-        q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
+        q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
                override_rlimit);
        if (q) {
                list_add_tail(&q->list, &pending->list);
@@ -896,12 +947,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
                        break;
                }
        } else if (!is_si_special(info)) {
-                if (sig >= SIGRTMIN && info->si_code != SI_USER)
+                if (sig >= SIGRTMIN && info->si_code != SI_USER) {
-                /*
+                        /*
-                 * Queue overflow, abort.  We may abort if the signal was rt
+                         * Queue overflow, abort.  We may abort if the
-                 * and sent by user using something other than kill().
+                         * signal was rt and sent by user using something
-                 */
+                         * other than kill().
+                         */
+                        trace_signal_overflow_fail(sig, group, info);
                        return -EAGAIN;
+                } else {
+                        /*
+                         * This is a silent loss of information.  We still
+                         * send the signal, but the *info bits are lost.
+                         */
+                        trace_signal_lose_info(sig, group, info);
+                }
        }
 out_set:
@@ -917,16 +977,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
        int from_ancestor_ns = 0;
 #ifdef CONFIG_PID_NS
-        if (!is_si_special(info) && SI_FROMUSER(info) &&
+        from_ancestor_ns = si_fromuser(info) &&
-                        task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0)
+                           !task_pid_nr_ns(current, task_active_pid_ns(t));
-                from_ancestor_ns = 1;
 #endif
        return __send_signal(sig, info, t, group, from_ancestor_ns);
 }
-int print_fatal_signals;
 static void print_fatal_signal(struct pt_regs *regs, int signr)
 {
        printk("%s/%d: potentially unexpected fatal signal %d.\n",
@@ -939,7 +996,8 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
                for (i = 0; i < 16; i++) {
                        unsigned char insn;
-                        __get_user(insn, (unsigned char *)(regs->ip + i));
+                        if (get_user(insn, (unsigned char *)(regs->ip + i)))
+                                break;
                        printk("%02x ", insn);
                }
        }
@@ -1022,12 +1080,6 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
        return ret;
 }
-void
-force_sig_specific(int sig, struct task_struct *t)
-{
-        force_sig_info(sig, SEND_SIG_FORCED, t);
-}
 /*
 * Nuke all other threads in the group.
 */
@@ -1145,19 +1197,19 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
        int ret = -EINVAL;
        struct task_struct *p;
        const struct cred *pcred;
+        unsigned long flags;
        if (!valid_signal(sig))
                return ret;
-        read_lock(&tasklist_lock);
+        rcu_read_lock();
        p = pid_task(pid, PIDTYPE_PID);
        if (!p) {
                ret = -ESRCH;
                goto out_unlock;
        }
        pcred = __task_cred(p);
-        if ((info == SEND_SIG_NOINFO ||
+        if (si_fromuser(info) &&
-             (!is_si_special(info) && SI_FROMUSER(info))) &&
            euid != pcred->suid && euid != pcred->uid &&
            uid  != pcred->suid && uid  != pcred->uid) {
                ret = -EPERM;
@@ -1166,14 +1218,16 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
        ret = security_task_kill(p, info, sig, secid);
        if (ret)
                goto out_unlock;
-        if (sig && p->sighand) {
-                unsigned long flags;
+        if (sig) {
-                spin_lock_irqsave(&p->sighand->siglock, flags);
+                if (lock_task_sighand(p, &flags)) {
-                ret = __send_signal(sig, info, p, 1, 0);
+                        ret = __send_signal(sig, info, p, 1, 0);
-                spin_unlock_irqrestore(&p->sighand->siglock, flags);
+                        unlock_task_sighand(p, &flags);
+                } else
+                        ret = -ESRCH;
        }
 out_unlock:
-        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        return ret;
 }
 EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
@@ -1293,19 +1347,19 @@ EXPORT_SYMBOL(kill_pid);
 * These functions support sending signals using preallocated sigqueue
 * structures.  This is needed "because realtime applications cannot
 * afford to lose notifications of asynchronous events, like timer
- * expirations or I/O completions".  In the case of Posix Timers 
+ * expirations or I/O completions".  In the case of Posix Timers
 * we allocate the sigqueue structure from the timer_create.  If this
 * allocation fails we are able to report the failure to the application
 * with an EAGAIN error.
 */
- 
 struct sigqueue *sigqueue_alloc(void)
 {
-        struct sigqueue *q;
+        struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
-        if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0)))
+        if (q)
                q->flags |= SIGQUEUE_PREALLOC;
-        return(q);
+        return q;
 }
 void sigqueue_free(struct sigqueue *q)
@@ -1807,11 +1861,6 @@ relock:
        for (;;) {
                struct k_sigaction *ka;
-                if (unlikely(signal->group_stop_count > 0) &&
-                    do_signal_stop(0))
-                        goto relock;
                /*
                 * Tracing can induce an artifical signal and choose sigaction.
                 * The return value in @signr determines the default action,
@@ -1823,6 +1872,10 @@ relock:
                if (unlikely(signr != 0))
                        ka = return_ka;
                else {
+                        if (unlikely(signal->group_stop_count > 0) &&
+                            do_signal_stop(0))
+                                goto relock;
                        signr = dequeue_signal(current, &current->blocked,
                                               info);
@@ -1839,6 +1892,9 @@ relock:
                        ka = &sighand->action[signr-1];
                }
+                /* Trace actually delivered signals. */
+                trace_signal_deliver(signr, info, ka);
                if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
                        continue;
                if (ka->sa.sa_handler != SIG_DFL) {
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 00889bd3c590..7d3f4fa9ef4f 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -49,7 +49,6 @@ static const int slow_work_max_vslow = 99;
 ctl_table slow_work_sysctls[] = {
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "min-threads",
                .data           = &slow_work_min_threads,
                .maxlen         = sizeof(unsigned),
@@ -59,7 +58,6 @@ ctl_table slow_work_sysctls[] = {
                .extra2         = &slow_work_max_threads,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "max-threads",
                .data           = &slow_work_max_threads,
                .maxlen         = sizeof(unsigned),
@@ -69,16 +67,15 @@ ctl_table slow_work_sysctls[] = {
                .extra2         = (void *) &slow_work_max_max_threads,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "vslow-percentage",
                .data           = &vslow_work_proportion,
                .maxlen         = sizeof(unsigned),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
                .extra1         = (void *) &slow_work_min_vslow,
                .extra2         = (void *) &slow_work_max_vslow,
        },
-        { .ctl_name = 0 }
+        {}
 };
 #endif
@@ -640,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
                        goto cancelled;
                /* the timer holds a reference whilst it is pending */
-                ret = work->ops->get_ref(work);
+                ret = slow_work_get_ref(work);
                if (ret < 0)
                        goto cant_get_ref;
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
index 321f3c59d732..a29ebd1ef41d 100644
--- a/kernel/slow-work.h
+++ b/kernel/slow-work.h
@@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
 */
 static inline void slow_work_set_thread_pid(int id, pid_t pid)
 {
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
        slow_work_pids[id] = pid;
 #endif
 }
 static inline void slow_work_mark_time(struct slow_work *work)
 {
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
        work->mark = CURRENT_TIME;
 #endif
 }
 static inline void slow_work_begin_exec(int id, struct slow_work *work)
 {
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
        slow_work_execs[id] = work;
 #endif
 }
 static inline void slow_work_end_exec(int id, struct slow_work *work)
 {
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
        write_lock(&slow_work_execs_lock);
        slow_work_execs[id] = NULL;
        write_unlock(&slow_work_execs_lock);
diff --git a/kernel/smp.c b/kernel/smp.c
index c9d1c7835c2f..3fc697336183 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -9,18 +9,17 @@
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/init.h>
+#include <linux/gfp.h>
 #include <linux/smp.h>
 #include <linux/cpu.h>
-static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
 static struct {
        struct list_head        queue;
-        spinlock_t              lock;
+        raw_spinlock_t          lock;
 } call_function __cacheline_aligned_in_smp =
        {
                .queue          = LIST_HEAD_INIT(call_function.queue),
-                .lock           = __SPIN_LOCK_UNLOCKED(call_function.lock),
+                .lock           = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
        };
 enum {
@@ -33,12 +32,14 @@ struct call_function_data {
        cpumask_var_t           cpumask;
 };
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
 struct call_single_queue {
        struct list_head        list;
-        spinlock_t              lock;
+        raw_spinlock_t          lock;
 };
-static DEFINE_PER_CPU(struct call_function_data, cfd_data);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
 static int
 hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -80,7 +81,7 @@ static int __cpuinit init_call_single_data(void)
        for_each_possible_cpu(i) {
                struct call_single_queue *q = &per_cpu(call_single_queue, i);
-                spin_lock_init(&q->lock);
+                raw_spin_lock_init(&q->lock);
                INIT_LIST_HEAD(&q->list);
        }
@@ -141,10 +142,10 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
        unsigned long flags;
        int ipi;
-        spin_lock_irqsave(&dst->lock, flags);
+        raw_spin_lock_irqsave(&dst->lock, flags);
        ipi = list_empty(&dst->list);
        list_add_tail(&data->list, &dst->list);
-        spin_unlock_irqrestore(&dst->lock, flags);
+        raw_spin_unlock_irqrestore(&dst->lock, flags);
        /*
         * The list addition should be visible before sending the IPI
@@ -171,7 +172,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
 void generic_smp_call_function_interrupt(void)
 {
        struct call_function_data *data;
-        int cpu = get_cpu();
+        int cpu = smp_processor_id();
        /*
         * Shouldn't receive this interrupt on a cpu that is not yet online.
@@ -201,9 +202,9 @@ void generic_smp_call_function_interrupt(void)
                refs = atomic_dec_return(&data->refs);
                WARN_ON(refs < 0);
                if (!refs) {
-                        spin_lock(&call_function.lock);
+                        raw_spin_lock(&call_function.lock);
                        list_del_rcu(&data->csd.list);
-                        spin_unlock(&call_function.lock);
+                        raw_spin_unlock(&call_function.lock);
                }
                if (refs)
@@ -212,7 +213,6 @@ void generic_smp_call_function_interrupt(void)
                csd_unlock(&data->csd);
        }
-        put_cpu();
 }
 /*
@@ -230,9 +230,9 @@ void generic_smp_call_function_single_interrupt(void)
         */
        WARN_ON_ONCE(!cpu_online(smp_processor_id()));
-        spin_lock(&q->lock);
+        raw_spin_lock(&q->lock);
        list_replace_init(&q->list, &list);
-        spin_unlock(&q->lock);
+        raw_spin_unlock(&q->lock);
        while (!list_empty(&list)) {
                struct call_single_data *data;
@@ -257,7 +257,7 @@ void generic_smp_call_function_single_interrupt(void)
        }
 }
-static DEFINE_PER_CPU(struct call_single_data, csd_data);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
 /*
 * smp_call_function_single - Run a function on a specific CPU
@@ -265,9 +265,7 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait until function has completed on other CPUs.
 *
- * Returns 0 on success, else a negative status code. Note that @wait
+ * Returns 0 on success, else a negative status code.
- * will be implicitly turned on in case of allocation failures, since
- * we fall back to on-stack allocation.
 */
 int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
                             int wait)
@@ -321,6 +319,51 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 }
 EXPORT_SYMBOL(smp_call_function_single);
+/*
+ * smp_call_function_any - Run a function on any of the given cpus
+ * @mask: The mask of cpus it can run on.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait until function has completed.
+ *
+ * Returns 0 on success, else a negative status code (if no cpus were online).
+ * The call itself is made through smp_call_function_single() on the
+ * selected cpu, between get_cpu() and put_cpu().
+ *
+ * Selection preference:
+ *      1) current cpu if in @mask
+ *      2) any cpu of current node if in @mask
+ *      3) any other online cpu in @mask
+ */
+int smp_call_function_any(const struct cpumask *mask,
+                          void (*func)(void *info), void *info, int wait)
+{
+        unsigned int cpu;
+        const struct cpumask *nodemask;
+        int ret;
+        /* Try for same CPU (cheapest) */
+        cpu = get_cpu();
+        if (cpumask_test_cpu(cpu, mask))
+                goto call;
+        /* Try for same node. */
+        nodemask = cpumask_of_node(cpu_to_node(cpu));
+        for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
+             cpu = cpumask_next_and(cpu, nodemask, mask)) {
+                if (cpu_online(cpu))
+                        goto call;
+        }
+        /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
+        cpu = cpumask_any_and(mask, cpu_online_mask);
+call:
+        ret = smp_call_function_single(cpu, func, info, wait);
+        put_cpu();
+        return ret;
+}
+EXPORT_SYMBOL_GPL(smp_call_function_any);
 /**
 * __smp_call_function_single(): Run a function on another CPU
 * @cpu: The CPU to run on.
@@ -355,9 +398,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
 * @wait: If true, wait (atomically) until function has completed
 *        on other CPUs.
 *
- * If @wait is true, then returns once @func has returned. Note that @wait
+ * If @wait is true, then returns once @func has returned.
- * will be implicitly turned on in case of allocation failures, since
- * we fall back to on-stack allocation.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler. Preemption
@@ -408,14 +449,14 @@ void smp_call_function_many(const struct cpumask *mask,
        cpumask_clear_cpu(this_cpu, data->cpumask);
        atomic_set(&data->refs, cpumask_weight(data->cpumask));
-        spin_lock_irqsave(&call_function.lock, flags);
+        raw_spin_lock_irqsave(&call_function.lock, flags);
        /*
         * Place entry at the _HEAD_ of the list, so that any cpu still
         * observing the entry in generic_smp_call_function_interrupt()
         * will not miss any other list entries:
         */
        list_add_rcu(&data->csd.list, &call_function.queue);
-        spin_unlock_irqrestore(&call_function.lock, flags);
+        raw_spin_unlock_irqrestore(&call_function.lock, flags);
        /*
         * Make the list addition visible before sending the ipi.
@@ -443,8 +484,7 @@ EXPORT_SYMBOL(smp_call_function_many);
 * Returns 0.
 *
 * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func. In case of allocation
+ * it returns just before the target cpu calls @func.
- * failure, @wait will be implicitly turned on.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler.
@@ -461,20 +501,20 @@ EXPORT_SYMBOL(smp_call_function);
 void ipi_call_lock(void)
 {
-        spin_lock(&call_function.lock);
+        raw_spin_lock(&call_function.lock);
 }
 void ipi_call_unlock(void)
 {
-        spin_unlock(&call_function.lock);
+        raw_spin_unlock(&call_function.lock);
 }
 void ipi_call_lock_irq(void)
 {
-        spin_lock_irq(&call_function.lock);
+        raw_spin_lock_irq(&call_function.lock);
 }
 void ipi_call_unlock_irq(void)
 {
-        spin_unlock_irq(&call_function.lock);
+        raw_spin_unlock_irq(&call_function.lock);
 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f8749e5216e0..7c1a67ef0274 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -302,9 +302,9 @@ void irq_exit(void)
        if (!in_interrupt() && local_softirq_pending())
                invoke_softirq();
+        rcu_irq_exit();
 #ifdef CONFIG_NO_HZ
        /* Make sure that timer wheel updates are propagated */
-        rcu_irq_exit();
        if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
                tick_nohz_stop_sched_tick(0);
 #endif
@@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill);
 */
 /*
- * The trampoline is called when the hrtimer expires. If this is
+ * The trampoline is called when the hrtimer expires. It schedules a tasklet
- * called from the hrtimer interrupt then we schedule the tasklet as
+ * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
- * the timer callback function expects to run in softirq context. If
+ * hrtimer callback, but from softirq context.
- * it's called in softirq context anyway (i.e. high resolution timers
- * disabled) then the hrtimer callback is called right away.
 */
 static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
 {
        struct tasklet_hrtimer *ttimer =
                container_of(timer, struct tasklet_hrtimer, timer);
-        if (hrtimer_is_hres_active(timer)) {
+        tasklet_hi_schedule(&ttimer->tasklet);
-                tasklet_hi_schedule(&ttimer->tasklet);
+        return HRTIMER_NORESTART;
-                return HRTIMER_NORESTART;
-        }
-        return ttimer->function(timer);
 }
 /*
@@ -697,7 +692,7 @@ void __init softirq_init(void)
        open_softirq(HI_SOFTIRQ, tasklet_hi_action);
 }
-static int ksoftirqd(void * __bind_cpu)
+static int run_ksoftirqd(void * __bind_cpu)
 {
        set_current_state(TASK_INTERRUPTIBLE);
@@ -810,7 +805,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
-                p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
+                p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
                if (IS_ERR(p)) {
                        printk("ksoftirqd for %i failed\n", hotcpu);
                        return NOTIFY_BAD;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 81324d12eb35..4b493f67dcb5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -22,9 +22,10 @@
 static DEFINE_SPINLOCK(print_lock);
-static DEFINE_PER_CPU(unsigned long, touch_timestamp);
+static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
-static DEFINE_PER_CPU(unsigned long, print_timestamp);
+static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
-static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
+static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
+static DEFINE_PER_CPU(bool, softlock_touch_sync);
 static int __read_mostly did_panic;
 int __read_mostly softlockup_thresh = 60;
@@ -70,22 +71,28 @@ static void __touch_softlockup_watchdog(void)
 {
        int this_cpu = raw_smp_processor_id();
-        __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu);
+        __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
 }
 void touch_softlockup_watchdog(void)
 {
-        __raw_get_cpu_var(touch_timestamp) = 0;
+        __raw_get_cpu_var(softlockup_touch_ts) = 0;
 }
 EXPORT_SYMBOL(touch_softlockup_watchdog);
+void touch_softlockup_watchdog_sync(void)
+{
+        __raw_get_cpu_var(softlock_touch_sync) = true;
+        __raw_get_cpu_var(softlockup_touch_ts) = 0;
+}
 void touch_all_softlockup_watchdogs(void)
 {
        int cpu;
        /* Cause each CPU to re-update its timestamp rather than complain */
        for_each_online_cpu(cpu)
-                per_cpu(touch_timestamp, cpu) = 0;
+                per_cpu(softlockup_touch_ts, cpu) = 0;
 }
 EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
@@ -104,28 +111,36 @@ int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
 void softlockup_tick(void)
 {
        int this_cpu = smp_processor_id();
-        unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
+        unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
-        unsigned long print_timestamp;
+        unsigned long print_ts;
        struct pt_regs *regs = get_irq_regs();
        unsigned long now;
        /* Is detection switched off? */
-        if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) {
+        if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
                /* Be sure we don't false trigger if switched back on */
-                if (touch_timestamp)
+                if (touch_ts)
-                        per_cpu(touch_timestamp, this_cpu) = 0;
+                        per_cpu(softlockup_touch_ts, this_cpu) = 0;
                return;
        }
-        if (touch_timestamp == 0) {
+        if (touch_ts == 0) {
+                if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
+                        /*
+                         * If the time stamp was touched atomically
+                         * make sure the scheduler tick is up to date.
+                         */
+                        per_cpu(softlock_touch_sync, this_cpu) = false;
+                        sched_clock_tick();
+                }
                __touch_softlockup_watchdog();
                return;
        }
-        print_timestamp = per_cpu(print_timestamp, this_cpu);
+        print_ts = per_cpu(softlockup_print_ts, this_cpu);
        /* report at most once a second */
-        if (print_timestamp == touch_timestamp || did_panic)
+        if (print_ts == touch_ts || did_panic)
                return;
        /* do not print during early bootup: */
@@ -140,18 +155,18 @@ void softlockup_tick(void)
         * Wake up the high-prio watchdog task twice per
         * threshold timespan.
         */
-        if (now > touch_timestamp + softlockup_thresh/2)
+        if (time_after(now - softlockup_thresh/2, touch_ts))
-                wake_up_process(per_cpu(watchdog_task, this_cpu));
+                wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
        /* Warn about unreasonable delays: */
-        if (now <= (touch_timestamp + softlockup_thresh))
+        if (time_before_eq(now - softlockup_thresh, touch_ts))
                return;
-        per_cpu(print_timestamp, this_cpu) = touch_timestamp;
+        per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
        spin_lock(&print_lock);
        printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
-                        this_cpu, now - touch_timestamp,
+                        this_cpu, now - touch_ts,
                        current->comm, task_pid_nr(current));
        print_modules();
        print_irqtrace_events(current);
@@ -209,32 +224,32 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
-                BUG_ON(per_cpu(watchdog_task, hotcpu));
+                BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
                p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
                if (IS_ERR(p)) {
                        printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
                        return NOTIFY_BAD;
                }
-                per_cpu(touch_timestamp, hotcpu) = 0;
+                per_cpu(softlockup_touch_ts, hotcpu) = 0;
-                per_cpu(watchdog_task, hotcpu) = p;
+                per_cpu(softlockup_watchdog, hotcpu) = p;
                kthread_bind(p, hotcpu);
                break;
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
-                wake_up_process(per_cpu(watchdog_task, hotcpu));
+                wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
                break;
 #ifdef CONFIG_HOTPLUG_CPU
        case CPU_UP_CANCELED:
        case CPU_UP_CANCELED_FROZEN:
-                if (!per_cpu(watchdog_task, hotcpu))
+                if (!per_cpu(softlockup_watchdog, hotcpu))
                        break;
                /* Unbind so it can run.  Fall thru. */
-                kthread_bind(per_cpu(watchdog_task, hotcpu),
+                kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
                             cpumask_any(cpu_online_mask));
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
-                p = per_cpu(watchdog_task, hotcpu);
+                p = per_cpu(softlockup_watchdog, hotcpu);
-                per_cpu(watchdog_task, hotcpu) = NULL;
+                per_cpu(softlockup_watchdog, hotcpu) = NULL;
                kthread_stop(p);
                break;
 #endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 5ddab730cb2f..be6517fb9c14 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,193 +21,72 @@
 #include <linux/debug_locks.h>
 #include <linux/module.h>
-#ifndef _spin_trylock
-int __lockfunc _spin_trylock(spinlock_t *lock)
-{
-        return __spin_trylock(lock);
-}
-EXPORT_SYMBOL(_spin_trylock);
-#endif
-#ifndef _read_trylock
-int __lockfunc _read_trylock(rwlock_t *lock)
-{
-        return __read_trylock(lock);
-}
-EXPORT_SYMBOL(_read_trylock);
-#endif
-#ifndef _write_trylock
-int __lockfunc _write_trylock(rwlock_t *lock)
-{
-        return __write_trylock(lock);
-}
-EXPORT_SYMBOL(_write_trylock);
-#endif
 /*
 * If lockdep is enabled then we use the non-preemption spin-ops
 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
 */
 #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
-#ifndef _read_lock
-void __lockfunc _read_lock(rwlock_t *lock)
-{
-        __read_lock(lock);
-}
-EXPORT_SYMBOL(_read_lock);
-#endif
-#ifndef _spin_lock_irqsave
-unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
-{
-        return __spin_lock_irqsave(lock);
-}
-EXPORT_SYMBOL(_spin_lock_irqsave);
-#endif
-#ifndef _spin_lock_irq
-void __lockfunc _spin_lock_irq(spinlock_t *lock)
-{
-        __spin_lock_irq(lock);
-}
-EXPORT_SYMBOL(_spin_lock_irq);
-#endif
-#ifndef _spin_lock_bh
-void __lockfunc _spin_lock_bh(spinlock_t *lock)
-{
-        __spin_lock_bh(lock);
-}
-EXPORT_SYMBOL(_spin_lock_bh);
-#endif
-#ifndef _read_lock_irqsave
-unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
-{
-        return __read_lock_irqsave(lock);
-}
-EXPORT_SYMBOL(_read_lock_irqsave);
-#endif
-#ifndef _read_lock_irq
-void __lockfunc _read_lock_irq(rwlock_t *lock)
-{
-        __read_lock_irq(lock);
-}
-EXPORT_SYMBOL(_read_lock_irq);
-#endif
-#ifndef _read_lock_bh
-void __lockfunc _read_lock_bh(rwlock_t *lock)
-{
-        __read_lock_bh(lock);
-}
-EXPORT_SYMBOL(_read_lock_bh);
-#endif
-#ifndef _write_lock_irqsave
-unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
-{
-        return __write_lock_irqsave(lock);
-}
-EXPORT_SYMBOL(_write_lock_irqsave);
-#endif
-#ifndef _write_lock_irq
-void __lockfunc _write_lock_irq(rwlock_t *lock)
-{
-        __write_lock_irq(lock);
-}
-EXPORT_SYMBOL(_write_lock_irq);
-#endif
-#ifndef _write_lock_bh
-void __lockfunc _write_lock_bh(rwlock_t *lock)
-{
-        __write_lock_bh(lock);
-}
-EXPORT_SYMBOL(_write_lock_bh);
-#endif
-#ifndef _spin_lock
-void __lockfunc _spin_lock(spinlock_t *lock)
-{
-        __spin_lock(lock);
-}
-EXPORT_SYMBOL(_spin_lock);
-#endif
-#ifndef _write_lock
-void __lockfunc _write_lock(rwlock_t *lock)
-{
-        __write_lock(lock);
-}
-EXPORT_SYMBOL(_write_lock);
-#endif
-#else /* CONFIG_PREEMPT: */
 /*
+ * The __lock_function inlines are taken from
+ * include/linux/spinlock_api_smp.h
+ */
+#else
+#define raw_read_can_lock(l)    read_can_lock(l)
+#define raw_write_can_lock(l)   write_can_lock(l)
+/*
+ * We build the __lock_function inlines here. They are too large for
+ * inlining all over the place, but here there is only one user per
+ * function, which embeds them into the calling _lock_function below.
+ *
 * This could be a long-held lock. We both prepare to spin for a long
 * time (making _this_ CPU preemptable if possible), and we also signal
 * towards that other CPU that it should break the lock ASAP.
- *
- * (We do this in a function because inlining it would be excessive.)
 */
 #define BUILD_LOCK_OPS(op, locktype)                                    \
-void __lockfunc _##op##_lock(locktype##_t *lock)                        \
+void __lockfunc __raw_##op##_lock(locktype##_t *lock)                   \
 {                                                                       \
        for (;;) {                                                      \
                preempt_disable();                                      \
-                if (likely(_raw_##op##_trylock(lock)))                  \
+                if (likely(do_raw_##op##_trylock(lock)))                \
                        break;                                          \
                preempt_enable();                                       \
                                                                        \
                if (!(lock)->break_lock)                                \
                        (lock)->break_lock = 1;                         \
-                while (!op##_can_lock(lock) && (lock)->break_lock)      \
+                while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
-                        _raw_##op##_relax(&lock->raw_lock);             \
+                        arch_##op##_relax(&lock->raw_lock);             \
        }                                                               \
        (lock)->break_lock = 0;                                         \
 }                                                                       \
                                                                        \
-EXPORT_SYMBOL(_##op##_lock);                                            \
+unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock)  \
-                                                                        \
-unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock)       \
 {                                                                       \
        unsigned long flags;                                            \
                                                                        \
        for (;;) {                                                      \
                preempt_disable();                                      \
                local_irq_save(flags);                                  \
-                if (likely(_raw_##op##_trylock(lock)))                  \
+                if (likely(do_raw_##op##_trylock(lock)))                \
                        break;                                          \
                local_irq_restore(flags);                               \
                preempt_enable();                                       \
                                                                        \
                if (!(lock)->break_lock)                                \
                        (lock)->break_lock = 1;                         \
-                while (!op##_can_lock(lock) && (lock)->break_lock)      \
+                while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
-                        _raw_##op##_relax(&lock->raw_lock);             \
+                        arch_##op##_relax(&lock->raw_lock);             \
        }                                                               \
        (lock)->break_lock = 0;                                         \
        return flags;                                                   \
 }                                                                       \
                                                                        \
-EXPORT_SYMBOL(_##op##_lock_irqsave);                                    \
+void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock)               \
-                                                                        \
-void __lockfunc _##op##_lock_irq(locktype##_t *lock)                    \
 {                                                                       \
-        _##op##_lock_irqsave(lock);                                     \
+        _raw_##op##_lock_irqsave(lock);                                 \
 }                                                                       \
                                                                        \
-EXPORT_SYMBOL(_##op##_lock_irq);                                        \
+void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock)                \
-                                                                        \
-void __lockfunc _##op##_lock_bh(locktype##_t *lock)                     \
 {                                                                       \
        unsigned long flags;                                            \
                                                                        \
@@ -216,164 +95,283 @@ void __lockfunc _##op##_lock_bh(locktype##_t *lock)			\
        /* irq-disabling. We use the generic preemption-aware   */      \
        /* function:                                            */      \
        /**/                                                            \
-        flags = _##op##_lock_irqsave(lock);                             \
+        flags = _raw_##op##_lock_irqsave(lock);                         \
        local_bh_disable();                                             \
        local_irq_restore(flags);                                       \
 }                                                                       \
-                                                                        \
-EXPORT_SYMBOL(_##op##_lock_bh)
 /*
 * Build preemption-friendly versions of the following
 * lock-spinning functions:
 *
- *         _[spin|read|write]_lock()
+ *         __raw_[spin|read|write]_lock()
- *         _[spin|read|write]_lock_irq()
+ *         __raw_[spin|read|write]_lock_irq()
- *         _[spin|read|write]_lock_irqsave()
+ *         __raw_[spin|read|write]_lock_irqsave()
- *         _[spin|read|write]_lock_bh()
+ *         __raw_[spin|read|write]_lock_bh()
 */
-BUILD_LOCK_OPS(spin, spinlock);
+BUILD_LOCK_OPS(spin, raw_spinlock);
 BUILD_LOCK_OPS(read, rwlock);
 BUILD_LOCK_OPS(write, rwlock);
-#endif /* CONFIG_PREEMPT */
+#endif
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#ifndef CONFIG_INLINE_SPIN_TRYLOCK
+/*
+ * Exported _raw_spin_trylock(), built only when CONFIG_INLINE_SPIN_TRYLOCK
+ * is not defined.  Passes @lock to __raw_spin_trylock() and returns its
+ * result unchanged.
+ */
+int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
+{
+        return __raw_spin_trylock(lock);
+}
+EXPORT_SYMBOL(_raw_spin_trylock);
+#endif
-void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
+#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
+int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
 {
-        preempt_disable();
+        return __raw_spin_trylock_bh(lock);
-        spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
-        LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
 }
-EXPORT_SYMBOL(_spin_lock_nested);
+EXPORT_SYMBOL(_raw_spin_trylock_bh);
+#endif
-unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
+#ifndef CONFIG_INLINE_SPIN_LOCK
+void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
 {
-        unsigned long flags;
+        __raw_spin_lock(lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock);
+#endif
-        local_irq_save(flags);
+#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
-        preempt_disable();
+unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
-        spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+{
-        LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock,
+        return __raw_spin_lock_irqsave(lock);
-                                _raw_spin_lock_flags, &flags);
-        return flags;
 }
-EXPORT_SYMBOL(_spin_lock_irqsave_nested);
+EXPORT_SYMBOL(_raw_spin_lock_irqsave);
+#endif
-void __lockfunc _spin_lock_nest_lock(spinlock_t *lock,
+#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
-                                     struct lockdep_map *nest_lock)
+void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
 {
-        preempt_disable();
+        __raw_spin_lock_irq(lock);
-        spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
-        LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
 }
-EXPORT_SYMBOL(_spin_lock_nest_lock);
+EXPORT_SYMBOL(_raw_spin_lock_irq);
+#endif
+#ifndef CONFIG_INLINE_SPIN_LOCK_BH
+/*
+ * Exported _raw_spin_lock_bh(), built only when CONFIG_INLINE_SPIN_LOCK_BH
+ * is not defined.  Forwards @lock to __raw_spin_lock_bh().
+ */
+void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
+{
+        __raw_spin_lock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_bh);
 #endif
-#ifndef _spin_unlock
+#ifndef CONFIG_INLINE_SPIN_UNLOCK
-void __lockfunc _spin_unlock(spinlock_t *lock)
+void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
 {
-        __spin_unlock(lock);
+        __raw_spin_unlock(lock);
 }
-EXPORT_SYMBOL(_spin_unlock);
+EXPORT_SYMBOL(_raw_spin_unlock);
 #endif
-#ifndef _write_unlock
+#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
-void __lockfunc _write_unlock(rwlock_t *lock)
+void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
 {
-        __write_unlock(lock);
+        __raw_spin_unlock_irqrestore(lock, flags);
 }
-EXPORT_SYMBOL(_write_unlock);
+EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
 #endif
-#ifndef _read_unlock
+#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
-void __lockfunc _read_unlock(rwlock_t *lock)
+void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
 {
-        __read_unlock(lock);
+        __raw_spin_unlock_irq(lock);
 }
-EXPORT_SYMBOL(_read_unlock);
+EXPORT_SYMBOL(_raw_spin_unlock_irq);
 #endif
-#ifndef _spin_unlock_irqrestore
+#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
-void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
 {
-        __spin_unlock_irqrestore(lock, flags);
+        __raw_spin_unlock_bh(lock);
 }
-EXPORT_SYMBOL(_spin_unlock_irqrestore);
+EXPORT_SYMBOL(_raw_spin_unlock_bh);
 #endif
-#ifndef _spin_unlock_irq
+#ifndef CONFIG_INLINE_READ_TRYLOCK
-void __lockfunc _spin_unlock_irq(spinlock_t *lock)
+int __lockfunc _raw_read_trylock(rwlock_t *lock)
 {
-        __spin_unlock_irq(lock);
+        return __raw_read_trylock(lock);
 }
-EXPORT_SYMBOL(_spin_unlock_irq);
+EXPORT_SYMBOL(_raw_read_trylock);
 #endif
-#ifndef _spin_unlock_bh
+#ifndef CONFIG_INLINE_READ_LOCK
-void __lockfunc _spin_unlock_bh(spinlock_t *lock)
+void __lockfunc _raw_read_lock(rwlock_t *lock)
 {
-        __spin_unlock_bh(lock);
+        __raw_read_lock(lock);
 }
-EXPORT_SYMBOL(_spin_unlock_bh);
+EXPORT_SYMBOL(_raw_read_lock);
 #endif
-#ifndef _read_unlock_irqrestore
+#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
-void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
 {
-        __read_unlock_irqrestore(lock, flags);
+        return __raw_read_lock_irqsave(lock);
 }
-EXPORT_SYMBOL(_read_unlock_irqrestore);
+EXPORT_SYMBOL(_raw_read_lock_irqsave);
 #endif
-#ifndef _read_unlock_irq
+#ifndef CONFIG_INLINE_READ_LOCK_IRQ
-void __lockfunc _read_unlock_irq(rwlock_t *lock)
+void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
 {
-        __read_unlock_irq(lock);
+        __raw_read_lock_irq(lock);
 }
-EXPORT_SYMBOL(_read_unlock_irq);
+EXPORT_SYMBOL(_raw_read_lock_irq);
 #endif
-#ifndef _read_unlock_bh
+#ifndef CONFIG_INLINE_READ_LOCK_BH
-void __lockfunc _read_unlock_bh(rwlock_t *lock)
+void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
 {
-        __read_unlock_bh(lock);
+        __raw_read_lock_bh(lock);
 }
-EXPORT_SYMBOL(_read_unlock_bh);
+EXPORT_SYMBOL(_raw_read_lock_bh);
 #endif
-#ifndef _write_unlock_irqrestore
+#ifndef CONFIG_INLINE_READ_UNLOCK
-void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+void __lockfunc _raw_read_unlock(rwlock_t *lock)
 {
-        __write_unlock_irqrestore(lock, flags);
+        __raw_read_unlock(lock);
 }
-EXPORT_SYMBOL(_write_unlock_irqrestore);
+EXPORT_SYMBOL(_raw_read_unlock);
 #endif
-#ifndef _write_unlock_irq
+#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
-void __lockfunc _write_unlock_irq(rwlock_t *lock)
+void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 {
-        __write_unlock_irq(lock);
+        __raw_read_unlock_irqrestore(lock, flags);
 }
-EXPORT_SYMBOL(_write_unlock_irq);
+EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
 #endif
-#ifndef _write_unlock_bh
+#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
-void __lockfunc _write_unlock_bh(rwlock_t *lock)
+void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
 {
-        __write_unlock_bh(lock);
+        __raw_read_unlock_irq(lock);
 }
-EXPORT_SYMBOL(_write_unlock_bh);
+EXPORT_SYMBOL(_raw_read_unlock_irq);
 #endif
-#ifndef _spin_trylock_bh
+#ifndef CONFIG_INLINE_READ_UNLOCK_BH
-int __lockfunc _spin_trylock_bh(spinlock_t *lock)
+void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
 {
-        return __spin_trylock_bh(lock);
+        __raw_read_unlock_bh(lock);
 }
-EXPORT_SYMBOL(_spin_trylock_bh);
+EXPORT_SYMBOL(_raw_read_unlock_bh);
+#endif
+#ifndef CONFIG_INLINE_WRITE_TRYLOCK
+/*
+ * Exported _raw_write_trylock(), built only when CONFIG_INLINE_WRITE_TRYLOCK
+ * is not defined.  Passes @lock to __raw_write_trylock() and returns its
+ * result unchanged.
+ */
+int __lockfunc _raw_write_trylock(rwlock_t *lock)
+{
+        return __raw_write_trylock(lock);
+}
+EXPORT_SYMBOL(_raw_write_trylock);
+#endif
+#ifndef CONFIG_INLINE_WRITE_LOCK
+/*
+ * Exported _raw_write_lock(), built only when CONFIG_INLINE_WRITE_LOCK is
+ * not defined.  Forwards @lock to __raw_write_lock().
+ */
+void __lockfunc _raw_write_lock(rwlock_t *lock)
+{
+        __raw_write_lock(lock);
+}
+EXPORT_SYMBOL(_raw_write_lock);
+#endif
+#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
+/*
+ * Exported _raw_write_lock_irqsave(), built only when
+ * CONFIG_INLINE_WRITE_LOCK_IRQSAVE is not defined.  Returns the unsigned
+ * long produced by __raw_write_lock_irqsave() for @lock.
+ */
+unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
+{
+        return __raw_write_lock_irqsave(lock);
+}
+EXPORT_SYMBOL(_raw_write_lock_irqsave);
+#endif
+#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
+/*
+ * Exported _raw_write_lock_irq(), built only when
+ * CONFIG_INLINE_WRITE_LOCK_IRQ is not defined.  Forwards @lock to
+ * __raw_write_lock_irq().
+ */
+void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
+{
+        __raw_write_lock_irq(lock);
+}
+EXPORT_SYMBOL(_raw_write_lock_irq);
+#endif
+#ifndef CONFIG_INLINE_WRITE_LOCK_BH
+/*
+ * Exported _raw_write_lock_bh(), built only when
+ * CONFIG_INLINE_WRITE_LOCK_BH is not defined.  Forwards @lock to
+ * __raw_write_lock_bh().
+ */
+void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
+{
+        __raw_write_lock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_write_lock_bh);
+#endif
+#ifndef CONFIG_INLINE_WRITE_UNLOCK
+/*
+ * Exported _raw_write_unlock(), built only when CONFIG_INLINE_WRITE_UNLOCK
+ * is not defined.  Forwards @lock to __raw_write_unlock().
+ */
+void __lockfunc _raw_write_unlock(rwlock_t *lock)
+{
+        __raw_write_unlock(lock);
+}
+EXPORT_SYMBOL(_raw_write_unlock);
+#endif
+#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
+/*
+ * Exported _raw_write_unlock_irqrestore(), built only when
+ * CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE is not defined.  Forwards @lock
+ * and @flags to __raw_write_unlock_irqrestore().
+ */
+void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+{
+        __raw_write_unlock_irqrestore(lock, flags);
+}
+EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
+#endif
+#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
+/*
+ * Exported _raw_write_unlock_irq(), built only when
+ * CONFIG_INLINE_WRITE_UNLOCK_IRQ is not defined.  Forwards @lock to
+ * __raw_write_unlock_irq().
+ */
+void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
+{
+        __raw_write_unlock_irq(lock);
+}
+EXPORT_SYMBOL(_raw_write_unlock_irq);
+#endif
+#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
+/*
+ * Exported _raw_write_unlock_bh(), built only when
+ * CONFIG_INLINE_WRITE_UNLOCK_BH is not defined.  Forwards @lock to
+ * __raw_write_unlock_bh().
+ */
+void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
+{
+        __raw_write_unlock_bh(lock);
+}
+EXPORT_SYMBOL(_raw_write_unlock_bh);
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * Take @lock with an explicit lockdep @subclass.  Disables preemption,
+ * reports the acquisition on lock->dep_map via spin_acquire(), then
+ * acquires through LOCK_CONTENDED() using do_raw_spin_trylock() and
+ * do_raw_spin_lock().
+ */
+void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
+{
+        preempt_disable();
+        spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+        LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_nested);
+/*
+ * irqsave flavour of _raw_spin_lock_nested(): calls local_irq_save(flags)
+ * before disabling preemption, reports the acquisition with @subclass via
+ * spin_acquire(), and acquires through LOCK_CONTENDED_FLAGS(), which is
+ * additionally handed do_raw_spin_lock_flags() and &flags.
+ * Returns the flags value for the caller to pass to the matching unlock.
+ */
+unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
+                                                   int subclass)
+{
+        unsigned long flags;
+        local_irq_save(flags);
+        preempt_disable();
+        spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+        LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
+                                do_raw_spin_lock_flags, &flags);
+        return flags;
+}
+EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
+/*
+ * Take @lock while telling lockdep it nests under @nest_lock.  Same
+ * sequence as _raw_spin_lock_nested(), except the acquisition is reported
+ * with spin_acquire_nest() and @nest_lock instead of a subclass.
+ */
+void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock,
+                                     struct lockdep_map *nest_lock)
+{
+        preempt_disable();
+        spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
+        LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_nest_lock);
 #endif
 notrace int in_lock_functions(unsigned long addr)
diff --git a/kernel/srcu.c b/kernel/srcu.c
index b0aeeaf22ce4..2980da3fd509 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -30,10 +30,33 @@
 #include <linux/preempt.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/srcu.h>
+/*
+ * Field setup shared by __init_srcu_struct() and init_srcu_struct():
+ * zero ->completed, initialise ->mutex and allocate ->per_cpu_ref with
+ * alloc_percpu().  Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+static int init_srcu_struct_fields(struct srcu_struct *sp)
+{
+        sp->completed = 0;
+        mutex_init(&sp->mutex);
+        sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
+        return sp->per_cpu_ref ? 0 : -ENOMEM;
+}
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * Lockdep-aware initialisation of @sp.  This variant is only compiled when
+ * CONFIG_DEBUG_LOCK_ALLOC is set (see the enclosing #ifdef), so no further
+ * conditional is needed inside the body.  Registers sp->dep_map under
+ * @name/@key, then performs the common field setup.
+ * Returns 0 on success or -ENOMEM from init_srcu_struct_fields().
+ */
+int __init_srcu_struct(struct srcu_struct *sp, const char *name,
+                       struct lock_class_key *key)
+{
+        /* Don't re-initialize a lock while it is held. */
+        debug_check_no_locks_freed((void *)sp, sizeof(*sp));
+        lockdep_init_map(&sp->dep_map, name, key, 0);
+        return init_srcu_struct_fields(sp);
+}
+EXPORT_SYMBOL_GPL(__init_srcu_struct);
+#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 /**
 * init_srcu_struct - initialize a sleep-RCU structure
 * @sp: structure to initialize.
@@ -44,11 +67,11 @@
 */
 int init_srcu_struct(struct srcu_struct *sp)
 {
-        sp->completed = 0;
+        return init_srcu_struct_fields(sp);
-        mutex_init(&sp->mutex);
-        sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
-        return (sp->per_cpu_ref ? 0 : -ENOMEM);
 }
+EXPORT_SYMBOL_GPL(init_srcu_struct);
+#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 /*
 * srcu_readers_active_idx -- returns approximate number of readers
@@ -97,16 +120,14 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
        free_percpu(sp->per_cpu_ref);
        sp->per_cpu_ref = NULL;
 }
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
-/**
+/*
- * srcu_read_lock - register a new reader for an SRCU-protected structure.
- * @sp: srcu_struct in which to register the new reader.
- *
 * Counts the new reader in the appropriate per-CPU element of the
 * srcu_struct.  Must be called from process context.
 * Returns an index that must be passed to the matching srcu_read_unlock().
 */
-int srcu_read_lock(struct srcu_struct *sp)
+int __srcu_read_lock(struct srcu_struct *sp)
 {
        int idx;
@@ -118,40 +139,27 @@ int srcu_read_lock(struct srcu_struct *sp)
        preempt_enable();
        return idx;
 }
+EXPORT_SYMBOL_GPL(__srcu_read_lock);
-/**
+/*
- * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
- * @sp: srcu_struct in which to unregister the old reader.
- * @idx: return value from corresponding srcu_read_lock().
- *
 * Removes the count for the old reader from the appropriate per-CPU
 * element of the srcu_struct.  Note that this may well be a different
 * CPU than that which was incremented by the corresponding srcu_read_lock().
 * Must be called from process context.
 */
-void srcu_read_unlock(struct srcu_struct *sp, int idx)
+void __srcu_read_unlock(struct srcu_struct *sp, int idx)
 {
        preempt_disable();
        srcu_barrier();  /* ensure compiler won't misorder critical section. */
        per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
        preempt_enable();
 }
+EXPORT_SYMBOL_GPL(__srcu_read_unlock);
-/**
+/*
- * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
- * @sp: srcu_struct with which to synchronize.
- *
- * Flip the completed counter, and wait for the old count to drain to zero.
- * As with classic RCU, the updater must use some separate means of
- * synchronizing concurrent updates.  Can block; must be called from
- * process context.
- *
- * Note that it is illegal to call synchornize_srcu() from the corresponding
- * SRCU read-side critical section; doing so will result in deadlock.
- * However, it is perfectly legal to call synchronize_srcu() on one
- * srcu_struct from some other srcu_struct's read-side critical section.
 */
-void synchronize_srcu(struct srcu_struct *sp)
+static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
 {
        int idx;
@@ -173,7 +181,7 @@ void synchronize_srcu(struct srcu_struct *sp)
                return;
        }
-        synchronize_sched();  /* Force memory barrier on all CPUs. */
+        sync_func();  /* Force memory barrier on all CPUs. */
        /*
         * The preceding synchronize_sched() ensures that any CPU that
@@ -190,7 +198,7 @@ void synchronize_srcu(struct srcu_struct *sp)
        idx = sp->completed & 0x1;
        sp->completed++;
-        synchronize_sched();  /* Force memory barrier on all CPUs. */
+        sync_func();  /* Force memory barrier on all CPUs. */
        /*
         * At this point, because of the preceding synchronize_sched(),
@@ -203,7 +211,7 @@ void synchronize_srcu(struct srcu_struct *sp)
        while (srcu_readers_active_idx(sp, idx))
                schedule_timeout_interruptible(1);
-        synchronize_sched();  /* Force memory barrier on all CPUs. */
+        sync_func();  /* Force memory barrier on all CPUs. */
        /*
         * The preceding synchronize_sched() forces all srcu_read_unlock()
@@ -237,6 +245,47 @@ void synchronize_srcu(struct srcu_struct *sp)
 }
 /**
+ * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Flip the completed counter, and wait for the old count to drain to zero.
+ * As with classic RCU, the updater must use some separate means of
+ * synchronizing concurrent updates.  Can block; must be called from
+ * process context.
+ *
+ * Note that it is illegal to call synchronize_srcu() from the corresponding
+ * SRCU read-side critical section; doing so will result in deadlock.
+ * However, it is perfectly legal to call synchronize_srcu() on one
+ * srcu_struct from some other srcu_struct's read-side critical section.
+ *
+ * Implemented by calling __synchronize_srcu() with synchronize_sched() as
+ * its sync_func.
+ */
+void synchronize_srcu(struct srcu_struct *sp)
+{
+        __synchronize_srcu(sp, synchronize_sched);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu);
+/**
+ * synchronize_srcu_expedited - like synchronize_srcu, but less patient
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Flip the completed counter, and wait for the old count to drain to zero.
+ * As with classic RCU, the updater must use some separate means of
+ * synchronizing concurrent updates.  Can block; must be called from
+ * process context.
+ *
+ * Note that it is illegal to call synchronize_srcu_expedited()
+ * from the corresponding SRCU read-side critical section; doing so
+ * will result in deadlock.  However, it is perfectly legal to call
+ * synchronize_srcu_expedited() on one srcu_struct from some other
+ * srcu_struct's read-side critical section.
+ *
+ * Implemented by calling __synchronize_srcu() with
+ * synchronize_sched_expedited() as its sync_func.
+ */
+void synchronize_srcu_expedited(struct srcu_struct *sp)
+{
+        __synchronize_srcu(sp, synchronize_sched_expedited);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
+/**
 * srcu_batches_completed - return batches completed.
 * @sp: srcu_struct on which to report batch completion.
 *
@@ -248,10 +297,4 @@ long srcu_batches_completed(struct srcu_struct *sp)
 {
        return sp->completed;
 }
-EXPORT_SYMBOL_GPL(init_srcu_struct);
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
-EXPORT_SYMBOL_GPL(srcu_read_lock);
-EXPORT_SYMBOL_GPL(srcu_read_unlock);
-EXPORT_SYMBOL_GPL(synchronize_srcu);
 EXPORT_SYMBOL_GPL(srcu_batches_completed);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 912823e2a11b..9bb9fb1bd79c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -45,7 +45,7 @@ static int refcount;
 static struct workqueue_struct *stop_machine_wq;
 static struct stop_machine_data active, idle;
 static const struct cpumask *active_cpus;
-static void *stop_machine_work;
+static void __percpu *stop_machine_work;
 static void set_state(enum stopmachine_state newstate)
 {
diff --git a/kernel/sys.c b/kernel/sys.c
index ce17760d9c51..7cb426a58965 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
 #include <linux/mm.h>
 #include <linux/utsname.h>
 #include <linux/mman.h>
-#include <linux/smp_lock.h>
 #include <linux/notifier.h>
 #include <linux/reboot.h>
 #include <linux/prctl.h>
@@ -34,8 +33,10 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/seccomp.h>
 #include <linux/cpu.h>
+#include <linux/personality.h>
 #include <linux/ptrace.h>
 #include <linux/fs_struct.h>
+#include <linux/gfp.h>
 #include <linux/compat.h>
 #include <linux/syscalls.h>
@@ -163,6 +164,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
        if (niceval > 19)
                niceval = 19;
+        rcu_read_lock();
        read_lock(&tasklist_lock);
        switch (which) {
                case PRIO_PROCESS:
@@ -190,16 +192,17 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
                                 !(user = find_user(who)))
                                goto out_unlock;        /* No processes for this user */
-                        do_each_thread(g, p)
+                        do_each_thread(g, p) {
                                if (__task_cred(p)->uid == who)
                                        error = set_one_prio(p, niceval, error);
-                        while_each_thread(g, p);
+                        } while_each_thread(g, p);
                        if (who != cred->uid)
                                free_uid(user);         /* For find_user() */
                        break;
        }
 out_unlock:
        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
 out:
        return error;
 }
@@ -221,6 +224,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
        if (which > PRIO_USER || which < PRIO_PROCESS)
                return -EINVAL;
+        rcu_read_lock();
        read_lock(&tasklist_lock);
        switch (which) {
                case PRIO_PROCESS:
@@ -253,19 +257,20 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
                                 !(user = find_user(who)))
                                goto out_unlock;        /* No processes for this user */
-                        do_each_thread(g, p)
+                        do_each_thread(g, p) {
                                if (__task_cred(p)->uid == who) {
                                        niceval = 20 - task_nice(p);
                                        if (niceval > retval)
                                                retval = niceval;
                                }
-                        while_each_thread(g, p);
+                        } while_each_thread(g, p);
                        if (who != cred->uid)
                                free_uid(user);         /* for find_user() */
                        break;
        }
 out_unlock:
        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        return retval;
 }
@@ -349,6 +354,9 @@ void kernel_power_off(void)
        machine_power_off();
 }
 EXPORT_SYMBOL_GPL(kernel_power_off);
+static DEFINE_MUTEX(reboot_mutex);
 /*
 * Reboot system call: for obvious reasons only root may call it,
 * and even root needs to set up some magic numbers in the registers
@@ -381,7 +389,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
        if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
                cmd = LINUX_REBOOT_CMD_HALT;
-        lock_kernel();
+        mutex_lock(&reboot_mutex);
        switch (cmd) {
        case LINUX_REBOOT_CMD_RESTART:
                kernel_restart(NULL);
@@ -397,20 +405,18 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
        case LINUX_REBOOT_CMD_HALT:
                kernel_halt();
-                unlock_kernel();
                do_exit(0);
                panic("cannot halt");
        case LINUX_REBOOT_CMD_POWER_OFF:
                kernel_power_off();
-                unlock_kernel();
                do_exit(0);
                break;
        case LINUX_REBOOT_CMD_RESTART2:
                if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
-                        unlock_kernel();
+                        ret = -EFAULT;
-                        return -EFAULT;
+                        break;
                }
                buffer[sizeof(buffer) - 1] = '\0';
@@ -433,7 +439,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
                ret = -EINVAL;
                break;
        }
-        unlock_kernel();
+        mutex_unlock(&reboot_mutex);
        return ret;
 }
@@ -567,13 +573,7 @@ static int set_user(struct cred *new)
        if (!new_user)
                return -EAGAIN;
-        if (!task_can_switch_user(new_user, current)) {
+        if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
-                free_uid(new_user);
-                return -EINVAL;
-        }
-        if (atomic_read(&new_user->processes) >=
-                                current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
                        new_user != INIT_USER) {
                free_uid(new_user);
                return -EAGAIN;
@@ -911,16 +911,15 @@ change_okay:
 void do_sys_times(struct tms *tms)
 {
-        struct task_cputime cputime;
+        cputime_t tgutime, tgstime, cutime, cstime;
-        cputime_t cutime, cstime;
-        thread_group_cputime(current, &cputime);
        spin_lock_irq(&current->sighand->siglock);
+        thread_group_times(current, &tgutime, &tgstime);
        cutime = current->signal->cutime;
        cstime = current->signal->cstime;
        spin_unlock_irq(&current->sighand->siglock);
-        tms->tms_utime = cputime_to_clock_t(cputime.utime);
+        tms->tms_utime = cputime_to_clock_t(tgutime);
-        tms->tms_stime = cputime_to_clock_t(cputime.stime);
+        tms->tms_stime = cputime_to_clock_t(tgstime);
        tms->tms_cutime = cputime_to_clock_t(cutime);
        tms->tms_cstime = cputime_to_clock_t(cstime);
 }
@@ -1117,6 +1116,15 @@ out:
 DECLARE_RWSEM(uts_sem);
+/*
+ * override_architecture(name): when the current task's personality is
+ * PER_LINUX32, overwrite the ->machine field of the user buffer @name with
+ * COMPAT_UTS_MACHINE.  Evaluates to non-zero if that copy_to_user() reports
+ * a failure, and to 0 otherwise; always 0 when COMPAT_UTS_MACHINE is not
+ * defined.  @name is parenthesized so any pointer expression may be passed.
+ */
+#ifdef COMPAT_UTS_MACHINE
+#define override_architecture(name) \
+        (personality(current->personality) == PER_LINUX32 && \
+         copy_to_user((name)->machine, COMPAT_UTS_MACHINE, \
+                      sizeof(COMPAT_UTS_MACHINE)))
+#else
+#define override_architecture(name)     0
+#endif
 SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
 {
        int errno = 0;
@@ -1125,9 +1133,66 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
        if (copy_to_user(name, utsname(), sizeof *name))
                errno = -EFAULT;
        up_read(&uts_sem);
+        if (!errno && override_architecture(name))
+                errno = -EFAULT;
        return errno;
 }
+#ifdef __ARCH_WANT_SYS_OLD_UNAME
+/*
+ * Old cruft: uname() variant that fills a struct old_utsname.
+ *
+ * Returns -EFAULT for a NULL @name or when either user-space copy fails,
+ * 0 otherwise.  The utsname() data is copied while uts_sem is held for
+ * reading; override_architecture() runs after the semaphore is released.
+ */
+SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
+{
+        int error = 0;
+        if (!name)
+                return -EFAULT;
+        down_read(&uts_sem);
+        if (copy_to_user(name, utsname(), sizeof(*name)))
+                error = -EFAULT;
+        up_read(&uts_sem);
+        if (!error && override_architecture(name))
+                error = -EFAULT;
+        return error;
+}
+/*
+ * olduname() fills a struct oldold_utsname.
+ *
+ * After a NULL check and one access_ok() on the whole structure, each of
+ * the five fields (sysname, nodename, release, version, machine) gets
+ * __OLD_UTS_LEN bytes copied from utsname() followed by a 0 stored at
+ * offset __OLD_UTS_LEN, all while uts_sem is held for reading.  The
+ * results of the __copy_to_user()/__put_user() calls are OR-ed into
+ * error, so the final return collapses any failure to -EFAULT.
+ */
+SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
+{
+        int error;
+        if (!name)
+                return -EFAULT;
+        if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
+                return -EFAULT;
+        down_read(&uts_sem);
+        error = __copy_to_user(&name->sysname, &utsname()->sysname,
+                               __OLD_UTS_LEN);
+        error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
+        error |= __copy_to_user(&name->nodename, &utsname()->nodename,
+                                __OLD_UTS_LEN);
+        error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
+        error |= __copy_to_user(&name->release, &utsname()->release,
+                                __OLD_UTS_LEN);
+        error |= __put_user(0, name->release + __OLD_UTS_LEN);
+        error |= __copy_to_user(&name->version, &utsname()->version,
+                                __OLD_UTS_LEN);
+        error |= __put_user(0, name->version + __OLD_UTS_LEN);
+        error |= __copy_to_user(&name->machine, &utsname()->machine,
+                                __OLD_UTS_LEN);
+        error |= __put_user(0, name->machine + __OLD_UTS_LEN);
+        up_read(&uts_sem);
+        if (!error && override_architecture(name))
+                error = -EFAULT;
+        return error ? -EFAULT : 0;
+}
+#endif
 SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
 {
        int errno;
@@ -1338,16 +1403,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 {
        struct task_struct *t;
        unsigned long flags;
-        cputime_t utime, stime;
+        cputime_t tgutime, tgstime, utime, stime;
-        struct task_cputime cputime;
        unsigned long maxrss = 0;
        memset((char *) r, 0, sizeof *r);
        utime = stime = cputime_zero;
        if (who == RUSAGE_THREAD) {
-                utime = task_utime(current);
+                task_times(current, &utime, &stime);
-                stime = task_stime(current);
                accumulate_thread_rusage(p, r);
                maxrss = p->signal->maxrss;
                goto out;
@@ -1373,9 +1436,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
                                break;
                case RUSAGE_SELF:
-                        thread_group_cputime(p, &cputime);
+                        thread_group_times(p, &tgutime, &tgstime);
-                        utime = cputime_add(utime, cputime.utime);
+                        utime = cputime_add(utime, tgutime);
-                        stime = cputime_add(stime, cputime.stime);
+                        stime = cputime_add(stime, tgstime);
                        r->ru_nvcsw += p->signal->nvcsw;
                        r->ru_nivcsw += p->signal->nivcsw;
                        r->ru_minflt += p->signal->min_flt;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e06d0b8d1951..70f2ea758ffe 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -48,8 +48,10 @@ cond_syscall(sys_shutdown);
 cond_syscall(sys_sendmsg);
 cond_syscall(compat_sys_sendmsg);
 cond_syscall(sys_recvmsg);
+cond_syscall(sys_recvmmsg);
 cond_syscall(compat_sys_recvmsg);
 cond_syscall(compat_sys_recvfrom);
+cond_syscall(compat_sys_recvmmsg);
 cond_syscall(sys_socketcall);
 cond_syscall(sys_futex);
 cond_syscall(compat_sys_futex);
@@ -124,6 +126,7 @@ cond_syscall(sys_setreuid16);
 cond_syscall(sys_setuid16);
 cond_syscall(sys_vm86old);
 cond_syscall(sys_vm86);
+cond_syscall(sys_ipc);
 cond_syscall(compat_sys_ipc);
 cond_syscall(compat_sys_sysctl);
 cond_syscall(sys_flock);
@@ -139,7 +142,6 @@ cond_syscall(sys_pciconfig_read);
 cond_syscall(sys_pciconfig_write);
 cond_syscall(sys_pciconfig_iobase);
 cond_syscall(sys32_ipc);
-cond_syscall(sys32_sysctl);
 cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0d949c517412..8686b0f5fc12 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,11 +23,11 @@
 #include <linux/swap.h>
 #include <linux/slab.h>
 #include <linux/sysctl.h>
+#include <linux/signal.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/ctype.h>
 #include <linux/kmemcheck.h>
-#include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -36,6 +36,7 @@
 #include <linux/sysrq.h>
 #include <linux/highuid.h>
 #include <linux/writeback.h>
+#include <linux/ratelimit.h>
 #include <linux/hugetlb.h>
 #include <linux/initrd.h>
 #include <linux/key.h>
@@ -50,6 +51,7 @@
 #include <linux/ftrace.h>
 #include <linux/slow-work.h>
 #include <linux/perf_event.h>
+#include <linux/kprobes.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -59,14 +61,23 @@
 #include <asm/stacktrace.h>
 #include <asm/io.h>
 #endif
+#ifdef CONFIG_BSD_PROCESS_ACCT
+#include <linux/acct.h>
+#endif
+#ifdef CONFIG_RT_MUTEXES
+#include <linux/rtmutex.h>
+#endif
+#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
+#include <linux/lockdep.h>
+#endif
+#ifdef CONFIG_CHR_DEV_SG
+#include <scsi/sg.h>
+#endif
-static int deprecated_sysctl_warning(struct __sysctl_args *args);
 #if defined(CONFIG_SYSCTL)
 /* External variables not in a header file. */
-extern int C_A_D;
-extern int print_fatal_signals;
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern int sysctl_panic_on_oom;
@@ -88,9 +99,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
 #ifndef CONFIG_MMU
 extern int sysctl_nr_trim_pages;
 #endif
-#ifdef CONFIG_RCU_TORTURE_TEST
-extern int rcutorture_runnable;
-#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 #ifdef CONFIG_BLOCK
 extern int blk_iopoll_enabled;
 #endif
@@ -120,14 +128,6 @@ static int min_percpu_pagelist_fract = 8;
 static int ngroups_max = NGROUPS_MAX;
-#ifdef CONFIG_MODULES
-extern char modprobe_path[];
-extern int modules_disabled;
-#endif
-#ifdef CONFIG_CHR_DEV_SG
-extern int sg_big_buff;
-#endif
 #ifdef CONFIG_SPARC
 #include <asm/system.h>
 #endif
@@ -149,18 +149,12 @@ extern int sysctl_userprocess_debug;
 extern int spin_retry;
 #endif
-#ifdef CONFIG_BSD_PROCESS_ACCT
-extern int acct_parm[];
-#endif
 #ifdef CONFIG_IA64
 extern int no_unaligned_warning;
 extern int unaligned_dump_stack;
 #endif
-#ifdef CONFIG_RT_MUTEXES
+extern struct ratelimit_state printk_ratelimit_state;
-extern int max_lock_depth;
-#endif
 #ifdef CONFIG_PROC_SYSCTL
 static int proc_do_cad_pid(struct ctl_table *table, int write,
@@ -200,38 +194,30 @@ extern struct ctl_table epoll_table[];
 int sysctl_legacy_va_layout;
 #endif
-extern int prove_locking;
-extern int lock_stat;
 /* The default sysctl tables: */
 static struct ctl_table root_table[] = {
        {
-                .ctl_name       = CTL_KERN,
                .procname       = "kernel",
                .mode           = 0555,
                .child          = kern_table,
        },
        {
-                .ctl_name       = CTL_VM,
                .procname       = "vm",
                .mode           = 0555,
                .child          = vm_table,
        },
        {
-                .ctl_name       = CTL_FS,
                .procname       = "fs",
                .mode           = 0555,
                .child          = fs_table,
        },
        {
-                .ctl_name       = CTL_DEBUG,
                .procname       = "debug",
                .mode           = 0555,
                .child          = debug_table,
        },
        {
-                .ctl_name       = CTL_DEV,
                .procname       = "dev",
                .mode           = 0555,
                .child          = dev_table,
@@ -240,7 +226,7 @@ static struct ctl_table root_table[] = {
 * NOTE: do not add new entries to this table unless you have read
 * Documentation/sysctl/ctl_unnumbered.txt
 */
-        { .ctl_name = 0 }
+        { }
 };
 #ifdef CONFIG_SCHED_DEBUG
@@ -248,196 +234,178 @@ static int min_sched_granularity_ns = 100000;		/* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;     /* 1 second */
 static int min_wakeup_granularity_ns;                   /* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;    /* 1 second */
+static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
+static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
+static int min_sched_shares_ratelimit = 100000; /* 100 usec */
+static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
 #endif
 static struct ctl_table kern_table[] = {
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_child_runs_first",
                .data           = &sysctl_sched_child_runs_first,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #ifdef CONFIG_SCHED_DEBUG
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_min_granularity_ns",
                .data           = &sysctl_sched_min_granularity,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = &sched_nr_latency_handler,
+                .proc_handler   = sched_proc_update_handler,
-                .strategy       = &sysctl_intvec,
                .extra1         = &min_sched_granularity_ns,
                .extra2         = &max_sched_granularity_ns,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_latency_ns",
                .data           = &sysctl_sched_latency,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = &sched_nr_latency_handler,
+                .proc_handler   = sched_proc_update_handler,
-                .strategy       = &sysctl_intvec,
                .extra1         = &min_sched_granularity_ns,
                .extra2         = &max_sched_granularity_ns,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_wakeup_granularity_ns",
                .data           = &sysctl_sched_wakeup_granularity,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = sched_proc_update_handler,
-                .strategy       = &sysctl_intvec,
                .extra1         = &min_wakeup_granularity_ns,
                .extra2         = &max_wakeup_granularity_ns,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_shares_ratelimit",
                .data           = &sysctl_sched_shares_ratelimit,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = sched_proc_update_handler,
+                .extra1         = &min_sched_shares_ratelimit,
+                .extra2         = &max_sched_shares_ratelimit,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "sched_tunable_scaling",
-                .procname       = "sched_shares_thresh",
+                .data           = &sysctl_sched_tunable_scaling,
-                .data           = &sysctl_sched_shares_thresh,
+                .maxlen         = sizeof(enum sched_tunable_scaling),
-                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = sched_proc_update_handler,
-                .strategy       = &sysctl_intvec,
+                .extra1         = &min_sched_tunable_scaling,
-                .extra1         = &zero,
+                .extra2         = &max_sched_tunable_scaling,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "sched_shares_thresh",
-                .procname       = "sched_features",
+                .data           = &sysctl_sched_shares_thresh,
-                .data           = &sysctl_sched_features,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec_minmax,
+                .extra1         = &zero,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_migration_cost",
                .data           = &sysctl_sched_migration_cost,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_nr_migrate",
                .data           = &sysctl_sched_nr_migrate,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_time_avg",
                .data           = &sysctl_sched_time_avg,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "timer_migration",
                .data           = &sysctl_timer_migration,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
                .extra2         = &one,
        },
 #endif
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_rt_period_us",
                .data           = &sysctl_sched_rt_period,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = &sched_rt_handler,
+                .proc_handler   = sched_rt_handler,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_rt_runtime_us",
                .data           = &sysctl_sched_rt_runtime,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &sched_rt_handler,
+                .proc_handler   = sched_rt_handler,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_compat_yield",
                .data           = &sysctl_sched_compat_yield,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #ifdef CONFIG_PROVE_LOCKING
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "prove_locking",
                .data           = &prove_locking,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #ifdef CONFIG_LOCK_STAT
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "lock_stat",
                .data           = &lock_stat,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
        {
-                .ctl_name       = KERN_PANIC,
                .procname       = "panic",
                .data           = &panic_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = KERN_CORE_USES_PID,
                .procname       = "core_uses_pid",
                .data           = &core_uses_pid,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = KERN_CORE_PATTERN,
                .procname       = "core_pattern",
                .data           = core_pattern,
                .maxlen         = CORENAME_MAX_SIZE,
                .mode           = 0644,
-                .proc_handler   = &proc_dostring,
+                .proc_handler   = proc_dostring,
-                .strategy       = &sysctl_string,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "core_pipe_limit",
                .data           = &core_pipe_limit,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #ifdef CONFIG_PROC_SYSCTL
        {
                .procname       = "tainted",
                .maxlen         = sizeof(long),
                .mode           = 0644,
-                .proc_handler   = &proc_taint,
+                .proc_handler   = proc_taint,
        },
 #endif
 #ifdef CONFIG_LATENCYTOP
@@ -446,181 +414,160 @@ static struct ctl_table kern_table[] = {
                .data           = &latencytop_enabled,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #ifdef CONFIG_BLK_DEV_INITRD
        {
-                .ctl_name       = KERN_REALROOTDEV,
                .procname       = "real-root-dev",
                .data           = &real_root_dev,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "print-fatal-signals",
                .data           = &print_fatal_signals,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #ifdef CONFIG_SPARC
        {
-                .ctl_name       = KERN_SPARC_REBOOT,
                .procname       = "reboot-cmd",
                .data           = reboot_command,
                .maxlen         = 256,
                .mode           = 0644,
-                .proc_handler   = &proc_dostring,
+                .proc_handler   = proc_dostring,
-                .strategy       = &sysctl_string,
        },
        {
-                .ctl_name       = KERN_SPARC_STOP_A,
                .procname       = "stop-a",
                .data           = &stop_a_enabled,
                .maxlen         = sizeof (int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = KERN_SPARC_SCONS_PWROFF,
                .procname       = "scons-poweroff",
                .data           = &scons_pwroff,
                .maxlen         = sizeof (int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #ifdef CONFIG_SPARC64
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "tsb-ratio",
                .data           = &sysctl_tsb_ratio,
                .maxlen         = sizeof (int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #ifdef __hppa__
        {
-                .ctl_name       = KERN_HPPA_PWRSW,
                .procname       = "soft-power",
                .data           = &pwrsw_enabled,
                .maxlen         = sizeof (int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = KERN_HPPA_UNALIGNED,
                .procname       = "unaligned-trap",
                .data           = &unaligned_enabled,
                .maxlen         = sizeof (int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
        {
-                .ctl_name       = KERN_CTLALTDEL,
                .procname       = "ctrl-alt-del",
                .data           = &C_A_D,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #ifdef CONFIG_FUNCTION_TRACER
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "ftrace_enabled",
                .data           = &ftrace_enabled,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &ftrace_enable_sysctl,
+                .proc_handler   = ftrace_enable_sysctl,
        },
 #endif
 #ifdef CONFIG_STACK_TRACER
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "stack_tracer_enabled",
                .data           = &stack_tracer_enabled,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &stack_trace_sysctl,
+                .proc_handler   = stack_trace_sysctl,
        },
 #endif
 #ifdef CONFIG_TRACING
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "ftrace_dump_on_oops",
                .data           = &ftrace_dump_on_oops,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #ifdef CONFIG_MODULES
        {
-                .ctl_name       = KERN_MODPROBE,
                .procname       = "modprobe",
                .data           = &modprobe_path,
                .maxlen         = KMOD_PATH_LEN,
                .mode           = 0644,
-                .proc_handler   = &proc_dostring,
+                .proc_handler   = proc_dostring,
-                .strategy       = &sysctl_string,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "modules_disabled",
                .data           = &modules_disabled,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                /* only handle a transition from default "0" to "1" */
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &one,
                .extra2         = &one,
        },
 #endif
 #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
        {
-                .ctl_name       = KERN_HOTPLUG,
                .procname       = "hotplug",
                .data           = &uevent_helper,
                .maxlen         = UEVENT_HELPER_PATH_LEN,
                .mode           = 0644,
-                .proc_handler   = &proc_dostring,
+                .proc_handler   = proc_dostring,
-                .strategy       = &sysctl_string,
        },
 #endif
 #ifdef CONFIG_CHR_DEV_SG
        {
-                .ctl_name       = KERN_SG_BIG_BUFF,
                .procname       = "sg-big-buff",
                .data           = &sg_big_buff,
                .maxlen         = sizeof (int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #ifdef CONFIG_BSD_PROCESS_ACCT
        {
-                .ctl_name       = KERN_ACCT,
                .procname       = "acct",
                .data           = &acct_parm,
                .maxlen         = 3*sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #ifdef CONFIG_MAGIC_SYSRQ
        {
-                .ctl_name       = KERN_SYSRQ,
                .procname       = "sysrq",
                .data           = &__sysrq_enabled,
                .maxlen         = sizeof (int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #ifdef CONFIG_PROC_SYSCTL
@@ -629,215 +576,188 @@ static struct ctl_table kern_table[] = {
                .data           = NULL,
                .maxlen         = sizeof (int),
                .mode           = 0600,
-                .proc_handler   = &proc_do_cad_pid,
+                .proc_handler   = proc_do_cad_pid,
        },
 #endif
        {
-                .ctl_name       = KERN_MAX_THREADS,
                .procname       = "threads-max",
                .data           = &max_threads,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = KERN_RANDOM,
                .procname       = "random",
                .mode           = 0555,
                .child          = random_table,
        },
        {
-                .ctl_name       = KERN_OVERFLOWUID,
                .procname       = "overflowuid",
                .data           = &overflowuid,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &minolduid,
                .extra2         = &maxolduid,
        },
        {
-                .ctl_name       = KERN_OVERFLOWGID,
                .procname       = "overflowgid",
                .data           = &overflowgid,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &minolduid,
                .extra2         = &maxolduid,
        },
 #ifdef CONFIG_S390
 #ifdef CONFIG_MATHEMU
        {
-                .ctl_name       = KERN_IEEE_EMULATION_WARNINGS,
                .procname       = "ieee_emulation_warnings",
                .data           = &sysctl_ieee_emulation_warnings,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
        {
-                .ctl_name       = KERN_S390_USER_DEBUG_LOGGING,
                .procname       = "userprocess_debug",
                .data           = &sysctl_userprocess_debug,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
        {
-                .ctl_name       = KERN_PIDMAX,
                .procname       = "pid_max",
                .data           = &pid_max,
                .maxlen         = sizeof (int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = sysctl_intvec,
                .extra1         = &pid_max_min,
                .extra2         = &pid_max_max,
        },
        {
-                .ctl_name       = KERN_PANIC_ON_OOPS,
                .procname       = "panic_on_oops",
                .data           = &panic_on_oops,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #if defined CONFIG_PRINTK
        {
-                .ctl_name       = KERN_PRINTK,
                .procname       = "printk",
                .data           = &console_loglevel,
                .maxlen         = 4*sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = KERN_PRINTK_RATELIMIT,
                .procname       = "printk_ratelimit",
                .data           = &printk_ratelimit_state.interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_jiffies,
+                .proc_handler   = proc_dointvec_jiffies,
-                .strategy       = &sysctl_jiffies,
        },
        {
-                .ctl_name       = KERN_PRINTK_RATELIMIT_BURST,
                .procname       = "printk_ratelimit_burst",
                .data           = &printk_ratelimit_state.burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "printk_delay",
                .data           = &printk_delay_msec,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
                .extra2         = &ten_thousand,
        },
 #endif
        {
-                .ctl_name       = KERN_NGROUPS_MAX,
                .procname       = "ngroups_max",
                .data           = &ngroups_max,
                .maxlen         = sizeof (int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
        {
-                .ctl_name       = KERN_UNKNOWN_NMI_PANIC,
                .procname       = "unknown_nmi_panic",
                .data           = &unknown_nmi_panic,
                .maxlen         = sizeof (int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "nmi_watchdog",
                .data           = &nmi_watchdog_enabled,
                .maxlen         = sizeof (int),
                .mode           = 0644,
-                .proc_handler   = &proc_nmi_enabled,
+                .proc_handler   = proc_nmi_enabled,
        },
 #endif
 #if defined(CONFIG_X86)
        {
-                .ctl_name       = KERN_PANIC_ON_NMI,
                .procname       = "panic_on_unrecovered_nmi",
                .data           = &panic_on_unrecovered_nmi,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "panic_on_io_nmi",
                .data           = &panic_on_io_nmi,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = KERN_BOOTLOADER_TYPE,
                .procname       = "bootloader_type",
                .data           = &bootloader_type,
                .maxlen         = sizeof (int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "bootloader_version",
                .data           = &bootloader_version,
                .maxlen         = sizeof (int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "kstack_depth_to_print",
                .data           = &kstack_depth_to_print,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "io_delay_type",
                .data           = &io_delay_type,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #if defined(CONFIG_MMU)
        {
-                .ctl_name       = KERN_RANDOMIZE,
                .procname       = "randomize_va_space",
                .data           = &randomize_va_space,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #if defined(CONFIG_S390) && defined(CONFIG_SMP)
        {
-                .ctl_name       = KERN_SPIN_RETRY,
                .procname       = "spin_retry",
                .data           = &spin_retry,
                .maxlen         = sizeof (int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #if     defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
@@ -846,123 +766,104 @@ static struct ctl_table kern_table[] = {
                .data           = &acpi_realmode_flags,
                .maxlen         = sizeof (unsigned long),
                .mode           = 0644,
-                .proc_handler   = &proc_doulongvec_minmax,
+                .proc_handler   = proc_doulongvec_minmax,
        },
 #endif
 #ifdef CONFIG_IA64
        {
-                .ctl_name       = KERN_IA64_UNALIGNED,
                .procname       = "ignore-unaligned-usertrap",
                .data           = &no_unaligned_warning,
                .maxlen         = sizeof (int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "unaligned-dump-stack",
                .data           = &unaligned_dump_stack,
                .maxlen         = sizeof (int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #ifdef CONFIG_DETECT_SOFTLOCKUP
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "softlockup_panic",
                .data           = &softlockup_panic,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
                .extra2         = &one,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "softlockup_thresh",
                .data           = &softlockup_thresh,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dosoftlockup_thresh,
+                .proc_handler   = proc_dosoftlockup_thresh,
-                .strategy       = &sysctl_intvec,
                .extra1         = &neg_one,
                .extra2         = &sixty,
        },
 #endif
 #ifdef CONFIG_DETECT_HUNG_TASK
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "hung_task_panic",
                .data           = &sysctl_hung_task_panic,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
                .extra2         = &one,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "hung_task_check_count",
                .data           = &sysctl_hung_task_check_count,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-                .proc_handler   = &proc_doulongvec_minmax,
+                .proc_handler   = proc_doulongvec_minmax,
-                .strategy       = &sysctl_intvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "hung_task_timeout_secs",
                .data           = &sysctl_hung_task_timeout_secs,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-                .proc_handler   = &proc_dohung_task_timeout_secs,
+                .proc_handler   = proc_dohung_task_timeout_secs,
-                .strategy       = &sysctl_intvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "hung_task_warnings",
                .data           = &sysctl_hung_task_warnings,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-                .proc_handler   = &proc_doulongvec_minmax,
+                .proc_handler   = proc_doulongvec_minmax,
-                .strategy       = &sysctl_intvec,
        },
 #endif
 #ifdef CONFIG_COMPAT
        {
-                .ctl_name       = KERN_COMPAT_LOG,
                .procname       = "compat-log",
                .data           = &compat_log,
                .maxlen         = sizeof (int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #ifdef CONFIG_RT_MUTEXES
        {
-                .ctl_name       = KERN_MAX_LOCK_DEPTH,
                .procname       = "max_lock_depth",
                .data           = &max_lock_depth,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "poweroff_cmd",
                .data           = &poweroff_cmd,
                .maxlen         = POWEROFF_CMD_PATH_LEN,
                .mode           = 0644,
-                .proc_handler   = &proc_dostring,
+                .proc_handler   = proc_dostring,
-                .strategy       = &sysctl_string,
        },
 #ifdef CONFIG_KEYS
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "keys",
                .mode           = 0555,
                .child          = key_sysctls,
@@ -970,17 +871,15 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_RCU_TORTURE_TEST
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "rcutorture_runnable",
                .data           = &rcutorture_runnable,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #ifdef CONFIG_SLOW_WORK
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "slow-work",
                .mode           = 0555,
                .child          = slow_work_sysctls,
@@ -988,146 +887,127 @@ static struct ctl_table kern_table[] = {
 #endif
 #ifdef CONFIG_PERF_EVENTS
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "perf_event_paranoid",
                .data           = &sysctl_perf_event_paranoid,
                .maxlen         = sizeof(sysctl_perf_event_paranoid),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "perf_event_mlock_kb",
                .data           = &sysctl_perf_event_mlock,
                .maxlen         = sizeof(sysctl_perf_event_mlock),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "perf_event_max_sample_rate",
                .data           = &sysctl_perf_event_sample_rate,
                .maxlen         = sizeof(sysctl_perf_event_sample_rate),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #ifdef CONFIG_KMEMCHECK
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "kmemcheck",
                .data           = &kmemcheck_enabled,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #ifdef CONFIG_BLOCK
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "blk_iopoll",
                .data           = &blk_iopoll_enabled,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 /*
 * NOTE: do not add new entries to this table unless you have read
 * Documentation/sysctl/ctl_unnumbered.txt
 */
-        { .ctl_name = 0 }
+        { }
 };
 static struct ctl_table vm_table[] = {
        {
-                .ctl_name       = VM_OVERCOMMIT_MEMORY,
                .procname       = "overcommit_memory",
                .data           = &sysctl_overcommit_memory,
                .maxlen         = sizeof(sysctl_overcommit_memory),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = VM_PANIC_ON_OOM,
                .procname       = "panic_on_oom",
                .data           = &sysctl_panic_on_oom,
                .maxlen         = sizeof(sysctl_panic_on_oom),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "oom_kill_allocating_task",
                .data           = &sysctl_oom_kill_allocating_task,
                .maxlen         = sizeof(sysctl_oom_kill_allocating_task),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "oom_dump_tasks",
                .data           = &sysctl_oom_dump_tasks,
                .maxlen         = sizeof(sysctl_oom_dump_tasks),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = VM_OVERCOMMIT_RATIO,
                .procname       = "overcommit_ratio",
                .data           = &sysctl_overcommit_ratio,
                .maxlen         = sizeof(sysctl_overcommit_ratio),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = VM_PAGE_CLUSTER,
                .procname       = "page-cluster", 
                .data           = &page_cluster,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = VM_DIRTY_BACKGROUND,
                .procname       = "dirty_background_ratio",
                .data           = &dirty_background_ratio,
                .maxlen         = sizeof(dirty_background_ratio),
                .mode           = 0644,
-                .proc_handler   = &dirty_background_ratio_handler,
+                .proc_handler   = dirty_background_ratio_handler,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
                .extra2         = &one_hundred,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "dirty_background_bytes",
                .data           = &dirty_background_bytes,
                .maxlen         = sizeof(dirty_background_bytes),
                .mode           = 0644,
-                .proc_handler   = &dirty_background_bytes_handler,
+                .proc_handler   = dirty_background_bytes_handler,
-                .strategy       = &sysctl_intvec,
                .extra1         = &one_ul,
        },
        {
-                .ctl_name       = VM_DIRTY_RATIO,
                .procname       = "dirty_ratio",
                .data           = &vm_dirty_ratio,
                .maxlen         = sizeof(vm_dirty_ratio),
                .mode           = 0644,
-                .proc_handler   = &dirty_ratio_handler,
+                .proc_handler   = dirty_ratio_handler,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
                .extra2         = &one_hundred,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "dirty_bytes",
                .data           = &vm_dirty_bytes,
                .maxlen         = sizeof(vm_dirty_bytes),
                .mode           = 0644,
-                .proc_handler   = &dirty_bytes_handler,
+                .proc_handler   = dirty_bytes_handler,
-                .strategy       = &sysctl_intvec,
                .extra1         = &dirty_bytes_min,
        },
        {
@@ -1135,289 +1015,258 @@ static struct ctl_table vm_table[] = {
                .data           = &dirty_writeback_interval,
                .maxlen         = sizeof(dirty_writeback_interval),
                .mode           = 0644,
-                .proc_handler   = &dirty_writeback_centisecs_handler,
+                .proc_handler   = dirty_writeback_centisecs_handler,
        },
        {
                .procname       = "dirty_expire_centisecs",
                .data           = &dirty_expire_interval,
                .maxlen         = sizeof(dirty_expire_interval),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = VM_NR_PDFLUSH_THREADS,
                .procname       = "nr_pdflush_threads",
                .data           = &nr_pdflush_threads,
                .maxlen         = sizeof nr_pdflush_threads,
                .mode           = 0444 /* read-only*/,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = VM_SWAPPINESS,
                .procname       = "swappiness",
                .data           = &vm_swappiness,
                .maxlen         = sizeof(vm_swappiness),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
                .extra2         = &one_hundred,
        },
 #ifdef CONFIG_HUGETLB_PAGE
-         {
+        {
                .procname       = "nr_hugepages",
                .data           = NULL,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-                .proc_handler   = &hugetlb_sysctl_handler,
+                .proc_handler   = hugetlb_sysctl_handler,
                .extra1         = (void *)&hugetlb_zero,
                .extra2         = (void *)&hugetlb_infinity,
-         },
+        },
+#ifdef CONFIG_NUMA
+        {
+                .procname       = "nr_hugepages_mempolicy",
+                .data           = NULL,
+                .maxlen         = sizeof(unsigned long),
+                .mode           = 0644,
+                .proc_handler   = hugetlb_mempolicy_sysctl_handler,
+                .extra1         = (void *)&hugetlb_zero,
+                .extra2         = (void *)&hugetlb_infinity,
+        },
+#endif
         {
-                .ctl_name       = VM_HUGETLB_GROUP,
                .procname       = "hugetlb_shm_group",
                .data           = &sysctl_hugetlb_shm_group,
                .maxlen         = sizeof(gid_t),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
         },
         {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "hugepages_treat_as_movable",
                .data           = &hugepages_treat_as_movable,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &hugetlb_treat_movable_handler,
+                .proc_handler   = hugetlb_treat_movable_handler,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nr_overcommit_hugepages",
                .data           = NULL,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-                .proc_handler   = &hugetlb_overcommit_handler,
+                .proc_handler   = hugetlb_overcommit_handler,
                .extra1         = (void *)&hugetlb_zero,
                .extra2         = (void *)&hugetlb_infinity,
        },
 #endif
        {
-                .ctl_name       = VM_LOWMEM_RESERVE_RATIO,
                .procname       = "lowmem_reserve_ratio",
                .data           = &sysctl_lowmem_reserve_ratio,
                .maxlen         = sizeof(sysctl_lowmem_reserve_ratio),
                .mode           = 0644,
-                .proc_handler   = &lowmem_reserve_ratio_sysctl_handler,
+                .proc_handler   = lowmem_reserve_ratio_sysctl_handler,
-                .strategy       = &sysctl_intvec,
        },
        {
-                .ctl_name       = VM_DROP_PAGECACHE,
                .procname       = "drop_caches",
                .data           = &sysctl_drop_caches,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = drop_caches_sysctl_handler,
-                .strategy       = &sysctl_intvec,
        },
        {
-                .ctl_name       = VM_MIN_FREE_KBYTES,
                .procname       = "min_free_kbytes",
                .data           = &min_free_kbytes,
                .maxlen         = sizeof(min_free_kbytes),
                .mode           = 0644,
-                .proc_handler   = &min_free_kbytes_sysctl_handler,
+                .proc_handler   = min_free_kbytes_sysctl_handler,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
        },
        {
-                .ctl_name       = VM_PERCPU_PAGELIST_FRACTION,
                .procname       = "percpu_pagelist_fraction",
                .data           = &percpu_pagelist_fraction,
                .maxlen         = sizeof(percpu_pagelist_fraction),
                .mode           = 0644,
-                .proc_handler   = &percpu_pagelist_fraction_sysctl_handler,
+                .proc_handler   = percpu_pagelist_fraction_sysctl_handler,
-                .strategy       = &sysctl_intvec,
                .extra1         = &min_percpu_pagelist_fract,
        },
 #ifdef CONFIG_MMU
        {
-                .ctl_name       = VM_MAX_MAP_COUNT,
                .procname       = "max_map_count",
                .data           = &sysctl_max_map_count,
                .maxlen         = sizeof(sysctl_max_map_count),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec
+                .proc_handler   = proc_dointvec_minmax,
+                .extra1         = &zero,
        },
 #else
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nr_trim_pages",
                .data           = &sysctl_nr_trim_pages,
                .maxlen         = sizeof(sysctl_nr_trim_pages),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
        },
 #endif
        {
-                .ctl_name       = VM_LAPTOP_MODE,
                .procname       = "laptop_mode",
                .data           = &laptop_mode,
                .maxlen         = sizeof(laptop_mode),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_jiffies,
+                .proc_handler   = proc_dointvec_jiffies,
-                .strategy       = &sysctl_jiffies,
        },
        {
-                .ctl_name       = VM_BLOCK_DUMP,
                .procname       = "block_dump",
                .data           = &block_dump,
                .maxlen         = sizeof(block_dump),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
        },
        {
-                .ctl_name       = VM_VFS_CACHE_PRESSURE,
                .procname       = "vfs_cache_pressure",
                .data           = &sysctl_vfs_cache_pressure,
                .maxlen         = sizeof(sysctl_vfs_cache_pressure),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
        },
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
        {
-                .ctl_name       = VM_LEGACY_VA_LAYOUT,
                .procname       = "legacy_va_layout",
                .data           = &sysctl_legacy_va_layout,
                .maxlen         = sizeof(sysctl_legacy_va_layout),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
        },
 #endif
 #ifdef CONFIG_NUMA
        {
-                .ctl_name       = VM_ZONE_RECLAIM_MODE,
                .procname       = "zone_reclaim_mode",
                .data           = &zone_reclaim_mode,
                .maxlen         = sizeof(zone_reclaim_mode),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
        },
        {
-                .ctl_name       = VM_MIN_UNMAPPED,
                .procname       = "min_unmapped_ratio",
                .data           = &sysctl_min_unmapped_ratio,
                .maxlen         = sizeof(sysctl_min_unmapped_ratio),
                .mode           = 0644,
-                .proc_handler   = &sysctl_min_unmapped_ratio_sysctl_handler,
+                .proc_handler   = sysctl_min_unmapped_ratio_sysctl_handler,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
                .extra2         = &one_hundred,
        },
        {
-                .ctl_name       = VM_MIN_SLAB,
                .procname       = "min_slab_ratio",
                .data           = &sysctl_min_slab_ratio,
                .maxlen         = sizeof(sysctl_min_slab_ratio),
                .mode           = 0644,
-                .proc_handler   = &sysctl_min_slab_ratio_sysctl_handler,
+                .proc_handler   = sysctl_min_slab_ratio_sysctl_handler,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
                .extra2         = &one_hundred,
        },
 #endif
 #ifdef CONFIG_SMP
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "stat_interval",
                .data           = &sysctl_stat_interval,
                .maxlen         = sizeof(sysctl_stat_interval),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_jiffies,
+                .proc_handler   = proc_dointvec_jiffies,
-                .strategy       = &sysctl_jiffies,
        },
 #endif
+#ifdef CONFIG_MMU
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "mmap_min_addr",
                .data           = &dac_mmap_min_addr,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
-                .proc_handler   = &mmap_min_addr_handler,
+                .proc_handler   = mmap_min_addr_handler,
        },
+#endif
 #ifdef CONFIG_NUMA
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "numa_zonelist_order",
                .data           = &numa_zonelist_order,
                .maxlen         = NUMA_ZONELIST_ORDER_LEN,
                .mode           = 0644,
-                .proc_handler   = &numa_zonelist_order_handler,
+                .proc_handler   = numa_zonelist_order_handler,
-                .strategy       = &sysctl_string,
        },
 #endif
 #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
   (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
        {
-                .ctl_name       = VM_VDSO_ENABLED,
                .procname       = "vdso_enabled",
                .data           = &vdso_enabled,
                .maxlen         = sizeof(vdso_enabled),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
        },
 #endif
 #ifdef CONFIG_HIGHMEM
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "highmem_is_dirtyable",
                .data           = &vm_highmem_is_dirtyable,
                .maxlen         = sizeof(vm_highmem_is_dirtyable),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
                .extra2         = &one,
        },
 #endif
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "scan_unevictable_pages",
                .data           = &scan_unevictable_pages,
                .maxlen         = sizeof(scan_unevictable_pages),
                .mode           = 0644,
-                .proc_handler   = &scan_unevictable_handler,
+                .proc_handler   = scan_unevictable_handler,
        },
 #ifdef CONFIG_MEMORY_FAILURE
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "memory_failure_early_kill",
                .data           = &sysctl_memory_failure_early_kill,
                .maxlen         = sizeof(sysctl_memory_failure_early_kill),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
                .extra2         = &one,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "memory_failure_recovery",
                .data           = &sysctl_memory_failure_recovery,
                .maxlen         = sizeof(sysctl_memory_failure_recovery),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
                .extra2         = &one,
        },
@@ -1427,116 +1276,104 @@ static struct ctl_table vm_table[] = {
 * NOTE: do not add new entries to this table unless you have read
 * Documentation/sysctl/ctl_unnumbered.txt
 */
-        { .ctl_name = 0 }
+        { }
 };
 #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
 static struct ctl_table binfmt_misc_table[] = {
-        { .ctl_name = 0 }
+        { }
 };
 #endif
 static struct ctl_table fs_table[] = {
        {
-                .ctl_name       = FS_NRINODE,
                .procname       = "inode-nr",
                .data           = &inodes_stat,
                .maxlen         = 2*sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = FS_STATINODE,
                .procname       = "inode-state",
                .data           = &inodes_stat,
                .maxlen         = 7*sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "file-nr",
                .data           = &files_stat,
                .maxlen         = 3*sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_nr_files,
+                .proc_handler   = proc_nr_files,
        },
        {
-                .ctl_name       = FS_MAXFILE,
                .procname       = "file-max",
                .data           = &files_stat.max_files,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "nr_open",
                .data           = &sysctl_nr_open,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &sysctl_nr_open_min,
                .extra2         = &sysctl_nr_open_max,
        },
        {
-                .ctl_name       = FS_DENTRY,
                .procname       = "dentry-state",
                .data           = &dentry_stat,
                .maxlen         = 6*sizeof(int),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
        {
-                .ctl_name       = FS_OVERFLOWUID,
                .procname       = "overflowuid",
                .data           = &fs_overflowuid,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &minolduid,
                .extra2         = &maxolduid,
        },
        {
-                .ctl_name       = FS_OVERFLOWGID,
                .procname       = "overflowgid",
                .data           = &fs_overflowgid,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &minolduid,
                .extra2         = &maxolduid,
        },
 #ifdef CONFIG_FILE_LOCKING
        {
-                .ctl_name       = FS_LEASES,
                .procname       = "leases-enable",
                .data           = &leases_enable,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #ifdef CONFIG_DNOTIFY
        {
-                .ctl_name       = FS_DIR_NOTIFY,
                .procname       = "dir-notify-enable",
                .data           = &dir_notify_enable,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #ifdef CONFIG_MMU
 #ifdef CONFIG_FILE_LOCKING
        {
-                .ctl_name       = FS_LEASE_TIME,
                .procname       = "lease-break-time",
                .data           = &lease_break_time,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = proc_dointvec,
        },
 #endif
 #ifdef CONFIG_AIO
@@ -1545,19 +1382,18 @@ static struct ctl_table fs_table[] = {
                .data           = &aio_nr,
                .maxlen         = sizeof(aio_nr),
                .mode           = 0444,
-                .proc_handler   = &proc_doulongvec_minmax,
+                .proc_handler   = proc_doulongvec_minmax,
        },
        {
                .procname       = "aio-max-nr",
                .data           = &aio_max_nr,
                .maxlen         = sizeof(aio_max_nr),
                .mode           = 0644,
-                .proc_handler   = &proc_doulongvec_minmax,
+                .proc_handler   = proc_doulongvec_minmax,
        },
 #endif /* CONFIG_AIO */
 #ifdef CONFIG_INOTIFY_USER
        {
-                .ctl_name       = FS_INOTIFY,
                .procname       = "inotify",
                .mode           = 0555,
                .child          = inotify_table,
@@ -1572,19 +1408,16 @@ static struct ctl_table fs_table[] = {
 #endif
 #endif
        {
-                .ctl_name       = KERN_SETUID_DUMPABLE,
                .procname       = "suid_dumpable",
                .data           = &suid_dumpable,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax,
-                .strategy       = &sysctl_intvec,
                .extra1         = &zero,
                .extra2         = &two,
        },
 #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "binfmt_misc",
                .mode           = 0555,
                .child          = binfmt_misc_table,
@@ -1594,13 +1427,12 @@ static struct ctl_table fs_table[] = {
 * NOTE: do not add new entries to this table unless you have read
 * Documentation/sysctl/ctl_unnumbered.txt
 */
-        { .ctl_name = 0 }
+        { }
 };
 static struct ctl_table debug_table[] = {
-#if defined(CONFIG_X86) || defined(CONFIG_PPC)
+#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC)
        {
-                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "exception-trace",
                .data           = &show_unhandled_signals,
                .maxlen         = sizeof(int),
@@ -1608,11 +1440,22 @@ static struct ctl_table debug_table[] = {
                .proc_handler   = proc_dointvec
        },
 #endif
-        { .ctl_name = 0 }
+#if defined(CONFIG_OPTPROBES)
+        {
+                .procname       = "kprobes-optimization",
+                .data           = &sysctl_kprobes_optimization,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = proc_kprobes_optimization_handler,
+                .extra1         = &zero,
+                .extra2         = &one,
+        },
+#endif
+        { }
 };
 static struct ctl_table dev_table[] = {
-        { .ctl_name = 0 }
+        { }
 };
 static DEFINE_SPINLOCK(sysctl_lock);
@@ -1766,122 +1609,6 @@ void register_sysctl_root(struct ctl_table_root *root)
        spin_unlock(&sysctl_lock);
 }
-#ifdef CONFIG_SYSCTL_SYSCALL
-/* Perform the actual read/write of a sysctl table entry. */
-static int do_sysctl_strategy(struct ctl_table_root *root,
-                        struct ctl_table *table,
-                        void __user *oldval, size_t __user *oldlenp,
-                        void __user *newval, size_t newlen)
-{
-        int op = 0, rc;
-        if (oldval)
-                op |= MAY_READ;
-        if (newval)
-                op |= MAY_WRITE;
-        if (sysctl_perm(root, table, op))
-                return -EPERM;
-        if (table->strategy) {
-                rc = table->strategy(table, oldval, oldlenp, newval, newlen);
-                if (rc < 0)
-                        return rc;
-                if (rc > 0)
-                        return 0;
-        }
-        /* If there is no strategy routine, or if the strategy returns
-         * zero, proceed with automatic r/w */
-        if (table->data && table->maxlen) {
-                rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
-                if (rc < 0)
-                        return rc;
-        }
-        return 0;
-}
-static int parse_table(int __user *name, int nlen,
-                       void __user *oldval, size_t __user *oldlenp,
-                       void __user *newval, size_t newlen,
-                       struct ctl_table_root *root,
-                       struct ctl_table *table)
-{
-        int n;
-repeat:
-        if (!nlen)
-                return -ENOTDIR;
-        if (get_user(n, name))
-                return -EFAULT;
-        for ( ; table->ctl_name || table->procname; table++) {
-                if (!table->ctl_name)
-                        continue;
-                if (n == table->ctl_name) {
-                        int error;
-                        if (table->child) {
-                                if (sysctl_perm(root, table, MAY_EXEC))
-                                        return -EPERM;
-                                name++;
-                                nlen--;
-                                table = table->child;
-                                goto repeat;
-                        }
-                        error = do_sysctl_strategy(root, table,
-                                                   oldval, oldlenp,
-                                                   newval, newlen);
-                        return error;
-                }
-        }
-        return -ENOTDIR;
-}
-int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
-               void __user *newval, size_t newlen)
-{
-        struct ctl_table_header *head;
-        int error = -ENOTDIR;
-        if (nlen <= 0 || nlen >= CTL_MAXNAME)
-                return -ENOTDIR;
-        if (oldval) {
-                int old_len;
-                if (!oldlenp || get_user(old_len, oldlenp))
-                        return -EFAULT;
-        }
-        for (head = sysctl_head_next(NULL); head;
-                        head = sysctl_head_next(head)) {
-                error = parse_table(name, nlen, oldval, oldlenp, 
-                                        newval, newlen,
-                                        head->root, head->ctl_table);
-                if (error != -ENOTDIR) {
-                        sysctl_head_finish(head);
-                        break;
-                }
-        }
-        return error;
-}
-SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
-{
-        struct __sysctl_args tmp;
-        int error;
-        if (copy_from_user(&tmp, args, sizeof(tmp)))
-                return -EFAULT;
-        error = deprecated_sysctl_warning(&tmp);
-        if (error)
-                goto out;
-        lock_kernel();
-        error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp,
-                          tmp.newval, tmp.newlen);
-        unlock_kernel();
-out:
-        return error;
-}
-#endif /* CONFIG_SYSCTL_SYSCALL */
 /*
 * sysctl_perm does NOT grant the superuser all rights automatically, because
 * some sysctl variables are readonly even to root.
@@ -1917,7 +1644,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
 static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
 {
-        for (; table->ctl_name || table->procname; table++) {
+        for (; table->procname; table++) {
                table->parent = parent;
                if (table->child)
                        sysctl_set_parent(table, table->child);
@@ -1949,11 +1676,11 @@ static struct ctl_table *is_branch_in(struct ctl_table *branch,
                return NULL;
        /* ... and nothing else */
-        if (branch[1].procname || branch[1].ctl_name)
+        if (branch[1].procname)
                return NULL;
        /* table should contain subdirectory with the same name */
-        for (p = table; p->procname || p->ctl_name; p++) {
+        for (p = table; p->procname; p++) {
                if (!p->child)
                        continue;
                if (p->procname && strcmp(p->procname, s) == 0)
@@ -1998,9 +1725,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
 *
 * The members of the &struct ctl_table structure are used as follows:
 *
- * ctl_name - This is the numeric sysctl value used by sysctl(2). The number
- *            must be unique within that level of sysctl
- *
 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
 *            enter a sysctl file
 *
@@ -2015,8 +1739,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
 *
 * proc_handler - the text handler routine (described below)
 *
- * strategy - the strategy routine (described below)
- *
 * de - for internal use by the sysctl routines
 *
 * extra1, extra2 - extra pointers usable by the proc handler routines
@@ -2029,19 +1751,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
 * struct enable minimal validation of the values being written to be
 * performed, and the mode field allows minimal authentication.
 *
- * More sophisticated management can be enabled by the provision of a
- * strategy routine with the table entry.  This will be called before
- * any automatic read or write of the data is performed.
- *
- * The strategy routine may return
- *
- * < 0 - Error occurred (error is passed to user process)
- *
- * 0   - OK - proceed with automatic read or write.
- *
- * > 0 - OK - read or write has been done by the strategy routine, so
- *       return immediately.
- *
 * There must be a proc_handler routine for any terminal nodes
 * mirrored under /proc/sys (non-terminals are handled by a built-in
 * directory handler).  Several default handlers are available to
@@ -2068,13 +1777,13 @@ struct ctl_table_header *__register_sysctl_paths(
        struct ctl_table_set *set;
        /* Count the path components */
-        for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
+        for (npath = 0; path[npath].procname; ++npath)
                ;
        /*
         * For each path component, allocate a 2-element ctl_table array.
         * The first array element will be filled with the sysctl entry
-         * for this, the second will be the sentinel (ctl_name == 0).
+         * for this, the second will be the sentinel (procname == NULL).
         *
         * We allocate everything in one go so that we don't have to
         * worry about freeing additional memory in unregister_sysctl_table.
@@ -2091,7 +1800,6 @@ struct ctl_table_header *__register_sysctl_paths(
        for (n = 0; n < npath; ++n, ++path) {
                /* Copy the procname */
                new->procname = path->procname;
-                new->ctl_name = path->ctl_name;
                new->mode     = 0555;
                *prevp = new;
@@ -2953,286 +2661,6 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
 #endif /* CONFIG_PROC_FS */
-#ifdef CONFIG_SYSCTL_SYSCALL
-/*
- * General sysctl support routines 
- */
-/* The generic sysctl data routine (used if no strategy routine supplied) */
-int sysctl_data(struct ctl_table *table,
-                void __user *oldval, size_t __user *oldlenp,
-                void __user *newval, size_t newlen)
-{
-        size_t len;
-        /* Get out of I don't have a variable */
-        if (!table->data || !table->maxlen)
-                return -ENOTDIR;
-        if (oldval && oldlenp) {
-                if (get_user(len, oldlenp))
-                        return -EFAULT;
-                if (len) {
-                        if (len > table->maxlen)
-                                len = table->maxlen;
-                        if (copy_to_user(oldval, table->data, len))
-                                return -EFAULT;
-                        if (put_user(len, oldlenp))
-                                return -EFAULT;
-                }
-        }
-        if (newval && newlen) {
-                if (newlen > table->maxlen)
-                        newlen = table->maxlen;
-                if (copy_from_user(table->data, newval, newlen))
-                        return -EFAULT;
-        }
-        return 1;
-}
-/* The generic string strategy routine: */
-int sysctl_string(struct ctl_table *table,
-                  void __user *oldval, size_t __user *oldlenp,
-                  void __user *newval, size_t newlen)
-{
-        if (!table->data || !table->maxlen) 
-                return -ENOTDIR;
-        
-        if (oldval && oldlenp) {
-                size_t bufsize;
-                if (get_user(bufsize, oldlenp))
-                        return -EFAULT;
-                if (bufsize) {
-                        size_t len = strlen(table->data), copied;
-                        /* This shouldn't trigger for a well-formed sysctl */
-                        if (len > table->maxlen)
-                                len = table->maxlen;
-                        /* Copy up to a max of bufsize-1 bytes of the string */
-                        copied = (len >= bufsize) ? bufsize - 1 : len;
-                        if (copy_to_user(oldval, table->data, copied) ||
-                            put_user(0, (char __user *)(oldval + copied)))
-                                return -EFAULT;
-                        if (put_user(len, oldlenp))
-                                return -EFAULT;
-                }
-        }
-        if (newval && newlen) {
-                size_t len = newlen;
-                if (len > table->maxlen)
-                        len = table->maxlen;
-                if(copy_from_user(table->data, newval, len))
-                        return -EFAULT;
-                if (len == table->maxlen)
-                        len--;
-                ((char *) table->data)[len] = 0;
-        }
-        return 1;
-}
-/*
- * This function makes sure that all of the integers in the vector
- * are between the minimum and maximum values given in the arrays
- * table->extra1 and table->extra2, respectively.
- */
-int sysctl_intvec(struct ctl_table *table,
-                void __user *oldval, size_t __user *oldlenp,
-                void __user *newval, size_t newlen)
-{
-        if (newval && newlen) {
-                int __user *vec = (int __user *) newval;
-                int *min = (int *) table->extra1;
-                int *max = (int *) table->extra2;
-                size_t length;
-                int i;
-                if (newlen % sizeof(int) != 0)
-                        return -EINVAL;
-                if (!table->extra1 && !table->extra2)
-                        return 0;
-                if (newlen > table->maxlen)
-                        newlen = table->maxlen;
-                length = newlen / sizeof(int);
-                for (i = 0; i < length; i++) {
-                        int value;
-                        if (get_user(value, vec + i))
-                                return -EFAULT;
-                        if (min && value < min[i])
-                                return -EINVAL;
-                        if (max && value > max[i])
-                                return -EINVAL;
-                }
-        }
-        return 0;
-}
-/* Strategy function to convert jiffies to seconds */ 
-int sysctl_jiffies(struct ctl_table *table,
-                void __user *oldval, size_t __user *oldlenp,
-                void __user *newval, size_t newlen)
-{
-        if (oldval && oldlenp) {
-                size_t olen;
-                if (get_user(olen, oldlenp))
-                        return -EFAULT;
-                if (olen) {
-                        int val;
-                        if (olen < sizeof(int))
-                                return -EINVAL;
-                        val = *(int *)(table->data) / HZ;
-                        if (put_user(val, (int __user *)oldval))
-                                return -EFAULT;
-                        if (put_user(sizeof(int), oldlenp))
-                                return -EFAULT;
-                }
-        }
-        if (newval && newlen) { 
-                int new;
-                if (newlen != sizeof(int))
-                        return -EINVAL; 
-                if (get_user(new, (int __user *)newval))
-                        return -EFAULT;
-                *(int *)(table->data) = new*HZ; 
-        }
-        return 1;
-}
-/* Strategy function to convert jiffies to seconds */ 
-int sysctl_ms_jiffies(struct ctl_table *table,
-                void __user *oldval, size_t __user *oldlenp,
-                void __user *newval, size_t newlen)
-{
-        if (oldval && oldlenp) {
-                size_t olen;
-                if (get_user(olen, oldlenp))
-                        return -EFAULT;
-                if (olen) {
-                        int val;
-                        if (olen < sizeof(int))
-                                return -EINVAL;
-                        val = jiffies_to_msecs(*(int *)(table->data));
-                        if (put_user(val, (int __user *)oldval))
-                                return -EFAULT;
-                        if (put_user(sizeof(int), oldlenp))
-                                return -EFAULT;
-                }
-        }
-        if (newval && newlen) { 
-                int new;
-                if (newlen != sizeof(int))
-                        return -EINVAL; 
-                if (get_user(new, (int __user *)newval))
-                        return -EFAULT;
-                *(int *)(table->data) = msecs_to_jiffies(new);
-        }
-        return 1;
-}
-#else /* CONFIG_SYSCTL_SYSCALL */
-SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
-{
-        struct __sysctl_args tmp;
-        int error;
-        if (copy_from_user(&tmp, args, sizeof(tmp)))
-                return -EFAULT;
-        error = deprecated_sysctl_warning(&tmp);
-        /* If no error reading the parameters then just -ENOSYS ... */
-        if (!error)
-                error = -ENOSYS;
-        return error;
-}
-int sysctl_data(struct ctl_table *table,
-                  void __user *oldval, size_t __user *oldlenp,
-                  void __user *newval, size_t newlen)
-{
-        return -ENOSYS;
-}
-int sysctl_string(struct ctl_table *table,
-                  void __user *oldval, size_t __user *oldlenp,
-                  void __user *newval, size_t newlen)
-{
-        return -ENOSYS;
-}
-int sysctl_intvec(struct ctl_table *table,
-                void __user *oldval, size_t __user *oldlenp,
-                void __user *newval, size_t newlen)
-{
-        return -ENOSYS;
-}
-int sysctl_jiffies(struct ctl_table *table,
-                void __user *oldval, size_t __user *oldlenp,
-                void __user *newval, size_t newlen)
-{
-        return -ENOSYS;
-}
-int sysctl_ms_jiffies(struct ctl_table *table,
-                void __user *oldval, size_t __user *oldlenp,
-                void __user *newval, size_t newlen)
-{
-        return -ENOSYS;
-}
-#endif /* CONFIG_SYSCTL_SYSCALL */
-static int deprecated_sysctl_warning(struct __sysctl_args *args)
-{
-        static int msg_count;
-        int name[CTL_MAXNAME];
-        int i;
-        /* Check args->nlen. */
-        if (args->nlen < 0 || args->nlen > CTL_MAXNAME)
-                return -ENOTDIR;
-        /* Read in the sysctl name for better debug message logging */
-        for (i = 0; i < args->nlen; i++)
-                if (get_user(name[i], args->name + i))
-                        return -EFAULT;
-        /* Ignore accesses to kernel.version */
-        if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
-                return 0;
-        if (msg_count < 5) {
-                msg_count++;
-                printk(KERN_INFO
-                        "warning: process `%s' used the deprecated sysctl "
-                        "system call with ", current->comm);
-                for (i = 0; i < args->nlen; i++)
-                        printk("%d.", name[i]);
-                printk("\n");
-        }
-        return 0;
-}
 /*
 * No sense putting this after each symbol definition, twice,
 * exception granted :-)
@@ -3247,9 +2675,4 @@ EXPORT_SYMBOL(proc_doulongvec_minmax);
 EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
 EXPORT_SYMBOL(register_sysctl_table);
 EXPORT_SYMBOL(register_sysctl_paths);
-EXPORT_SYMBOL(sysctl_intvec);
-EXPORT_SYMBOL(sysctl_jiffies);
-EXPORT_SYMBOL(sysctl_ms_jiffies);
-EXPORT_SYMBOL(sysctl_string);
-EXPORT_SYMBOL(sysctl_data);
 EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
new file mode 100644
index 000000000000..59030570f5ca
--- /dev/null
+++ b/kernel/sysctl_binary.c
@@ -0,0 +1,1541 @@
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include "../fs/xfs/linux-2.6/xfs_sysctl.h"
+#include <linux/sunrpc/debug.h>
+#include <linux/string.h>
+#include <net/ip_vs.h>
+#include <linux/syscalls.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/fs.h>
+#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
+#include <linux/file.h>
+#include <linux/ctype.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#ifdef CONFIG_SYSCTL_SYSCALL
+struct bin_table;
+typedef ssize_t bin_convert_t(struct file *file,
+        void __user *oldval, size_t oldlen, void __user *newval, size_t newlen);
+static bin_convert_t bin_dir;
+static bin_convert_t bin_string;
+static bin_convert_t bin_intvec;
+static bin_convert_t bin_ulongvec;
+static bin_convert_t bin_uuid;
+static bin_convert_t bin_dn_node_address;
+#define CTL_DIR   bin_dir
+#define CTL_STR   bin_string
+#define CTL_INT   bin_intvec
+#define CTL_ULONG bin_ulongvec
+#define CTL_UUID  bin_uuid
+#define CTL_DNADR bin_dn_node_address
+#define BUFSZ 256
+struct bin_table {
+        bin_convert_t           *convert;
+        int                     ctl_name;
+        const char              *procname;
+        const struct bin_table  *child;
+};
+static const struct bin_table bin_random_table[] = {
+        { CTL_INT,      RANDOM_POOLSIZE,        "poolsize" },
+        { CTL_INT,      RANDOM_ENTROPY_COUNT,   "entropy_avail" },
+        { CTL_INT,      RANDOM_READ_THRESH,     "read_wakeup_threshold" },
+        { CTL_INT,      RANDOM_WRITE_THRESH,    "write_wakeup_threshold" },
+        { CTL_UUID,     RANDOM_BOOT_ID,         "boot_id" },
+        { CTL_UUID,     RANDOM_UUID,            "uuid" },
+        {}
+};
+static const struct bin_table bin_pty_table[] = {
+        { CTL_INT,      PTY_MAX,        "max" },
+        { CTL_INT,      PTY_NR,         "nr" },
+        {}
+};
+static const struct bin_table bin_kern_table[] = {
+        { CTL_STR,      KERN_OSTYPE,                    "ostype" },
+        { CTL_STR,      KERN_OSRELEASE,                 "osrelease" },
+        /* KERN_OSREV not used */
+        { CTL_STR,      KERN_VERSION,                   "version" },
+        /* KERN_SECUREMASK not used */
+        /* KERN_PROF not used */
+        { CTL_STR,      KERN_NODENAME,                  "hostname" },
+        { CTL_STR,      KERN_DOMAINNAME,                "domainname" },
+        { CTL_INT,      KERN_PANIC,                     "panic" },
+        { CTL_INT,      KERN_REALROOTDEV,               "real-root-dev" },
+        { CTL_STR,      KERN_SPARC_REBOOT,              "reboot-cmd" },
+        { CTL_INT,      KERN_CTLALTDEL,                 "ctrl-alt-del" },
+        { CTL_INT,      KERN_PRINTK,                    "printk" },
+        /* KERN_NAMETRANS not used */
+        /* KERN_PPC_HTABRECLAIM not used */
+        /* KERN_PPC_ZEROPAGED not used */
+        { CTL_INT,      KERN_PPC_POWERSAVE_NAP,         "powersave-nap" },
+        { CTL_STR,      KERN_MODPROBE,                  "modprobe" },
+        { CTL_INT,      KERN_SG_BIG_BUFF,               "sg-big-buff" },
+        { CTL_INT,      KERN_ACCT,                      "acct" },
+        /* KERN_PPC_L2CR "l2cr" no longer used */
+        /* KERN_RTSIGNR not used */
+        /* KERN_RTSIGMAX not used */
+        { CTL_ULONG,    KERN_SHMMAX,                    "shmmax" },
+        { CTL_INT,      KERN_MSGMAX,                    "msgmax" },
+        { CTL_INT,      KERN_MSGMNB,                    "msgmnb" },
+        /* KERN_MSGPOOL not used */
+        { CTL_INT,      KERN_SYSRQ,                     "sysrq" },
+        { CTL_INT,      KERN_MAX_THREADS,               "threads-max" },
+        { CTL_DIR,      KERN_RANDOM,                    "random",       bin_random_table },
+        { CTL_ULONG,    KERN_SHMALL,                    "shmall" },
+        { CTL_INT,      KERN_MSGMNI,                    "msgmni" },
+        { CTL_INT,      KERN_SEM,                       "sem" },
+        { CTL_INT,      KERN_SPARC_STOP_A,              "stop-a" },
+        { CTL_INT,      KERN_SHMMNI,                    "shmmni" },
+        { CTL_INT,      KERN_OVERFLOWUID,               "overflowuid" },
+        { CTL_INT,      KERN_OVERFLOWGID,               "overflowgid" },
+        { CTL_STR,      KERN_HOTPLUG,                   "hotplug", },
+        { CTL_INT,      KERN_IEEE_EMULATION_WARNINGS,   "ieee_emulation_warnings" },
+        { CTL_INT,      KERN_S390_USER_DEBUG_LOGGING,   "userprocess_debug" },
+        { CTL_INT,      KERN_CORE_USES_PID,             "core_uses_pid" },
+        /* KERN_TAINTED "tainted" no longer used */
+        { CTL_INT,      KERN_CADPID,                    "cad_pid" },
+        { CTL_INT,      KERN_PIDMAX,                    "pid_max" },
+        { CTL_STR,      KERN_CORE_PATTERN,              "core_pattern" },
+        { CTL_INT,      KERN_PANIC_ON_OOPS,             "panic_on_oops" },
+        { CTL_INT,      KERN_HPPA_PWRSW,                "soft-power" },
+        { CTL_INT,      KERN_HPPA_UNALIGNED,            "unaligned-trap" },
+        { CTL_INT,      KERN_PRINTK_RATELIMIT,          "printk_ratelimit" },
+        { CTL_INT,      KERN_PRINTK_RATELIMIT_BURST,    "printk_ratelimit_burst" },
+        { CTL_DIR,      KERN_PTY,                       "pty",          bin_pty_table },
+        { CTL_INT,      KERN_NGROUPS_MAX,               "ngroups_max" },
+        { CTL_INT,      KERN_SPARC_SCONS_PWROFF,        "scons-poweroff" },
+        /* KERN_HZ_TIMER "hz_timer" no longer used */
+        { CTL_INT,      KERN_UNKNOWN_NMI_PANIC,         "unknown_nmi_panic" },
+        { CTL_INT,      KERN_BOOTLOADER_TYPE,           "bootloader_type" },
+        { CTL_INT,      KERN_RANDOMIZE,                 "randomize_va_space" },
+        { CTL_INT,      KERN_SPIN_RETRY,                "spin_retry" },
+        /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */
+        { CTL_INT,      KERN_IA64_UNALIGNED,            "ignore-unaligned-usertrap" },
+        { CTL_INT,      KERN_COMPAT_LOG,                "compat-log" },
+        { CTL_INT,      KERN_MAX_LOCK_DEPTH,            "max_lock_depth" },
+        { CTL_INT,      KERN_NMI_WATCHDOG,              "nmi_watchdog" },
+        { CTL_INT,      KERN_PANIC_ON_NMI,              "panic_on_unrecovered_nmi" },
+        {}
+};
+static const struct bin_table bin_vm_table[] = {
+        { CTL_INT,      VM_OVERCOMMIT_MEMORY,           "overcommit_memory" },
+        { CTL_INT,      VM_PAGE_CLUSTER,                "page-cluster" },
+        { CTL_INT,      VM_DIRTY_BACKGROUND,            "dirty_background_ratio" },
+        { CTL_INT,      VM_DIRTY_RATIO,                 "dirty_ratio" },
+        /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
+        /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
+        { CTL_INT,      VM_NR_PDFLUSH_THREADS,          "nr_pdflush_threads" },
+        { CTL_INT,      VM_OVERCOMMIT_RATIO,            "overcommit_ratio" },
+        /* VM_PAGEBUF unused */
+        /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
+        { CTL_INT,      VM_SWAPPINESS,                  "swappiness" },
+        { CTL_INT,      VM_LOWMEM_RESERVE_RATIO,        "lowmem_reserve_ratio" },
+        { CTL_INT,      VM_MIN_FREE_KBYTES,             "min_free_kbytes" },
+        { CTL_INT,      VM_MAX_MAP_COUNT,               "max_map_count" },
+        { CTL_INT,      VM_LAPTOP_MODE,                 "laptop_mode" },
+        { CTL_INT,      VM_BLOCK_DUMP,                  "block_dump" },
+        { CTL_INT,      VM_HUGETLB_GROUP,               "hugetlb_shm_group" },
+        { CTL_INT,      VM_VFS_CACHE_PRESSURE,  "vfs_cache_pressure" },
+        { CTL_INT,      VM_LEGACY_VA_LAYOUT,            "legacy_va_layout" },
+        /* VM_SWAP_TOKEN_TIMEOUT unused */
+        { CTL_INT,      VM_DROP_PAGECACHE,              "drop_caches" },
+        { CTL_INT,      VM_PERCPU_PAGELIST_FRACTION,    "percpu_pagelist_fraction" },
+        { CTL_INT,      VM_ZONE_RECLAIM_MODE,           "zone_reclaim_mode" },
+        { CTL_INT,      VM_MIN_UNMAPPED,                "min_unmapped_ratio" },
+        { CTL_INT,      VM_PANIC_ON_OOM,                "panic_on_oom" },
+        { CTL_INT,      VM_VDSO_ENABLED,                "vdso_enabled" },
+        { CTL_INT,      VM_MIN_SLAB,                    "min_slab_ratio" },
+        {}
+};
+static const struct bin_table bin_net_core_table[] = {
+        { CTL_INT,      NET_CORE_WMEM_MAX,      "wmem_max" },
+        { CTL_INT,      NET_CORE_RMEM_MAX,      "rmem_max" },
+        { CTL_INT,      NET_CORE_WMEM_DEFAULT,  "wmem_default" },
+        { CTL_INT,      NET_CORE_RMEM_DEFAULT,  "rmem_default" },
+        /* NET_CORE_DESTROY_DELAY unused */
+        { CTL_INT,      NET_CORE_MAX_BACKLOG,   "netdev_max_backlog" },
+        /* NET_CORE_FASTROUTE unused */
+        { CTL_INT,      NET_CORE_MSG_COST,      "message_cost" },
+        { CTL_INT,      NET_CORE_MSG_BURST,     "message_burst" },
+        { CTL_INT,      NET_CORE_OPTMEM_MAX,    "optmem_max" },
+        /* NET_CORE_HOT_LIST_LENGTH unused */
+        /* NET_CORE_DIVERT_VERSION unused */
+        /* NET_CORE_NO_CONG_THRESH unused */
+        /* NET_CORE_NO_CONG unused */
+        /* NET_CORE_LO_CONG unused */
+        /* NET_CORE_MOD_CONG unused */
+        { CTL_INT,      NET_CORE_DEV_WEIGHT,    "dev_weight" },
+        { CTL_INT,      NET_CORE_SOMAXCONN,     "somaxconn" },
+        { CTL_INT,      NET_CORE_BUDGET,        "netdev_budget" },
+        { CTL_INT,      NET_CORE_AEVENT_ETIME,  "xfrm_aevent_etime" },
+        { CTL_INT,      NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" },
+        { CTL_INT,      NET_CORE_WARNINGS,      "warnings" },
+        {},
+};
+static const struct bin_table bin_net_unix_table[] = {
+        /* NET_UNIX_DESTROY_DELAY unused */
+        /* NET_UNIX_DELETE_DELAY unused */
+        { CTL_INT,      NET_UNIX_MAX_DGRAM_QLEN,        "max_dgram_qlen" },
+        {}
+};
+static const struct bin_table bin_net_ipv4_route_table[] = { /* pairs NET_IPV4_ROUTE_* ids with name strings; referenced as "route" from bin_net_ipv4_table */
+        { CTL_INT,      NET_IPV4_ROUTE_FLUSH,                   "flush" },
+        /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */
+        /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */
+        { CTL_INT,      NET_IPV4_ROUTE_GC_THRESH,               "gc_thresh" },
+        { CTL_INT,      NET_IPV4_ROUTE_MAX_SIZE,                "max_size" },
+        { CTL_INT,      NET_IPV4_ROUTE_GC_MIN_INTERVAL,         "gc_min_interval" },
+        { CTL_INT,      NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,      "gc_min_interval_ms" },
+        { CTL_INT,      NET_IPV4_ROUTE_GC_TIMEOUT,              "gc_timeout" },
+        { CTL_INT,      NET_IPV4_ROUTE_GC_INTERVAL,             "gc_interval" },
+        { CTL_INT,      NET_IPV4_ROUTE_REDIRECT_LOAD,           "redirect_load" },
+        { CTL_INT,      NET_IPV4_ROUTE_REDIRECT_NUMBER,         "redirect_number" },
+        { CTL_INT,      NET_IPV4_ROUTE_REDIRECT_SILENCE,        "redirect_silence" },
+        { CTL_INT,      NET_IPV4_ROUTE_ERROR_COST,              "error_cost" },
+        { CTL_INT,      NET_IPV4_ROUTE_ERROR_BURST,             "error_burst" },
+        { CTL_INT,      NET_IPV4_ROUTE_GC_ELASTICITY,           "gc_elasticity" },
+        { CTL_INT,      NET_IPV4_ROUTE_MTU_EXPIRES,             "mtu_expires" },
+        { CTL_INT,      NET_IPV4_ROUTE_MIN_PMTU,                "min_pmtu" },
+        { CTL_INT,      NET_IPV4_ROUTE_MIN_ADVMSS,              "min_adv_mss" },
+        { CTL_INT,      NET_IPV4_ROUTE_SECRET_INTERVAL,         "secret_interval" },
+        {}
+};
+static const struct bin_table bin_net_ipv4_conf_vars_table[] = { /* pairs NET_IPV4_CONF_* ids with name strings; shared by every row of bin_net_ipv4_conf_table */
+        { CTL_INT,      NET_IPV4_CONF_FORWARDING,               "forwarding" },
+        { CTL_INT,      NET_IPV4_CONF_MC_FORWARDING,            "mc_forwarding" },
+        { CTL_INT,      NET_IPV4_CONF_ACCEPT_REDIRECTS,         "accept_redirects" },
+        { CTL_INT,      NET_IPV4_CONF_SECURE_REDIRECTS,         "secure_redirects" },
+        { CTL_INT,      NET_IPV4_CONF_SEND_REDIRECTS,           "send_redirects" },
+        { CTL_INT,      NET_IPV4_CONF_SHARED_MEDIA,             "shared_media" },
+        { CTL_INT,      NET_IPV4_CONF_RP_FILTER,                "rp_filter" },
+        { CTL_INT,      NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE,      "accept_source_route" },
+        { CTL_INT,      NET_IPV4_CONF_PROXY_ARP,                "proxy_arp" },
+        { CTL_INT,      NET_IPV4_CONF_MEDIUM_ID,                "medium_id" },
+        { CTL_INT,      NET_IPV4_CONF_BOOTP_RELAY,              "bootp_relay" },
+        { CTL_INT,      NET_IPV4_CONF_LOG_MARTIANS,             "log_martians" },
+        { CTL_INT,      NET_IPV4_CONF_TAG,                      "tag" },
+        { CTL_INT,      NET_IPV4_CONF_ARPFILTER,                "arp_filter" },
+        { CTL_INT,      NET_IPV4_CONF_ARP_ANNOUNCE,             "arp_announce" },
+        { CTL_INT,      NET_IPV4_CONF_ARP_IGNORE,               "arp_ignore" },
+        { CTL_INT,      NET_IPV4_CONF_ARP_ACCEPT,               "arp_accept" },
+        { CTL_INT,      NET_IPV4_CONF_ARP_NOTIFY,               "arp_notify" },
+        { CTL_INT,      NET_IPV4_CONF_NOXFRM,                   "disable_xfrm" },
+        { CTL_INT,      NET_IPV4_CONF_NOPOLICY,                 "disable_policy" },
+        { CTL_INT,      NET_IPV4_CONF_FORCE_IGMP_VERSION,       "force_igmp_version" },
+        { CTL_INT,      NET_IPV4_CONF_PROMOTE_SECONDARIES,      "promote_secondaries" },
+        {}
+};
+static const struct bin_table bin_net_ipv4_conf_table[] = { /* referenced as "conf" from bin_net_ipv4_table; every row points at bin_net_ipv4_conf_vars_table */
+        { CTL_DIR,      NET_PROTO_CONF_ALL,     "all",          bin_net_ipv4_conf_vars_table },
+        { CTL_DIR,      NET_PROTO_CONF_DEFAULT, "default",      bin_net_ipv4_conf_vars_table },
+        { CTL_DIR,      0, NULL, bin_net_ipv4_conf_vars_table }, /* NOTE(review): id 0 / NULL name - presumably a catch-all row; confirm against the lookup code */
+        {}
+};
+static const struct bin_table bin_net_neigh_vars_table[] = { /* pairs NET_NEIGH_* ids with name strings; shared by both rows of bin_net_neigh_table */
+        { CTL_INT,      NET_NEIGH_MCAST_SOLICIT,        "mcast_solicit" },
+        { CTL_INT,      NET_NEIGH_UCAST_SOLICIT,        "ucast_solicit" },
+        { CTL_INT,      NET_NEIGH_APP_SOLICIT,          "app_solicit" },
+        /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */
+        { CTL_INT,      NET_NEIGH_REACHABLE_TIME,       "base_reachable_time" },
+        { CTL_INT,      NET_NEIGH_DELAY_PROBE_TIME,     "delay_first_probe_time" },
+        { CTL_INT,      NET_NEIGH_GC_STALE_TIME,        "gc_stale_time" },
+        { CTL_INT,      NET_NEIGH_UNRES_QLEN,           "unres_qlen" },
+        { CTL_INT,      NET_NEIGH_PROXY_QLEN,           "proxy_qlen" },
+        /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */
+        /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */
+        /* NET_NEIGH_LOCKTIME "locktime" no longer used */
+        { CTL_INT,      NET_NEIGH_GC_INTERVAL,          "gc_interval" },
+        { CTL_INT,      NET_NEIGH_GC_THRESH1,           "gc_thresh1" },
+        { CTL_INT,      NET_NEIGH_GC_THRESH2,           "gc_thresh2" },
+        { CTL_INT,      NET_NEIGH_GC_THRESH3,           "gc_thresh3" },
+        { CTL_INT,      NET_NEIGH_RETRANS_TIME_MS,      "retrans_time_ms" },
+        { CTL_INT,      NET_NEIGH_REACHABLE_TIME_MS,    "base_reachable_time_ms" },
+        {}
+};
+static const struct bin_table bin_net_neigh_table[] = { /* referenced as "neigh" from both bin_net_ipv4_table and bin_net_ipv6_table */
+        { CTL_DIR,      NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table },
+        { CTL_DIR,      0, NULL, bin_net_neigh_vars_table }, /* NOTE(review): id 0 / NULL name - presumably a catch-all row; confirm against the lookup code */
+        {}
+};
+static const struct bin_table bin_net_ipv4_netfilter_table[] = { /* pairs NET_IPV4_NF_CONNTRACK_* ids with name strings; referenced as "netfilter" from bin_net_ipv4_table */
+        { CTL_INT,      NET_IPV4_NF_CONNTRACK_MAX,              "ip_conntrack_max" },
+        /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */
+        { CTL_INT,      NET_IPV4_NF_CONNTRACK_BUCKETS,          "ip_conntrack_buckets" },
+        { CTL_INT,      NET_IPV4_NF_CONNTRACK_LOG_INVALID,      "ip_conntrack_log_invalid" },
+        /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */
+        { CTL_INT,      NET_IPV4_NF_CONNTRACK_TCP_LOOSE,        "ip_conntrack_tcp_loose" },
+        { CTL_INT,      NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL,   "ip_conntrack_tcp_be_liberal" },
+        { CTL_INT,      NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS,  "ip_conntrack_tcp_max_retrans" },
+        /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */
+        /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
+        { CTL_INT,      NET_IPV4_NF_CONNTRACK_COUNT,            "ip_conntrack_count" },
+        { CTL_INT,      NET_IPV4_NF_CONNTRACK_CHECKSUM,         "ip_conntrack_checksum" },
+        {}
+};
+static const struct bin_table bin_net_ipv4_table[] = { /* referenced as "ipv4" from bin_net_table; scalar ids plus the conf/neigh/route/netfilter sub-tables */
+        { CTL_INT,      NET_IPV4_FORWARD,                       "ip_forward" },
+        { CTL_DIR,      NET_IPV4_CONF,          "conf",         bin_net_ipv4_conf_table },
+        { CTL_DIR,      NET_IPV4_NEIGH,         "neigh",        bin_net_neigh_table },
+        { CTL_DIR,      NET_IPV4_ROUTE,         "route",        bin_net_ipv4_route_table },
+        /* NET_IPV4_FIB_HASH unused */
+        { CTL_DIR,      NET_IPV4_NETFILTER,     "netfilter",    bin_net_ipv4_netfilter_table },
+        { CTL_INT,      NET_IPV4_TCP_TIMESTAMPS,                "tcp_timestamps" },
+        { CTL_INT,      NET_IPV4_TCP_WINDOW_SCALING,            "tcp_window_scaling" },
+        { CTL_INT,      NET_IPV4_TCP_SACK,                      "tcp_sack" },
+        { CTL_INT,      NET_IPV4_TCP_RETRANS_COLLAPSE,          "tcp_retrans_collapse" },
+        { CTL_INT,      NET_IPV4_DEFAULT_TTL,                   "ip_default_ttl" },
+        /* NET_IPV4_AUTOCONFIG unused */
+        { CTL_INT,      NET_IPV4_NO_PMTU_DISC,                  "ip_no_pmtu_disc" },
+        { CTL_INT,      NET_IPV4_NONLOCAL_BIND,                 "ip_nonlocal_bind" },
+        { CTL_INT,      NET_IPV4_TCP_SYN_RETRIES,               "tcp_syn_retries" },
+        { CTL_INT,      NET_TCP_SYNACK_RETRIES,                 "tcp_synack_retries" },
+        { CTL_INT,      NET_TCP_MAX_ORPHANS,                    "tcp_max_orphans" },
+        { CTL_INT,      NET_TCP_MAX_TW_BUCKETS,                 "tcp_max_tw_buckets" },
+        { CTL_INT,      NET_IPV4_DYNADDR,                       "ip_dynaddr" },
+        { CTL_INT,      NET_IPV4_TCP_KEEPALIVE_TIME,            "tcp_keepalive_time" },
+        { CTL_INT,      NET_IPV4_TCP_KEEPALIVE_PROBES,          "tcp_keepalive_probes" },
+        { CTL_INT,      NET_IPV4_TCP_KEEPALIVE_INTVL,           "tcp_keepalive_intvl" },
+        { CTL_INT,      NET_IPV4_TCP_RETRIES1,                  "tcp_retries1" },
+        { CTL_INT,      NET_IPV4_TCP_RETRIES2,                  "tcp_retries2" },
+        { CTL_INT,      NET_IPV4_TCP_FIN_TIMEOUT,               "tcp_fin_timeout" },
+        { CTL_INT,      NET_TCP_SYNCOOKIES,                     "tcp_syncookies" },
+        { CTL_INT,      NET_TCP_TW_RECYCLE,                     "tcp_tw_recycle" },
+        { CTL_INT,      NET_TCP_ABORT_ON_OVERFLOW,              "tcp_abort_on_overflow" },
+        { CTL_INT,      NET_TCP_STDURG,                         "tcp_stdurg" },
+        { CTL_INT,      NET_TCP_RFC1337,                        "tcp_rfc1337" },
+        { CTL_INT,      NET_TCP_MAX_SYN_BACKLOG,                "tcp_max_syn_backlog" },
+        { CTL_INT,      NET_IPV4_LOCAL_PORT_RANGE,              "ip_local_port_range" },
+        { CTL_INT,      NET_IPV4_IGMP_MAX_MEMBERSHIPS,          "igmp_max_memberships" },
+        { CTL_INT,      NET_IPV4_IGMP_MAX_MSF,                  "igmp_max_msf" },
+        { CTL_INT,      NET_IPV4_INET_PEER_THRESHOLD,           "inet_peer_threshold" },
+        { CTL_INT,      NET_IPV4_INET_PEER_MINTTL,              "inet_peer_minttl" },
+        { CTL_INT,      NET_IPV4_INET_PEER_MAXTTL,              "inet_peer_maxttl" },
+        { CTL_INT,      NET_IPV4_INET_PEER_GC_MINTIME,          "inet_peer_gc_mintime" },
+        { CTL_INT,      NET_IPV4_INET_PEER_GC_MAXTIME,          "inet_peer_gc_maxtime" },
+        { CTL_INT,      NET_TCP_ORPHAN_RETRIES,                 "tcp_orphan_retries" },
+        { CTL_INT,      NET_TCP_FACK,                           "tcp_fack" },
+        { CTL_INT,      NET_TCP_REORDERING,                     "tcp_reordering" },
+        { CTL_INT,      NET_TCP_ECN,                            "tcp_ecn" },
+        { CTL_INT,      NET_TCP_DSACK,                          "tcp_dsack" },
+        { CTL_INT,      NET_TCP_MEM,                            "tcp_mem" },
+        { CTL_INT,      NET_TCP_WMEM,                           "tcp_wmem" },
+        { CTL_INT,      NET_TCP_RMEM,                           "tcp_rmem" },
+        { CTL_INT,      NET_TCP_APP_WIN,                        "tcp_app_win" },
+        { CTL_INT,      NET_TCP_ADV_WIN_SCALE,                  "tcp_adv_win_scale" },
+        { CTL_INT,      NET_TCP_TW_REUSE,                       "tcp_tw_reuse" },
+        { CTL_INT,      NET_TCP_FRTO,                           "tcp_frto" },
+        { CTL_INT,      NET_TCP_FRTO_RESPONSE,                  "tcp_frto_response" },
+        { CTL_INT,      NET_TCP_LOW_LATENCY,                    "tcp_low_latency" },
+        { CTL_INT,      NET_TCP_NO_METRICS_SAVE,                "tcp_no_metrics_save" },
+        { CTL_INT,      NET_TCP_MODERATE_RCVBUF,                "tcp_moderate_rcvbuf" },
+        { CTL_INT,      NET_TCP_TSO_WIN_DIVISOR,                "tcp_tso_win_divisor" },
+        { CTL_STR,      NET_TCP_CONG_CONTROL,                   "tcp_congestion_control" },
+        { CTL_INT,      NET_TCP_ABC,                            "tcp_abc" },
+        { CTL_INT,      NET_TCP_MTU_PROBING,                    "tcp_mtu_probing" },
+        { CTL_INT,      NET_TCP_BASE_MSS,                       "tcp_base_mss" },
+        { CTL_INT,      NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
+        { CTL_INT,      NET_TCP_DMA_COPYBREAK,                  "tcp_dma_copybreak" },
+        { CTL_INT,      NET_TCP_SLOW_START_AFTER_IDLE,          "tcp_slow_start_after_idle" },
+        { CTL_INT,      NET_CIPSOV4_CACHE_ENABLE,               "cipso_cache_enable" },
+        { CTL_INT,      NET_CIPSOV4_CACHE_BUCKET_SIZE,          "cipso_cache_bucket_size" },
+        { CTL_INT,      NET_CIPSOV4_RBM_OPTFMT,                 "cipso_rbm_optfmt" },
+        { CTL_INT,      NET_CIPSOV4_RBM_STRICTVALID,            "cipso_rbm_strictvalid" },
+        /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */
+        { CTL_STR,      NET_TCP_ALLOWED_CONG_CONTROL,           "tcp_allowed_congestion_control" },
+        { CTL_INT,      NET_TCP_MAX_SSTHRESH,                   "tcp_max_ssthresh" },
+        { CTL_INT,      NET_IPV4_ICMP_ECHO_IGNORE_ALL,          "icmp_echo_ignore_all" },
+        { CTL_INT,      NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS,   "icmp_echo_ignore_broadcasts" },
+        { CTL_INT,      NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES,     "icmp_ignore_bogus_error_responses" },
+        { CTL_INT,      NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR,        "icmp_errors_use_inbound_ifaddr" },
+        { CTL_INT,      NET_IPV4_ICMP_RATELIMIT,                "icmp_ratelimit" },
+        { CTL_INT,      NET_IPV4_ICMP_RATEMASK,                 "icmp_ratemask" },
+        { CTL_INT,      NET_IPV4_IPFRAG_HIGH_THRESH,            "ipfrag_high_thresh" },
+        { CTL_INT,      NET_IPV4_IPFRAG_LOW_THRESH,             "ipfrag_low_thresh" },
+        { CTL_INT,      NET_IPV4_IPFRAG_TIME,                   "ipfrag_time" },
+        { CTL_INT,      NET_IPV4_IPFRAG_SECRET_INTERVAL,        "ipfrag_secret_interval" },
+        /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */
+        { CTL_INT,      2088 /* NET_IPQ_QMAX */,                "ip_queue_maxlen" },
+        /* NET_TCP_DEFAULT_WIN_SCALE unused */
+        /* NET_TCP_BIC_BETA unused */
+        /* NET_IPV4_TCP_MAX_KA_PROBES unused */
+        /* NET_IPV4_IP_MASQ_DEBUG unused */
+        /* NET_TCP_SYN_TAILDROP unused */
+        /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
+        /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
+        /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
+        /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
+        /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
+        /* NET_IPV4_ALWAYS_DEFRAG unused */
+        {}
+};
+static const struct bin_table bin_net_ipx_table[] = { /* pairs NET_IPX_* ids with name strings; referenced as "ipx" from bin_net_table */
+        { CTL_INT,      NET_IPX_PPROP_BROADCASTING,     "ipx_pprop_broadcasting" },
+        /* NET_IPX_FORWARDING unused */
+        {}
+};
+static const struct bin_table bin_net_atalk_table[] = { /* pairs NET_ATALK_* ids with name strings; referenced as "appletalk" from bin_net_table */
+        { CTL_INT,      NET_ATALK_AARP_EXPIRY_TIME,             "aarp-expiry-time" },
+        { CTL_INT,      NET_ATALK_AARP_TICK_TIME,               "aarp-tick-time" },
+        { CTL_INT,      NET_ATALK_AARP_RETRANSMIT_LIMIT,        "aarp-retransmit-limit" },
+        { CTL_INT,      NET_ATALK_AARP_RESOLVE_TIME,            "aarp-resolve-time" },
+        {}
+};
+static const struct bin_table bin_net_netrom_table[] = { /* pairs NET_NETROM_* ids with name strings */
+        { CTL_INT,      NET_NETROM_DEFAULT_PATH_QUALITY,                "default_path_quality" },
+        { CTL_INT,      NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER,      "obsolescence_count_initialiser" },
+        { CTL_INT,      NET_NETROM_NETWORK_TTL_INITIALISER,             "network_ttl_initialiser" },
+        { CTL_INT,      NET_NETROM_TRANSPORT_TIMEOUT,                   "transport_timeout" },
+        { CTL_INT,      NET_NETROM_TRANSPORT_MAXIMUM_TRIES,             "transport_maximum_tries" },
+        { CTL_INT,      NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY,         "transport_acknowledge_delay" },
+        { CTL_INT,      NET_NETROM_TRANSPORT_BUSY_DELAY,                "transport_busy_delay" },
+        { CTL_INT,      NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE,     "transport_requested_window_size" },
+        { CTL_INT,      NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT,       "transport_no_activity_timeout" },
+        { CTL_INT,      NET_NETROM_ROUTING_CONTROL,                     "routing_control" },
+        { CTL_INT,      NET_NETROM_LINK_FAILS_COUNT,                    "link_fails_count" },
+        { CTL_INT,      NET_NETROM_RESET,                               "reset" },
+        {}
+};
+static const struct bin_table bin_net_ax25_param_table[] = { /* pairs NET_AX25_* ids with name strings; referenced from the single row of bin_net_ax25_table */
+        { CTL_INT,      NET_AX25_IP_DEFAULT_MODE,       "ip_default_mode" },
+        { CTL_INT,      NET_AX25_DEFAULT_MODE,          "ax25_default_mode" },
+        { CTL_INT,      NET_AX25_BACKOFF_TYPE,          "backoff_type" },
+        { CTL_INT,      NET_AX25_CONNECT_MODE,          "connect_mode" },
+        { CTL_INT,      NET_AX25_STANDARD_WINDOW,       "standard_window_size" },
+        { CTL_INT,      NET_AX25_EXTENDED_WINDOW,       "extended_window_size" },
+        { CTL_INT,      NET_AX25_T1_TIMEOUT,            "t1_timeout" },
+        { CTL_INT,      NET_AX25_T2_TIMEOUT,            "t2_timeout" },
+        { CTL_INT,      NET_AX25_T3_TIMEOUT,            "t3_timeout" },
+        { CTL_INT,      NET_AX25_IDLE_TIMEOUT,          "idle_timeout" },
+        { CTL_INT,      NET_AX25_N2,                    "maximum_retry_count" },
+        { CTL_INT,      NET_AX25_PACLEN,                "maximum_packet_length" },
+        { CTL_INT,      NET_AX25_PROTOCOL,              "protocol" },
+        { CTL_INT,      NET_AX25_DAMA_SLAVE_TIMEOUT,    "dama_slave_timeout" },
+        {}
+};
+static const struct bin_table bin_net_ax25_table[] = { /* one CTL_DIR row with id 0 and no name, pointing at bin_net_ax25_param_table */
+        { CTL_DIR,      0, NULL, bin_net_ax25_param_table }, /* NOTE(review): id 0 / NULL name - presumably a catch-all row; confirm against the lookup code */
+        {}
+};
+static const struct bin_table bin_net_rose_table[] = { /* pairs NET_ROSE_* ids with name strings */
+        { CTL_INT,      NET_ROSE_RESTART_REQUEST_TIMEOUT,       "restart_request_timeout" },
+        { CTL_INT,      NET_ROSE_CALL_REQUEST_TIMEOUT,          "call_request_timeout" },
+        { CTL_INT,      NET_ROSE_RESET_REQUEST_TIMEOUT,         "reset_request_timeout" },
+        { CTL_INT,      NET_ROSE_CLEAR_REQUEST_TIMEOUT,         "clear_request_timeout" },
+        { CTL_INT,      NET_ROSE_ACK_HOLD_BACK_TIMEOUT,         "acknowledge_hold_back_timeout" },
+        { CTL_INT,      NET_ROSE_ROUTING_CONTROL,               "routing_control" },
+        { CTL_INT,      NET_ROSE_LINK_FAIL_TIMEOUT,             "link_fail_timeout" },
+        { CTL_INT,      NET_ROSE_MAX_VCS,                       "maximum_virtual_circuits" },
+        { CTL_INT,      NET_ROSE_WINDOW_SIZE,                   "window_size" },
+        { CTL_INT,      NET_ROSE_NO_ACTIVITY_TIMEOUT,           "no_activity_timeout" },
+        {}
+};
+static const struct bin_table bin_net_ipv6_conf_var_table[] = { /* pairs NET_IPV6_* ids with name strings; shared by every row of bin_net_ipv6_conf_table */
+        { CTL_INT,      NET_IPV6_FORWARDING,                    "forwarding" },
+        { CTL_INT,      NET_IPV6_HOP_LIMIT,                     "hop_limit" },
+        { CTL_INT,      NET_IPV6_MTU,                           "mtu" },
+        { CTL_INT,      NET_IPV6_ACCEPT_RA,                     "accept_ra" },
+        { CTL_INT,      NET_IPV6_ACCEPT_REDIRECTS,              "accept_redirects" },
+        { CTL_INT,      NET_IPV6_AUTOCONF,                      "autoconf" },
+        { CTL_INT,      NET_IPV6_DAD_TRANSMITS,                 "dad_transmits" },
+        { CTL_INT,      NET_IPV6_RTR_SOLICITS,                  "router_solicitations" },
+        { CTL_INT,      NET_IPV6_RTR_SOLICIT_INTERVAL,          "router_solicitation_interval" },
+        { CTL_INT,      NET_IPV6_RTR_SOLICIT_DELAY,             "router_solicitation_delay" },
+        { CTL_INT,      NET_IPV6_USE_TEMPADDR,                  "use_tempaddr" },
+        { CTL_INT,      NET_IPV6_TEMP_VALID_LFT,                "temp_valid_lft" },
+        { CTL_INT,      NET_IPV6_TEMP_PREFERED_LFT,             "temp_prefered_lft" },
+        { CTL_INT,      NET_IPV6_REGEN_MAX_RETRY,               "regen_max_retry" },
+        { CTL_INT,      NET_IPV6_MAX_DESYNC_FACTOR,             "max_desync_factor" },
+        { CTL_INT,      NET_IPV6_MAX_ADDRESSES,                 "max_addresses" },
+        { CTL_INT,      NET_IPV6_FORCE_MLD_VERSION,             "force_mld_version" },
+        { CTL_INT,      NET_IPV6_ACCEPT_RA_DEFRTR,              "accept_ra_defrtr" },
+        { CTL_INT,      NET_IPV6_ACCEPT_RA_PINFO,               "accept_ra_pinfo" },
+        { CTL_INT,      NET_IPV6_ACCEPT_RA_RTR_PREF,            "accept_ra_rtr_pref" },
+        { CTL_INT,      NET_IPV6_RTR_PROBE_INTERVAL,            "router_probe_interval" },
+        { CTL_INT,      NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN,    "accept_ra_rt_info_max_plen" },
+        { CTL_INT,      NET_IPV6_PROXY_NDP,                     "proxy_ndp" },
+        { CTL_INT,      NET_IPV6_ACCEPT_SOURCE_ROUTE,           "accept_source_route" },
+        {}
+};
+static const struct bin_table bin_net_ipv6_conf_table[] = { /* referenced as "conf" from bin_net_ipv6_table; every row points at bin_net_ipv6_conf_var_table */
+        { CTL_DIR,      NET_PROTO_CONF_ALL,             "all",  bin_net_ipv6_conf_var_table },
+        { CTL_DIR,      NET_PROTO_CONF_DEFAULT,         "default", bin_net_ipv6_conf_var_table },
+        { CTL_DIR,      0, NULL, bin_net_ipv6_conf_var_table }, /* NOTE(review): id 0 / NULL name - presumably a catch-all row; confirm against the lookup code */
+        {}
+};
+static const struct bin_table bin_net_ipv6_route_table[] = { /* pairs NET_IPV6_ROUTE_* ids with name strings; referenced as "route" from bin_net_ipv6_table */
+        /* NET_IPV6_ROUTE_FLUSH "flush"  no longer used */
+        { CTL_INT,      NET_IPV6_ROUTE_GC_THRESH,               "gc_thresh" },
+        { CTL_INT,      NET_IPV6_ROUTE_MAX_SIZE,                "max_size" },
+        { CTL_INT,      NET_IPV6_ROUTE_GC_MIN_INTERVAL,         "gc_min_interval" },
+        { CTL_INT,      NET_IPV6_ROUTE_GC_TIMEOUT,              "gc_timeout" },
+        { CTL_INT,      NET_IPV6_ROUTE_GC_INTERVAL,             "gc_interval" },
+        { CTL_INT,      NET_IPV6_ROUTE_GC_ELASTICITY,           "gc_elasticity" },
+        { CTL_INT,      NET_IPV6_ROUTE_MTU_EXPIRES,             "mtu_expires" },
+        { CTL_INT,      NET_IPV6_ROUTE_MIN_ADVMSS,              "min_adv_mss" },
+        { CTL_INT,      NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,      "gc_min_interval_ms" },
+        {}
+};
+static const struct bin_table bin_net_ipv6_icmp_table[] = { /* single NET_IPV6_ICMP_RATELIMIT row; referenced as "icmp" from bin_net_ipv6_table */
+        { CTL_INT,      NET_IPV6_ICMP_RATELIMIT,        "ratelimit" },
+        {}
+};
+static const struct bin_table bin_net_ipv6_table[] = { /* NET_IPV6_* rows: the conf/neigh/route/icmp sub-tables followed by scalar entries */
+        { CTL_DIR,      NET_IPV6_CONF,          "conf",         bin_net_ipv6_conf_table },
+        { CTL_DIR,      NET_IPV6_NEIGH,         "neigh",        bin_net_neigh_table },
+        { CTL_DIR,      NET_IPV6_ROUTE,         "route",        bin_net_ipv6_route_table },
+        { CTL_DIR,      NET_IPV6_ICMP,          "icmp",         bin_net_ipv6_icmp_table },
+        { CTL_INT,      NET_IPV6_BINDV6ONLY,            "bindv6only" },
+        { CTL_INT,      NET_IPV6_IP6FRAG_HIGH_THRESH,   "ip6frag_high_thresh" },
+        { CTL_INT,      NET_IPV6_IP6FRAG_LOW_THRESH,    "ip6frag_low_thresh" },
+        { CTL_INT,      NET_IPV6_IP6FRAG_TIME,          "ip6frag_time" },
+        { CTL_INT,      NET_IPV6_IP6FRAG_SECRET_INTERVAL,       "ip6frag_secret_interval" },
+        { CTL_INT,      NET_IPV6_MLD_MAX_MSF,           "mld_max_msf" },
+        { CTL_INT,      2088 /* IPQ_QMAX */,            "ip6_queue_maxlen" },
+        {}
+};
+static const struct bin_table bin_net_x25_table[] = { /* pairs NET_X25_* ids with name strings */
+        { CTL_INT,      NET_X25_RESTART_REQUEST_TIMEOUT,        "restart_request_timeout" },
+        { CTL_INT,      NET_X25_CALL_REQUEST_TIMEOUT,           "call_request_timeout" },
+        { CTL_INT,      NET_X25_RESET_REQUEST_TIMEOUT,  "reset_request_timeout" },
+        { CTL_INT,      NET_X25_CLEAR_REQUEST_TIMEOUT,  "clear_request_timeout" },
+        { CTL_INT,      NET_X25_ACK_HOLD_BACK_TIMEOUT,  "acknowledgement_hold_back_timeout" },
+        { CTL_INT,      NET_X25_FORWARD,                        "x25_forward" },
+        {}
+};
+static const struct bin_table bin_net_tr_table[] = { /* single row pairing NET_TR_RIF_TIMEOUT with "rif_timeout" */
+        { CTL_INT,      NET_TR_RIF_TIMEOUT,     "rif_timeout" },
+        {}
+};
+static const struct bin_table bin_net_decnet_conf_vars[] = { /* pairs NET_DECNET_CONF_DEV_* ids with name strings; shared by every row of bin_net_decnet_conf */
+        { CTL_INT,      NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
+        { CTL_INT,      NET_DECNET_CONF_DEV_PRIORITY,   "priority" },
+        { CTL_INT,      NET_DECNET_CONF_DEV_T2,         "t2" },
+        { CTL_INT,      NET_DECNET_CONF_DEV_T3,         "t3" },
+        {}
+};
+static const struct bin_table bin_net_decnet_conf[] = { /* referenced as "conf" from bin_net_decnet_table; six named rows plus one id-0 row, all pointing at bin_net_decnet_conf_vars */
+        { CTL_DIR, NET_DECNET_CONF_ETHER,    "ethernet", bin_net_decnet_conf_vars },
+        { CTL_DIR, NET_DECNET_CONF_GRE,      "ipgre",    bin_net_decnet_conf_vars },
+        { CTL_DIR, NET_DECNET_CONF_X25,      "x25",      bin_net_decnet_conf_vars },
+        { CTL_DIR, NET_DECNET_CONF_PPP,      "ppp",      bin_net_decnet_conf_vars },
+        { CTL_DIR, NET_DECNET_CONF_DDCMP,    "ddcmp",    bin_net_decnet_conf_vars },
+        { CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars },
+        { CTL_DIR, 0,                        NULL,       bin_net_decnet_conf_vars }, /* NOTE(review): id 0 / NULL name - presumably a catch-all row; confirm against the lookup code */
+        {}
+};
+static const struct bin_table bin_net_decnet_table[] = { /* NET_DECNET_* rows; mixes CTL_DIR, CTL_DNADR, CTL_STR and CTL_INT entries */
+        { CTL_DIR,      NET_DECNET_CONF,                "conf", bin_net_decnet_conf },
+        { CTL_DNADR,    NET_DECNET_NODE_ADDRESS,        "node_address" },
+        { CTL_STR,      NET_DECNET_NODE_NAME,           "node_name" },
+        { CTL_STR,      NET_DECNET_DEFAULT_DEVICE,      "default_device" },
+        { CTL_INT,      NET_DECNET_TIME_WAIT,           "time_wait" },
+        { CTL_INT,      NET_DECNET_DN_COUNT,            "dn_count" },
+        { CTL_INT,      NET_DECNET_DI_COUNT,            "di_count" },
+        { CTL_INT,      NET_DECNET_DR_COUNT,            "dr_count" },
+        { CTL_INT,      NET_DECNET_DST_GC_INTERVAL,     "dst_gc_interval" },
+        { CTL_INT,      NET_DECNET_NO_FC_MAX_CWND,      "no_fc_max_cwnd" },
+        { CTL_INT,      NET_DECNET_MEM,         "decnet_mem" },
+        { CTL_INT,      NET_DECNET_RMEM,                "decnet_rmem" },
+        { CTL_INT,      NET_DECNET_WMEM,                "decnet_wmem" },
+        { CTL_INT,      NET_DECNET_DEBUG_LEVEL, "debug" },
+        {}
+};
+static const struct bin_table bin_net_sctp_table[] = { /* pairs NET_SCTP_* ids with name strings */
+        { CTL_INT,      NET_SCTP_RTO_INITIAL,           "rto_initial" },
+        { CTL_INT,      NET_SCTP_RTO_MIN,               "rto_min" },
+        { CTL_INT,      NET_SCTP_RTO_MAX,               "rto_max" },
+        { CTL_INT,      NET_SCTP_RTO_ALPHA,             "rto_alpha_exp_divisor" },
+        { CTL_INT,      NET_SCTP_RTO_BETA,              "rto_beta_exp_divisor" },
+        { CTL_INT,      NET_SCTP_VALID_COOKIE_LIFE,     "valid_cookie_life" },
+        { CTL_INT,      NET_SCTP_ASSOCIATION_MAX_RETRANS,       "association_max_retrans" },
+        { CTL_INT,      NET_SCTP_PATH_MAX_RETRANS,      "path_max_retrans" },
+        { CTL_INT,      NET_SCTP_MAX_INIT_RETRANSMITS,  "max_init_retransmits" },
+        { CTL_INT,      NET_SCTP_HB_INTERVAL,           "hb_interval" },
+        { CTL_INT,      NET_SCTP_PRESERVE_ENABLE,       "cookie_preserve_enable" },
+        { CTL_INT,      NET_SCTP_MAX_BURST,             "max_burst" },
+        { CTL_INT,      NET_SCTP_ADDIP_ENABLE,          "addip_enable" },
+        { CTL_INT,      NET_SCTP_PRSCTP_ENABLE,         "prsctp_enable" },
+        { CTL_INT,      NET_SCTP_SNDBUF_POLICY,         "sndbuf_policy" },
+        { CTL_INT,      NET_SCTP_SACK_TIMEOUT,          "sack_timeout" },
+        { CTL_INT,      NET_SCTP_RCVBUF_POLICY,         "rcvbuf_policy" },
+        {}
+};
+static const struct bin_table bin_net_llc_llc2_timeout_table[] = { /* pairs NET_LLC2_*_TIMEOUT ids with name strings; referenced as "timeout" from bin_net_llc_llc2_table */
+        { CTL_INT,      NET_LLC2_ACK_TIMEOUT,   "ack" },
+        { CTL_INT,      NET_LLC2_P_TIMEOUT,     "p" },
+        { CTL_INT,      NET_LLC2_REJ_TIMEOUT,   "rej" },
+        { CTL_INT,      NET_LLC2_BUSY_TIMEOUT,  "busy" },
+        {}
+};
+static const struct bin_table bin_net_llc_station_table[] = { /* single NET_LLC_STATION_ACK_TIMEOUT row; referenced as "station" from bin_net_llc_table */
+        { CTL_INT,      NET_LLC_STATION_ACK_TIMEOUT,    "ack_timeout" },
+        {}
+};
+static const struct bin_table bin_net_llc_llc2_table[] = { /* referenced as "llc2" from bin_net_llc_table; holds the single "timeout" sub-table row */
+        { CTL_DIR,      NET_LLC2,               "timeout",      bin_net_llc_llc2_timeout_table }, /* NOTE(review): same NET_LLC2 id that bin_net_llc_table uses for "llc2" - confirm a dedicated "timeout" id is not intended */
+        {}
+};
+static const struct bin_table bin_net_llc_table[] = { /* two CTL_DIR rows: "llc2" and "station", each pointing at its own sub-table */
+        { CTL_DIR,      NET_LLC2,               "llc2",         bin_net_llc_llc2_table },
+        { CTL_DIR,      NET_LLC_STATION,        "station",      bin_net_llc_station_table },
+        {}
+};
+static const struct bin_table bin_net_netfilter_table[] = { /* pairs NET_NF_CONNTRACK_* ids with name strings */
+        { CTL_INT,      NET_NF_CONNTRACK_MAX,                   "nf_conntrack_max" },
+        /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */
+        /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */
+        /* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */
+        /* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */
+        /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */
+        /* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */
+        /* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */
+        /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */
+        /* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */
+        /* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */
+        /* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */
+        /* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */
+        { CTL_INT,      NET_NF_CONNTRACK_BUCKETS,               "nf_conntrack_buckets" },
+        { CTL_INT,      NET_NF_CONNTRACK_LOG_INVALID,           "nf_conntrack_log_invalid" },
+        /* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */
+        { CTL_INT,      NET_NF_CONNTRACK_TCP_LOOSE,             "nf_conntrack_tcp_loose" },
+        { CTL_INT,      NET_NF_CONNTRACK_TCP_BE_LIBERAL,        "nf_conntrack_tcp_be_liberal" },
+        { CTL_INT,      NET_NF_CONNTRACK_TCP_MAX_RETRANS,       "nf_conntrack_tcp_max_retrans" },
+        /* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */
+        /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */
+        /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */
+        /* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */
+        /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */
+        /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */
+        /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
+        { CTL_INT,      NET_NF_CONNTRACK_COUNT,                 "nf_conntrack_count" },
+        /* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */
+        /* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */
+        { CTL_INT,      NET_NF_CONNTRACK_FRAG6_LOW_THRESH,      "nf_conntrack_frag6_low_thresh" },
+        { CTL_INT,      NET_NF_CONNTRACK_FRAG6_HIGH_THRESH,     "nf_conntrack_frag6_high_thresh" },
+        { CTL_INT,      NET_NF_CONNTRACK_CHECKSUM,              "nf_conntrack_checksum" },
+        {}
+};
+static const struct bin_table bin_net_irda_table[] = { /* names under sys/net/irda/ */
+        { CTL_INT,      NET_IRDA_DISCOVERY,             "discovery" },
+        { CTL_STR,      NET_IRDA_DEVNAME,               "devname" },
+        { CTL_INT,      NET_IRDA_DEBUG,                 "debug" },
+        { CTL_INT,      NET_IRDA_FAST_POLL,             "fast_poll_increase" },
+        { CTL_INT,      NET_IRDA_DISCOVERY_SLOTS,       "discovery_slots" },
+        { CTL_INT,      NET_IRDA_DISCOVERY_TIMEOUT,     "discovery_timeout" },
+        { CTL_INT,      NET_IRDA_SLOT_TIMEOUT,          "slot_timeout" },
+        { CTL_INT,      NET_IRDA_MAX_BAUD_RATE,         "max_baud_rate" },
+        { CTL_INT,      NET_IRDA_MIN_TX_TURN_TIME,      "min_tx_turn_time" },
+        { CTL_INT,      NET_IRDA_MAX_TX_DATA_SIZE,      "max_tx_data_size" },
+        { CTL_INT,      NET_IRDA_MAX_TX_WINDOW,         "max_tx_window" },
+        { CTL_INT,      NET_IRDA_MAX_NOREPLY_TIME,      "max_noreply_time" },
+        { CTL_INT,      NET_IRDA_WARN_NOREPLY_TIME,     "warn_noreply_time" },
+        { CTL_INT,      NET_IRDA_LAP_KEEPALIVE_TIME,    "lap_keepalive_time" },
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_net_table[] = { /* names under sys/net/ */
+        { CTL_DIR,      NET_CORE,               "core",         bin_net_core_table },
+        /* NET_ETHER not used */
+        /* NET_802 not used */
+        { CTL_DIR,      NET_UNIX,               "unix",         bin_net_unix_table },
+        { CTL_DIR,      NET_IPV4,               "ipv4",         bin_net_ipv4_table },
+        { CTL_DIR,      NET_IPX,                "ipx",          bin_net_ipx_table },
+        { CTL_DIR,      NET_ATALK,              "appletalk",    bin_net_atalk_table },
+        { CTL_DIR,      NET_NETROM,             "netrom",       bin_net_netrom_table },
+        { CTL_DIR,      NET_AX25,               "ax25",         bin_net_ax25_table },
+        /*  NET_BRIDGE "bridge" no longer used */
+        { CTL_DIR,      NET_ROSE,               "rose",         bin_net_rose_table },
+        { CTL_DIR,      NET_IPV6,               "ipv6",         bin_net_ipv6_table },
+        { CTL_DIR,      NET_X25,                "x25",          bin_net_x25_table },
+        { CTL_DIR,      NET_TR,                 "token-ring",   bin_net_tr_table },
+        { CTL_DIR,      NET_DECNET,             "decnet",       bin_net_decnet_table },
+        /*  NET_ECONET not used */
+        { CTL_DIR,      NET_SCTP,               "sctp",         bin_net_sctp_table },
+        { CTL_DIR,      NET_LLC,                "llc",          bin_net_llc_table },
+        { CTL_DIR,      NET_NETFILTER,          "netfilter",    bin_net_netfilter_table },
+        /* NET_DCCP "dccp" no longer used */
+        { CTL_DIR,      NET_IRDA,               "irda",         bin_net_irda_table },
+        { CTL_INT,      2089,                   "nf_conntrack_max" }, /* literal id, no named constant */
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_fs_quota_table[] = { /* names under sys/fs/quota/ */
+        { CTL_INT,      FS_DQ_LOOKUPS,          "lookups" },
+        { CTL_INT,      FS_DQ_DROPS,            "drops" },
+        { CTL_INT,      FS_DQ_READS,            "reads" },
+        { CTL_INT,      FS_DQ_WRITES,           "writes" },
+        { CTL_INT,      FS_DQ_CACHE_HITS,       "cache_hits" },
+        { CTL_INT,      FS_DQ_ALLOCATED,        "allocated_dquots" },
+        { CTL_INT,      FS_DQ_FREE,             "free_dquots" },
+        { CTL_INT,      FS_DQ_SYNCS,            "syncs" },
+        { CTL_INT,      FS_DQ_WARNINGS,         "warnings" },
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_fs_xfs_table[] = { /* names under sys/fs/xfs/ */
+        { CTL_INT,      XFS_SGID_INHERIT,       "irix_sgid_inherit" },
+        { CTL_INT,      XFS_SYMLINK_MODE,       "irix_symlink_mode" },
+        { CTL_INT,      XFS_PANIC_MASK,         "panic_mask" },
+        { CTL_INT,      XFS_ERRLEVEL,           "error_level" },
+        { CTL_INT,      XFS_SYNCD_TIMER,        "xfssyncd_centisecs" },
+        { CTL_INT,      XFS_INHERIT_SYNC,       "inherit_sync" },
+        { CTL_INT,      XFS_INHERIT_NODUMP,     "inherit_nodump" },
+        { CTL_INT,      XFS_INHERIT_NOATIME,    "inherit_noatime" },
+        { CTL_INT,      XFS_BUF_TIMER,          "xfsbufd_centisecs" },
+        { CTL_INT,      XFS_BUF_AGE,            "age_buffer_centisecs" },
+        { CTL_INT,      XFS_INHERIT_NOSYM,      "inherit_nosymlinks" },
+        { CTL_INT,      XFS_ROTORSTEP,  "rotorstep" },
+        { CTL_INT,      XFS_INHERIT_NODFRG,     "inherit_nodefrag" },
+        { CTL_INT,      XFS_FILESTREAM_TIMER,   "filestream_centisecs" },
+        { CTL_INT,      XFS_STATS_CLEAR,        "stats_clear" },
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_fs_ocfs2_nm_table[] = { /* names under sys/fs/ocfs2/nm/ */
+        { CTL_STR,      1, "hb_ctl_path" },
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_fs_ocfs2_table[] = { /* names under sys/fs/ocfs2/ */
+        { CTL_DIR,      1,      "nm",   bin_fs_ocfs2_nm_table },
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_inotify_table[] = { /* names under sys/fs/inotify/ */
+        { CTL_INT,      INOTIFY_MAX_USER_INSTANCES,     "max_user_instances" },
+        { CTL_INT,      INOTIFY_MAX_USER_WATCHES,       "max_user_watches" },
+        { CTL_INT,      INOTIFY_MAX_QUEUED_EVENTS,      "max_queued_events" },
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_fs_table[] = { /* names under sys/fs/ */
+        { CTL_INT,      FS_NRINODE,             "inode-nr" },
+        { CTL_INT,      FS_STATINODE,           "inode-state" },
+        /* FS_MAXINODE unused */
+        /* FS_NRDQUOT unused */
+        /* FS_MAXDQUOT unused */
+        /* FS_NRFILE "file-nr" no longer used */
+        { CTL_INT,      FS_MAXFILE,             "file-max" },
+        { CTL_INT,      FS_DENTRY,              "dentry-state" },
+        /* FS_NRSUPER unused */
+        /* FS_MAXUPSER unused */
+        { CTL_INT,      FS_OVERFLOWUID,         "overflowuid" },
+        { CTL_INT,      FS_OVERFLOWGID,         "overflowgid" },
+        { CTL_INT,      FS_LEASES,              "leases-enable" },
+        { CTL_INT,      FS_DIR_NOTIFY,          "dir-notify-enable" },
+        { CTL_INT,      FS_LEASE_TIME,          "lease-break-time" },
+        { CTL_DIR,      FS_DQSTATS,             "quota",        bin_fs_quota_table },
+        { CTL_DIR,      FS_XFS,                 "xfs",          bin_fs_xfs_table },
+        { CTL_ULONG,    FS_AIO_NR,              "aio-nr" },
+        { CTL_ULONG,    FS_AIO_MAX_NR,          "aio-max-nr" },
+        { CTL_DIR,      FS_INOTIFY,             "inotify",      bin_inotify_table },
+        { CTL_DIR,      FS_OCFS2,               "ocfs2",        bin_fs_ocfs2_table },
+        { CTL_INT,      KERN_SETUID_DUMPABLE,   "suid_dumpable" }, /* KERN_* id listed in the fs table */
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_ipmi_table[] = { /* names under sys/dev/ipmi/ */
+        { CTL_INT,      DEV_IPMI_POWEROFF_POWERCYCLE,   "poweroff_powercycle" },
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_mac_hid_files[] = { /* names under sys/dev/mac_hid/ */
+        /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
+        /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
+        { CTL_INT,      DEV_MAC_HID_MOUSE_BUTTON_EMULATION,     "mouse_button_emulation" },
+        { CTL_INT,      DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE,      "mouse_button2_keycode" },
+        { CTL_INT,      DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE,      "mouse_button3_keycode" },
+        /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_raid_table[] = { /* names under sys/dev/raid/ */
+        { CTL_INT,      DEV_RAID_SPEED_LIMIT_MIN,       "speed_limit_min" },
+        { CTL_INT,      DEV_RAID_SPEED_LIMIT_MAX,       "speed_limit_max" },
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_scsi_table[] = { /* names under sys/dev/scsi/ */
+        { CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" },
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_dev_table[] = { /* names under sys/dev/ */
+        /* DEV_CDROM    "cdrom" no longer used */
+        /* DEV_HWMON unused */
+        /* DEV_PARPORT  "parport" no longer used */
+        { CTL_DIR,      DEV_RAID,       "raid",         bin_raid_table },
+        { CTL_DIR,      DEV_MAC_HID,    "mac_hid",      bin_mac_hid_files },
+        { CTL_DIR,      DEV_SCSI,       "scsi",         bin_scsi_table },
+        { CTL_DIR,      DEV_IPMI,       "ipmi",         bin_ipmi_table },
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_bus_isa_table[] = { /* names under sys/bus/isa/ */
+        { CTL_INT,      BUS_ISA_MEM_BASE,       "membase" },
+        { CTL_INT,      BUS_ISA_PORT_BASE,      "portbase" },
+        { CTL_INT,      BUS_ISA_PORT_SHIFT,     "portshift" },
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_bus_table[] = { /* names under sys/bus/ */
+        { CTL_DIR,      CTL_BUS_ISA,    "isa",  bin_bus_isa_table },
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_s390dbf_table[] = { /* names under sys/s390dbf/ */
+        { CTL_INT,      5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
+        { CTL_INT,      5679 /* CTL_S390DBF_ACTIVE */,    "debug_active" },
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_sunrpc_table[] = { /* names under sys/sunrpc/ */
+        /* CTL_RPCDEBUG "rpc_debug"  no longer used */
+        /* CTL_NFSDEBUG "nfs_debug"  no longer used */
+        /* CTL_NFSDDEBUG "nfsd_debug" no longer used  */
+        /* CTL_NLMDEBUG "nlm_debug" no longer used */
+        { CTL_INT,      CTL_SLOTTABLE_UDP,      "udp_slot_table_entries" },
+        { CTL_INT,      CTL_SLOTTABLE_TCP,      "tcp_slot_table_entries" },
+        { CTL_INT,      CTL_MIN_RESVPORT,       "min_resvport" },
+        { CTL_INT,      CTL_MAX_RESVPORT,       "max_resvport" },
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_pm_table[] = { /* names under sys/pm/ */
+        /* frv specific */
+        /* 1 == CTL_PM_SUSPEND  "suspend"  no longer used */
+        { CTL_INT,      2 /* CTL_PM_CMODE */,           "cmode" },
+        { CTL_INT,      3 /* CTL_PM_P0 */,              "p0" },
+        { CTL_INT,      4 /* CTL_PM_CM */,              "cm" },
+        {} /* all-zero terminator */
+};
+static const struct bin_table bin_root_table[] = { /* first name component; get_sysctl() starts here */
+        { CTL_DIR,      CTL_KERN,       "kernel",       bin_kern_table },
+        { CTL_DIR,      CTL_VM,         "vm",           bin_vm_table },
+        { CTL_DIR,      CTL_NET,        "net",          bin_net_table },
+        /* CTL_PROC not used */
+        { CTL_DIR,      CTL_FS,         "fs",           bin_fs_table },
+        /* CTL_DEBUG "debug" no longer used */
+        { CTL_DIR,      CTL_DEV,        "dev",          bin_dev_table },
+        { CTL_DIR,      CTL_BUS,        "bus",          bin_bus_table },
+        { CTL_DIR,      CTL_ABI,        "abi" }, /* no child table listed */
+        /* CTL_CPU not used */
+        /* CTL_ARLAN "arlan" no longer used */
+        { CTL_DIR,      CTL_S390DBF,    "s390dbf",      bin_s390dbf_table },
+        { CTL_DIR,      CTL_SUNRPC,     "sunrpc",       bin_sunrpc_table },
+        { CTL_DIR,      CTL_PM,         "pm",           bin_pm_table },
+        {} /* all-zero terminator */
+};
+static ssize_t bin_dir(struct file *file,
+        void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+        return -ENOTDIR; /* unconditional: the file is neither read nor written */
+}
+static ssize_t bin_string(struct file *file,
+        void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+        ssize_t result, copied = 0; /* copied: byte count returned on success */
+        if (oldval && oldlen) {
+                char __user *lastp;
+                loff_t pos = 0;
+                int ch = 0; /* stays 0 (not '\n') when nothing was read */
+                result = vfs_read(file, oldval, oldlen, &pos);
+                if (result < 0)
+                        goto out;
+                copied = result;
+                lastp = oldval + copied - 1;
+                result = -EFAULT;
+                /* An empty read has no last byte: do not touch oldval[-1] */
+                if (copied && get_user(ch, lastp))
+                        goto out;
+                /* Trim off the trailing newline */
+                if (ch == '\n') {
+                        if (put_user('\0', lastp))
+                                goto out;
+                        copied -= 1;
+                }
+        }
+        if (newval && newlen) {
+                loff_t pos = 0;
+                result = vfs_write(file, newval, newlen, &pos);
+                if (result < 0)
+                        goto out;
+        }
+        result = copied;
+out:
+        return result;
+}
+static ssize_t bin_intvec(struct file *file,
+        void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+        mm_segment_t old_fs = get_fs();
+        ssize_t copied = 0; /* bytes stored into oldval; returned on success */
+        char *buffer; /* BUFSZ-byte kernel staging area for the text form */
+        ssize_t result;
+        result = -ENOMEM;
+        buffer = kmalloc(BUFSZ, GFP_KERNEL);
+        if (!buffer)
+                goto out;
+        if (oldval && oldlen) {
+                unsigned __user *vec = oldval;
+                size_t length = oldlen / sizeof(*vec);
+                loff_t pos = 0;
+                char *str, *end;
+                int i;
+                set_fs(KERNEL_DS);
+                result = vfs_read(file, buffer, BUFSZ - 1, &pos);
+                set_fs(old_fs);
+                if (result < 0)
+                        goto out_kfree;
+                str = buffer;
+                end = str + result;
+                *end++ = '\0';
+                for (i = 0; i < length; i++) {
+                        unsigned long value;
+                        value = simple_strtoul(str, &str, 10);
+                        while (isspace(*str))
+                                str++;
+                        /* Hand the parsed number to the caller's vector */
+                        result = -EFAULT;
+                        if (put_user(value, vec + i))
+                                goto out_kfree;
+                        copied += sizeof(*vec);
+                        if (!isdigit(*str))
+                                break;
+                }
+        }
+        if (newval && newlen) {
+                unsigned __user *vec = newval;
+                size_t length = newlen / sizeof(*vec);
+                loff_t pos = 0;
+                char *str = buffer, *end = buffer + BUFSZ;
+                int i;
+                for (i = 0; i < length; i++) {
+                        unsigned long value;
+                        result = -EFAULT;
+                        if (get_user(value, vec + i))
+                                goto out_kfree;
+                        /* scnprintf() returns the bytes actually stored, so str
+                         * can never run past end however long the vector is. */
+                        str += scnprintf(str, end - str, "%lu\t", value);
+                }
+                set_fs(KERNEL_DS);
+                result = vfs_write(file, buffer, str - buffer, &pos);
+                set_fs(old_fs);
+                if (result < 0)
+                        goto out_kfree;
+        }
+        result = copied;
+out_kfree:
+        kfree(buffer);
+out:
+        return result;
+}
+static ssize_t bin_ulongvec(struct file *file,
+        void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+        mm_segment_t old_fs = get_fs();
+        ssize_t copied = 0; /* bytes stored into oldval; returned on success */
+        char *buffer; /* BUFSZ-byte kernel staging area for the text form */
+        ssize_t result;
+        result = -ENOMEM;
+        buffer = kmalloc(BUFSZ, GFP_KERNEL);
+        if (!buffer)
+                goto out;
+        if (oldval && oldlen) {
+                unsigned long __user *vec = oldval;
+                size_t length = oldlen / sizeof(*vec);
+                loff_t pos = 0;
+                char *str, *end;
+                int i;
+                set_fs(KERNEL_DS);
+                result = vfs_read(file, buffer, BUFSZ - 1, &pos);
+                set_fs(old_fs);
+                if (result < 0)
+                        goto out_kfree;
+                str = buffer;
+                end = str + result;
+                *end++ = '\0';
+                for (i = 0; i < length; i++) {
+                        unsigned long value;
+                        value = simple_strtoul(str, &str, 10);
+                        while (isspace(*str))
+                                str++;
+                        /* Hand the parsed number to the caller's vector */
+                        result = -EFAULT;
+                        if (put_user(value, vec + i))
+                                goto out_kfree;
+                        copied += sizeof(*vec);
+                        if (!isdigit(*str))
+                                break;
+                }
+        }
+        if (newval && newlen) {
+                unsigned long __user *vec = newval;
+                size_t length = newlen / sizeof(*vec);
+                loff_t pos = 0;
+                char *str = buffer, *end = buffer + BUFSZ;
+                int i;
+                for (i = 0; i < length; i++) {
+                        unsigned long value;
+                        result = -EFAULT;
+                        if (get_user(value, vec + i))
+                                goto out_kfree;
+                        /* scnprintf() returns the bytes actually stored, so str
+                         * can never run past end however long the vector is. */
+                        str += scnprintf(str, end - str, "%lu\t", value);
+                }
+                set_fs(KERNEL_DS);
+                result = vfs_write(file, buffer, str - buffer, &pos);
+                set_fs(old_fs);
+                if (result < 0)
+                        goto out_kfree;
+        }
+        result = copied;
+out_kfree:
+        kfree(buffer);
+out:
+        return result;
+}
+static unsigned hex_value(int ch) /* value of one hex digit; input is not validated here */
+{
+        return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10; /* | 0x20 folds 'A'-'F' onto 'a'-'f' */
+}
+static ssize_t bin_uuid(struct file *file,
+        void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+        mm_segment_t saved_fs = get_fs();
+        unsigned char raw[16];
+        char text[40], *cur = text;
+        loff_t off = 0;
+        ssize_t nread;
+        int idx;
+        /* Only reads are supported: newval and newlen are never looked at */
+        if (!oldval || !oldlen)
+                return 0;
+        /* Fetch the textual uuid into a kernel buffer, keeping room for a NUL */
+        set_fs(KERNEL_DS);
+        nread = vfs_read(file, text, sizeof(text) - 1, &off);
+        set_fs(saved_fs);
+        if (nread < 0)
+                return nread;
+        text[nread] = '\0';
+        /*
+         * Convert the string to binary: two hex digits per byte, with an
+         * optional '-' after any pair.  Anything else is reported as -EIO.
+         */
+        for (idx = 0; idx < 16; idx++) {
+                if (!isxdigit(cur[0]) || !isxdigit(cur[1]))
+                        return -EIO;
+                raw[idx] = (hex_value(cur[0]) << 4) | hex_value(cur[1]);
+                cur += 2;
+                if (*cur == '-')
+                        cur++;
+        }
+        /* Never copy out more than the 16 converted bytes */
+        if (oldlen > sizeof(raw))
+                oldlen = sizeof(raw);
+        if (copy_to_user(oldval, raw, oldlen))
+                return -EFAULT;
+        return oldlen;
+}
+static ssize_t bin_dn_node_address(struct file *file,
+        void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+        mm_segment_t old_fs = get_fs();
+        ssize_t result, copied = 0; /* copied: bytes stored into oldval */
+        if (oldval && oldlen) {
+                loff_t pos = 0;
+                char buf[15], *nodep;
+                unsigned long area, node;
+                __le16 dnaddr;
+                set_fs(KERNEL_DS);
+                result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
+                set_fs(old_fs);
+                if (result < 0)
+                        goto out;
+                buf[result] = '\0';
+                /* Convert the decnet address ("area.node") to binary */
+                result = -EIO;
+                nodep = strchr(buf, '.');
+                if (!nodep)
+                        goto out;
+                nodep++; /* step over the '.' only once it is known to exist */
+                area = simple_strtoul(buf, NULL, 10);
+                node = simple_strtoul(nodep, NULL, 10);
+                if ((area > 63)||(node > 1023))
+                        goto out;
+                dnaddr = cpu_to_le16((area << 10) | node);
+                result = -EFAULT;
+                if (put_user(dnaddr, (__le16 __user *)oldval))
+                        goto out;
+                copied = sizeof(dnaddr);
+        }
+        if (newval && newlen) {
+                loff_t pos = 0;
+                __le16 dnaddr;
+                char buf[15];
+                int len;
+                result = -EINVAL;
+                if (newlen != sizeof(dnaddr))
+                        goto out;
+                result = -EFAULT;
+                if (get_user(dnaddr, (__le16 __user *)newval))
+                        goto out;
+                len = snprintf(buf, sizeof(buf), "%hu.%hu",
+                                le16_to_cpu(dnaddr) >> 10,
+                                le16_to_cpu(dnaddr) & 0x3ff);
+                set_fs(KERNEL_DS);
+                result = vfs_write(file, buf, len, &pos);
+                set_fs(old_fs);
+                if (result < 0)
+                        goto out;
+        }
+        result = copied;
+out:
+        return result;
+}
+static const struct bin_table *get_sysctl(const int *name, int nlen, char *path)
+{
+        const struct bin_table *table = &bin_root_table[0];
+        int ctl_name;
+        /* The binary sysctl tables have a small maximum depth so
+         * there is no danger of overflowing our path as it is PATH_MAX
+         * bytes long.
+         */
+        memcpy(path, "sys/", 4);
+        path += 4;
+repeat: /* one pass per name component; table is the level being searched */
+        if (!nlen)
+                return ERR_PTR(-ENOTDIR); /* no component left to match at this level */
+        ctl_name = *name;
+        name++;
+        nlen--;
+        for ( ; table->convert; table++) { /* ends at the all-zero terminator */
+                int len = 0; /* stays 0 unless this entry matches */
+                /*
+                 * For a wild card entry map from ifindex to network
+                 * device name.
+                 */
+                if (!table->ctl_name) {
+#ifdef CONFIG_NET
+                        struct net *net = current->nsproxy->net_ns;
+                        struct net_device *dev;
+                        dev = dev_get_by_index(net, ctl_name);
+                        if (dev) {
+                                len = strlen(dev->name);
+                                memcpy(path, dev->name, len);
+                                dev_put(dev);
+                        }
+#endif
+                /* Use the well known sysctl number to proc name mapping */
+                } else if (ctl_name == table->ctl_name) {
+                        len = strlen(table->procname);
+                        memcpy(path, table->procname, len);
+                }
+                if (len) {
+                        path += len;
+                        if (table->child) {
+                                *path++ = '/';
+                                table = table->child;
+                                goto repeat; /* descend and match the next component */
+                        }
+                        *path = '\0';
+                        return table; /* entry without a child table: path is complete */
+                }
+        }
+        return ERR_PTR(-ENOTDIR); /* nothing at this level matched ctl_name */
+}
+static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep)
+{
+        const struct bin_table *found;
+        char *pathbuf = __getname();
+        /* Without a path buffer there is nothing to translate into */
+        if (!pathbuf)
+                return ERR_PTR(-ENOMEM);
+        /* Build the "sys/..." path and report the matching entry (or error) */
+        found = get_sysctl(name, nlen, pathbuf);
+        *tablep = found;
+        if (!IS_ERR(found))
+                return pathbuf;
+        /* Translation failed: release the buffer and pass the same error on */
+        __putname(pathbuf);
+        return ERR_CAST(found);
+}
+static ssize_t binary_sysctl(const int *name, int nlen,
+        void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+        const struct bin_table *table = NULL;
+        struct nameidata nd;
+        struct vfsmount *mnt;
+        struct file *file;
+        ssize_t result;
+        char *pathname; /* "sys/..." path built by sysctl_getname() */
+        int flags;
+        int acc_mode;
+        pathname = sysctl_getname(name, nlen, &table);
+        result = PTR_ERR(pathname);
+        if (IS_ERR(pathname))
+                goto out;
+        /* How should the sysctl be accessed? */
+        if (oldval && oldlen && newval && newlen) {
+                flags = O_RDWR;
+                acc_mode = MAY_READ | MAY_WRITE;
+        } else if (newval && newlen) {
+                flags = O_WRONLY;
+                acc_mode = MAY_WRITE;
+        } else if (oldval && oldlen) {
+                flags = O_RDONLY;
+                acc_mode = MAY_READ;
+        } else {
+                result = 0; /* neither buffer supplied: nothing to do */
+                goto out_putname;
+        }
+        mnt = current->nsproxy->pid_ns->proc_mnt; /* look the path up in this pid namespace's proc mount */
+        result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd);
+        if (result)
+                goto out_putname;
+        result = may_open(&nd.path, acc_mode, flags);
+        if (result)
+                goto out_putpath;
+        file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
+        result = PTR_ERR(file);
+        if (IS_ERR(file))
+                goto out_putname; /* NOTE(review): assumes dentry_open() drops the path on failure - confirm */
+        result = table->convert(file, oldval, oldlen, newval, newlen);
+        fput(file);
+out_putname:
+        __putname(pathname); /* pairs with __getname() in sysctl_getname() */
+out:
+        return result;
+out_putpath:
+        path_put(&nd.path);
+        goto out_putname;
+}
+#else /* CONFIG_SYSCTL_SYSCALL */
+static ssize_t binary_sysctl(const int *name, int nlen,
+        void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+        return -ENOSYS; /* stub used when CONFIG_SYSCTL_SYSCALL is not set */
+}
+#endif /* CONFIG_SYSCTL_SYSCALL */
+static void deprecated_sysctl_warning(const int *name, int nlen)
+{
+        int i;
+        /*
+         * CTL_KERN/KERN_VERSION is used by older glibc and cannot
+         * ever go away.  Only name[0..nlen-1] was filled in by the caller.
+         */
+        if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION)
+                return;
+        if (printk_ratelimit()) {
+                printk(KERN_INFO
+                        "warning: process `%s' used the deprecated sysctl "
+                        "system call with ", current->comm);
+                for (i = 0; i < nlen; i++)
+                        printk("%d.", name[i]);
+                printk("\n");
+        }
+        return;
+}
+#define WARN_ONCE_HASH_BITS 8
+#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS)
+static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE);
+#define FNV32_OFFSET 2166136261U
+#define FNV32_PRIME 0x01000193
+/*
+ * Print each legacy sysctl (approximately) only once.
+ * To avoid making the tables non-const, use an external
+ * hash table instead.
+ * Worst case hash collision: 6, but very rarely.
+ * NOTE! We don't use the SMP-safe bit tests. We simply
+ * don't care enough.
+ */
+static void warn_on_bintable(const int *name, int nlen)
+{
+        u32 fnv = FNV32_OFFSET;
+        int idx;
+        /* Fold every name component into one 32-bit hash */
+        for (idx = 0; idx != nlen; idx++)
+                fnv = (fnv ^ name[idx]) * FNV32_PRIME;
+        /* First hit on a bucket warns; later hits on it stay silent */
+        if (!__test_and_set_bit(fnv % WARN_ONCE_HASH_SIZE, warn_once_bitmap))
+                deprecated_sysctl_warning(name, nlen);
+}
+static ssize_t do_sysctl(int __user *args_name, int nlen,
+        void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
+{
+        int name[CTL_MAXNAME], pos = 0;
+        if (!(nlen >= 0 && nlen <= CTL_MAXNAME)) /* must fit the on-stack copy */
+                return -ENOTDIR;
+        /* Copy every name component into the on-stack array before use */
+        while (pos < nlen) {
+                if (get_user(name[pos], &args_name[pos]))
+                        return -EFAULT;
+                pos++;
+        }
+        warn_on_bintable(name, nlen);
+        return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
+}
+SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
+{
+        struct __sysctl_args req;
+        size_t oldlen = 0;
+        ssize_t rc;
+        if (copy_from_user(&req, args, sizeof(req)))
+                return -EFAULT;
+        /* An old-value buffer needs a length word to go with it */
+        if (req.oldval && !req.oldlenp)
+                return -EFAULT;
+        if (req.oldlenp && get_user(oldlen, req.oldlenp))
+                return -EFAULT;
+        rc = do_sysctl(req.name, req.nlen, req.oldval, oldlen,
+                       req.newval, req.newlen);
+        /* On success rc is the length to report; the call itself returns 0 */
+        if (rc >= 0)
+                oldlen = rc;
+        if (req.oldlenp && put_user(oldlen, req.oldlenp))
+                return -EFAULT;
+        return rc < 0 ? rc : 0;
+}
+#ifdef CONFIG_COMPAT
+#include <asm/compat.h>
+struct compat_sysctl_args { /* argument block copied in by compat_sys_sysctl() */
+        compat_uptr_t   name;           /* converted with compat_ptr(), passed as the name vector */
+        int             nlen;           /* number of ints to read from name */
+        compat_uptr_t   oldval;         /* converted with compat_ptr() */
+        compat_uptr_t   oldlenp;        /* points at a compat_size_t that is read and written back */
+        compat_uptr_t   newval;         /* converted with compat_ptr() */
+        compat_size_t   newlen;
+        compat_ulong_t  __unused[4];    /* not referenced by compat_sys_sysctl() */
+};
+asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args)
+{
+        struct compat_sysctl_args req;
+        compat_size_t __user *lenp;
+        size_t oldlen = 0;
+        ssize_t rc;
+        if (copy_from_user(&req, args, sizeof(req)))
+                return -EFAULT;
+        /* An old-value buffer needs a length word to go with it */
+        if (req.oldval && !req.oldlenp)
+                return -EFAULT;
+        lenp = compat_ptr(req.oldlenp);
+        if (lenp && get_user(oldlen, lenp))
+                return -EFAULT;
+        /* Convert the compat user pointers and run the common do_sysctl() */
+        rc = do_sysctl(compat_ptr(req.name), req.nlen, compat_ptr(req.oldval),
+                       oldlen, compat_ptr(req.newval), req.newlen);
+        /* On success rc is the length to report; the call itself returns 0 */
+        if (rc >= 0)
+                oldlen = rc;
+        if (lenp && put_user(oldlen, lenp))
+                return -EFAULT;
+        return rc < 0 ? rc : 0;
+}
+#endif /* CONFIG_COMPAT */
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b6e7aaea4604..04cdcf72c827 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -5,1239 +5,6 @@
 #include <linux/string.h>
 #include <net/ip_vs.h>
-struct trans_ctl_table {
-        int                     ctl_name;
-        const char              *procname;
-        const struct trans_ctl_table *child;
-};
-static const struct trans_ctl_table trans_random_table[] = {
-        { RANDOM_POOLSIZE,      "poolsize" },
-        { RANDOM_ENTROPY_COUNT, "entropy_avail" },
-        { RANDOM_READ_THRESH,   "read_wakeup_threshold" },
-        { RANDOM_WRITE_THRESH,  "write_wakeup_threshold" },
-        { RANDOM_BOOT_ID,       "boot_id" },
-        { RANDOM_UUID,          "uuid" },
-        {}
-};
-static const struct trans_ctl_table trans_pty_table[] = {
-        { PTY_MAX,              "max" },
-        { PTY_NR,               "nr" },
-        {}
-};
-static const struct trans_ctl_table trans_kern_table[] = {
-        { KERN_OSTYPE,                  "ostype" },
-        { KERN_OSRELEASE,               "osrelease" },
-        /* KERN_OSREV not used */
-        { KERN_VERSION,                 "version" },
-        /* KERN_SECUREMASK not used */
-        /* KERN_PROF not used */
-        { KERN_NODENAME,                "hostname" },
-        { KERN_DOMAINNAME,              "domainname" },
-        { KERN_PANIC,                   "panic" },
-        { KERN_REALROOTDEV,             "real-root-dev" },
-        { KERN_SPARC_REBOOT,            "reboot-cmd" },
-        { KERN_CTLALTDEL,               "ctrl-alt-del" },
-        { KERN_PRINTK,                  "printk" },
-        /* KERN_NAMETRANS not used */
-        /* KERN_PPC_HTABRECLAIM not used */
-        /* KERN_PPC_ZEROPAGED not used */
-        { KERN_PPC_POWERSAVE_NAP,       "powersave-nap" },
-        { KERN_MODPROBE,                "modprobe" },
-        { KERN_SG_BIG_BUFF,             "sg-big-buff" },
-        { KERN_ACCT,                    "acct" },
-        { KERN_PPC_L2CR,                "l2cr" },
-        /* KERN_RTSIGNR not used */
-        /* KERN_RTSIGMAX not used */
-        { KERN_SHMMAX,                  "shmmax" },
-        { KERN_MSGMAX,                  "msgmax" },
-        { KERN_MSGMNB,                  "msgmnb" },
-        /* KERN_MSGPOOL not used*/
-        { KERN_SYSRQ,                   "sysrq" },
-        { KERN_MAX_THREADS,             "threads-max" },
-        { KERN_RANDOM,                  "random",       trans_random_table },
-        { KERN_SHMALL,                  "shmall" },
-        { KERN_MSGMNI,                  "msgmni" },
-        { KERN_SEM,                     "sem" },
-        { KERN_SPARC_STOP_A,            "stop-a" },
-        { KERN_SHMMNI,                  "shmmni" },
-        { KERN_OVERFLOWUID,             "overflowuid" },
-        { KERN_OVERFLOWGID,             "overflowgid" },
-        { KERN_HOTPLUG,                 "hotplug", },
-        { KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" },
-        { KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" },
-        { KERN_CORE_USES_PID,           "core_uses_pid" },
-        { KERN_TAINTED,                 "tainted" },
-        { KERN_CADPID,                  "cad_pid" },
-        { KERN_PIDMAX,                  "pid_max" },
-        { KERN_CORE_PATTERN,            "core_pattern" },
-        { KERN_PANIC_ON_OOPS,           "panic_on_oops" },
-        { KERN_HPPA_PWRSW,              "soft-power" },
-        { KERN_HPPA_UNALIGNED,          "unaligned-trap" },
-        { KERN_PRINTK_RATELIMIT,        "printk_ratelimit" },
-        { KERN_PRINTK_RATELIMIT_BURST,  "printk_ratelimit_burst" },
-        { KERN_PTY,                     "pty",          trans_pty_table },
-        { KERN_NGROUPS_MAX,             "ngroups_max" },
-        { KERN_SPARC_SCONS_PWROFF,      "scons-poweroff" },
-        { KERN_HZ_TIMER,                "hz_timer" },
-        { KERN_UNKNOWN_NMI_PANIC,       "unknown_nmi_panic" },
-        { KERN_BOOTLOADER_TYPE,         "bootloader_type" },
-        { KERN_RANDOMIZE,               "randomize_va_space" },
-        { KERN_SPIN_RETRY,              "spin_retry" },
-        { KERN_ACPI_VIDEO_FLAGS,        "acpi_video_flags" },
-        { KERN_IA64_UNALIGNED,          "ignore-unaligned-usertrap" },
-        { KERN_COMPAT_LOG,              "compat-log" },
-        { KERN_MAX_LOCK_DEPTH,          "max_lock_depth" },
-        { KERN_NMI_WATCHDOG,            "nmi_watchdog" },
-        { KERN_PANIC_ON_NMI,            "panic_on_unrecovered_nmi" },
-        {}
-};
-static const struct trans_ctl_table trans_vm_table[] = {
-        { VM_OVERCOMMIT_MEMORY,         "overcommit_memory" },
-        { VM_PAGE_CLUSTER,              "page-cluster" },
-        { VM_DIRTY_BACKGROUND,          "dirty_background_ratio" },
-        { VM_DIRTY_RATIO,               "dirty_ratio" },
-        { VM_DIRTY_WB_CS,               "dirty_writeback_centisecs" },
-        { VM_DIRTY_EXPIRE_CS,           "dirty_expire_centisecs" },
-        { VM_NR_PDFLUSH_THREADS,        "nr_pdflush_threads" },
-        { VM_OVERCOMMIT_RATIO,          "overcommit_ratio" },
-        /* VM_PAGEBUF unused */
-        { VM_HUGETLB_PAGES,             "nr_hugepages" },
-        { VM_SWAPPINESS,                "swappiness" },
-        { VM_LOWMEM_RESERVE_RATIO,      "lowmem_reserve_ratio" },
-        { VM_MIN_FREE_KBYTES,           "min_free_kbytes" },
-        { VM_MAX_MAP_COUNT,             "max_map_count" },
-        { VM_LAPTOP_MODE,               "laptop_mode" },
-        { VM_BLOCK_DUMP,                "block_dump" },
-        { VM_HUGETLB_GROUP,             "hugetlb_shm_group" },
-        { VM_VFS_CACHE_PRESSURE,        "vfs_cache_pressure" },
-        { VM_LEGACY_VA_LAYOUT,          "legacy_va_layout" },
-        /* VM_SWAP_TOKEN_TIMEOUT unused */
-        { VM_DROP_PAGECACHE,            "drop_caches" },
-        { VM_PERCPU_PAGELIST_FRACTION,  "percpu_pagelist_fraction" },
-        { VM_ZONE_RECLAIM_MODE,         "zone_reclaim_mode" },
-        { VM_MIN_UNMAPPED,              "min_unmapped_ratio" },
-        { VM_PANIC_ON_OOM,              "panic_on_oom" },
-        { VM_VDSO_ENABLED,              "vdso_enabled" },
-        { VM_MIN_SLAB,                  "min_slab_ratio" },
-        {}
-};
-static const struct trans_ctl_table trans_net_core_table[] = {
-        { NET_CORE_WMEM_MAX,            "wmem_max" },
-        { NET_CORE_RMEM_MAX,            "rmem_max" },
-        { NET_CORE_WMEM_DEFAULT,        "wmem_default" },
-        { NET_CORE_RMEM_DEFAULT,        "rmem_default" },
-        /* NET_CORE_DESTROY_DELAY unused */
-        { NET_CORE_MAX_BACKLOG,         "netdev_max_backlog" },
-        /* NET_CORE_FASTROUTE unused */
-        { NET_CORE_MSG_COST,            "message_cost" },
-        { NET_CORE_MSG_BURST,           "message_burst" },
-        { NET_CORE_OPTMEM_MAX,          "optmem_max" },
-        /* NET_CORE_HOT_LIST_LENGTH unused */
-        /* NET_CORE_DIVERT_VERSION unused */
-        /* NET_CORE_NO_CONG_THRESH unused */
-        /* NET_CORE_NO_CONG unused */
-        /* NET_CORE_LO_CONG unused */
-        /* NET_CORE_MOD_CONG unused */
-        { NET_CORE_DEV_WEIGHT,          "dev_weight" },
-        { NET_CORE_SOMAXCONN,           "somaxconn" },
-        { NET_CORE_BUDGET,              "netdev_budget" },
-        { NET_CORE_AEVENT_ETIME,        "xfrm_aevent_etime" },
-        { NET_CORE_AEVENT_RSEQTH,       "xfrm_aevent_rseqth" },
-        { NET_CORE_WARNINGS,            "warnings" },
-        {},
-};
-static const struct trans_ctl_table trans_net_unix_table[] = {
-        /* NET_UNIX_DESTROY_DELAY unused */
-        /* NET_UNIX_DELETE_DELAY unused */
-        { NET_UNIX_MAX_DGRAM_QLEN,      "max_dgram_qlen" },
-        {}
-};
-static const struct trans_ctl_table trans_net_ipv4_route_table[] = {
-        { NET_IPV4_ROUTE_FLUSH,                 "flush" },
-        { NET_IPV4_ROUTE_MIN_DELAY,             "min_delay" },
-        { NET_IPV4_ROUTE_MAX_DELAY,             "max_delay" },
-        { NET_IPV4_ROUTE_GC_THRESH,             "gc_thresh" },
-        { NET_IPV4_ROUTE_MAX_SIZE,              "max_size" },
-        { NET_IPV4_ROUTE_GC_MIN_INTERVAL,       "gc_min_interval" },
-        { NET_IPV4_ROUTE_GC_TIMEOUT,            "gc_timeout" },
-        { NET_IPV4_ROUTE_GC_INTERVAL,           "gc_interval" },
-        { NET_IPV4_ROUTE_REDIRECT_LOAD,         "redirect_load" },
-        { NET_IPV4_ROUTE_REDIRECT_NUMBER,       "redirect_number" },
-        { NET_IPV4_ROUTE_REDIRECT_SILENCE,      "redirect_silence" },
-        { NET_IPV4_ROUTE_ERROR_COST,            "error_cost" },
-        { NET_IPV4_ROUTE_ERROR_BURST,           "error_burst" },
-        { NET_IPV4_ROUTE_GC_ELASTICITY,         "gc_elasticity" },
-        { NET_IPV4_ROUTE_MTU_EXPIRES,           "mtu_expires" },
-        { NET_IPV4_ROUTE_MIN_PMTU,              "min_pmtu" },
-        { NET_IPV4_ROUTE_MIN_ADVMSS,            "min_adv_mss" },
-        { NET_IPV4_ROUTE_SECRET_INTERVAL,       "secret_interval" },
-        { NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,    "gc_min_interval_ms" },
-        {}
-};
-static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
-        { NET_IPV4_CONF_FORWARDING,             "forwarding" },
-        { NET_IPV4_CONF_MC_FORWARDING,          "mc_forwarding" },
-        { NET_IPV4_CONF_PROXY_ARP,              "proxy_arp" },
-        { NET_IPV4_CONF_ACCEPT_REDIRECTS,       "accept_redirects" },
-        { NET_IPV4_CONF_SECURE_REDIRECTS,       "secure_redirects" },
-        { NET_IPV4_CONF_SEND_REDIRECTS,         "send_redirects" },
-        { NET_IPV4_CONF_SHARED_MEDIA,           "shared_media" },
-        { NET_IPV4_CONF_RP_FILTER,              "rp_filter" },
-        { NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE,    "accept_source_route" },
-        { NET_IPV4_CONF_BOOTP_RELAY,            "bootp_relay" },
-        { NET_IPV4_CONF_LOG_MARTIANS,           "log_martians" },
-        { NET_IPV4_CONF_TAG,                    "tag" },
-        { NET_IPV4_CONF_ARPFILTER,              "arp_filter" },
-        { NET_IPV4_CONF_MEDIUM_ID,              "medium_id" },
-        { NET_IPV4_CONF_NOXFRM,                 "disable_xfrm" },
-        { NET_IPV4_CONF_NOPOLICY,               "disable_policy" },
-        { NET_IPV4_CONF_FORCE_IGMP_VERSION,     "force_igmp_version" },
-        { NET_IPV4_CONF_ARP_ANNOUNCE,           "arp_announce" },
-        { NET_IPV4_CONF_ARP_IGNORE,             "arp_ignore" },
-        { NET_IPV4_CONF_PROMOTE_SECONDARIES,    "promote_secondaries" },
-        { NET_IPV4_CONF_ARP_ACCEPT,             "arp_accept" },
-        { NET_IPV4_CONF_ARP_NOTIFY,             "arp_notify" },
-        {}
-};
-static const struct trans_ctl_table trans_net_ipv4_conf_table[] = {
-        { NET_PROTO_CONF_ALL,           "all",          trans_net_ipv4_conf_vars_table },
-        { NET_PROTO_CONF_DEFAULT,       "default",      trans_net_ipv4_conf_vars_table },
-        { 0, NULL, trans_net_ipv4_conf_vars_table },
-        {}
-};
-static const struct trans_ctl_table trans_net_neigh_vars_table[] = {
-        { NET_NEIGH_MCAST_SOLICIT,      "mcast_solicit" },
-        { NET_NEIGH_UCAST_SOLICIT,      "ucast_solicit" },
-        { NET_NEIGH_APP_SOLICIT,        "app_solicit" },
-        { NET_NEIGH_RETRANS_TIME,       "retrans_time" },
-        { NET_NEIGH_REACHABLE_TIME,     "base_reachable_time" },
-        { NET_NEIGH_DELAY_PROBE_TIME,   "delay_first_probe_time" },
-        { NET_NEIGH_GC_STALE_TIME,      "gc_stale_time" },
-        { NET_NEIGH_UNRES_QLEN,         "unres_qlen" },
-        { NET_NEIGH_PROXY_QLEN,         "proxy_qlen" },
-        { NET_NEIGH_ANYCAST_DELAY,      "anycast_delay" },
-        { NET_NEIGH_PROXY_DELAY,        "proxy_delay" },
-        { NET_NEIGH_LOCKTIME,           "locktime" },
-        { NET_NEIGH_GC_INTERVAL,        "gc_interval" },
-        { NET_NEIGH_GC_THRESH1,         "gc_thresh1" },
-        { NET_NEIGH_GC_THRESH2,         "gc_thresh2" },
-        { NET_NEIGH_GC_THRESH3,         "gc_thresh3" },
-        { NET_NEIGH_RETRANS_TIME_MS,    "retrans_time_ms" },
-        { NET_NEIGH_REACHABLE_TIME_MS,  "base_reachable_time_ms" },
-        {}
-};
-static const struct trans_ctl_table trans_net_neigh_table[] = {
-        { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table },
-        { 0, NULL, trans_net_neigh_vars_table },
-        {}
-};
-static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
-        { NET_IPV4_NF_CONNTRACK_MAX,                            "ip_conntrack_max" },
-        { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,           "ip_conntrack_tcp_timeout_syn_sent" },
-        { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV,           "ip_conntrack_tcp_timeout_syn_recv" },
-        { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED,        "ip_conntrack_tcp_timeout_established" },
-        { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT,           "ip_conntrack_tcp_timeout_fin_wait" },
-        { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT,         "ip_conntrack_tcp_timeout_close_wait" },
-        { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK,           "ip_conntrack_tcp_timeout_last_ack" },
-        { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT,          "ip_conntrack_tcp_timeout_time_wait" },
-        { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE,              "ip_conntrack_tcp_timeout_close" },
-        { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT,                    "ip_conntrack_udp_timeout" },
-        { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM,             "ip_conntrack_udp_timeout_stream" },
-        { NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT,                   "ip_conntrack_icmp_timeout" },
-        { NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT,                "ip_conntrack_generic_timeout" },
-        { NET_IPV4_NF_CONNTRACK_BUCKETS,                        "ip_conntrack_buckets" },
-        { NET_IPV4_NF_CONNTRACK_LOG_INVALID,                    "ip_conntrack_log_invalid" },
-        { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS,        "ip_conntrack_tcp_timeout_max_retrans" },
-        { NET_IPV4_NF_CONNTRACK_TCP_LOOSE,                      "ip_conntrack_tcp_loose" },
-        { NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL,                 "ip_conntrack_tcp_be_liberal" },
-        { NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS,                "ip_conntrack_tcp_max_retrans" },
-        { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED,            "ip_conntrack_sctp_timeout_closed" },
-        { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT,       "ip_conntrack_sctp_timeout_cookie_wait" },
-        { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED,     "ip_conntrack_sctp_timeout_cookie_echoed" },
-        { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED,       "ip_conntrack_sctp_timeout_established" },
-        { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT,     "ip_conntrack_sctp_timeout_shutdown_sent" },
-        { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD,     "ip_conntrack_sctp_timeout_shutdown_recd" },
-        { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "ip_conntrack_sctp_timeout_shutdown_ack_sent" },
-        { NET_IPV4_NF_CONNTRACK_COUNT,          "ip_conntrack_count" },
-        { NET_IPV4_NF_CONNTRACK_CHECKSUM,       "ip_conntrack_checksum" },
-        {}
-};
-static const struct trans_ctl_table trans_net_ipv4_table[] = {
-        { NET_IPV4_FORWARD,                     "ip_forward" },
-        { NET_IPV4_DYNADDR,                     "ip_dynaddr" },
-        { NET_IPV4_CONF,                "conf",         trans_net_ipv4_conf_table },
-        { NET_IPV4_NEIGH,               "neigh",        trans_net_neigh_table },
-        { NET_IPV4_ROUTE,               "route",        trans_net_ipv4_route_table },
-        /* NET_IPV4_FIB_HASH unused */
-        { NET_IPV4_NETFILTER,           "netfilter",    trans_net_ipv4_netfilter_table },
-        { NET_IPV4_TCP_TIMESTAMPS,              "tcp_timestamps" },
-        { NET_IPV4_TCP_WINDOW_SCALING,          "tcp_window_scaling" },
-        { NET_IPV4_TCP_SACK,                    "tcp_sack" },
-        { NET_IPV4_TCP_RETRANS_COLLAPSE,        "tcp_retrans_collapse" },
-        { NET_IPV4_DEFAULT_TTL,                 "ip_default_ttl" },
-        /* NET_IPV4_AUTOCONFIG unused */
-        { NET_IPV4_NO_PMTU_DISC,                "ip_no_pmtu_disc" },
-        { NET_IPV4_TCP_SYN_RETRIES,             "tcp_syn_retries" },
-        { NET_IPV4_IPFRAG_HIGH_THRESH,          "ipfrag_high_thresh" },
-        { NET_IPV4_IPFRAG_LOW_THRESH,           "ipfrag_low_thresh" },
-        { NET_IPV4_IPFRAG_TIME,                 "ipfrag_time" },
-        /* NET_IPV4_TCP_MAX_KA_PROBES unused */
-        { NET_IPV4_TCP_KEEPALIVE_TIME,          "tcp_keepalive_time" },
-        { NET_IPV4_TCP_KEEPALIVE_PROBES,        "tcp_keepalive_probes" },
-        { NET_IPV4_TCP_RETRIES1,                "tcp_retries1" },
-        { NET_IPV4_TCP_RETRIES2,                "tcp_retries2" },
-        { NET_IPV4_TCP_FIN_TIMEOUT,             "tcp_fin_timeout" },
-        /* NET_IPV4_IP_MASQ_DEBUG unused */
-        { NET_TCP_SYNCOOKIES,                   "tcp_syncookies" },
-        { NET_TCP_STDURG,                       "tcp_stdurg" },
-        { NET_TCP_RFC1337,                      "tcp_rfc1337" },
-        /* NET_TCP_SYN_TAILDROP unused */
-        { NET_TCP_MAX_SYN_BACKLOG,              "tcp_max_syn_backlog" },
-        { NET_IPV4_LOCAL_PORT_RANGE,            "ip_local_port_range" },
-        { NET_IPV4_ICMP_ECHO_IGNORE_ALL,        "icmp_echo_ignore_all" },
-        { NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" },
-        /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */
-        /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */
-        /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */
-        /* NET_IPV4_ICMP_PARAMPROB_RATE unused */
-        /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */
-        { NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES,   "icmp_ignore_bogus_error_responses" },
-        { NET_IPV4_IGMP_MAX_MEMBERSHIPS,        "igmp_max_memberships" },
-        { NET_TCP_TW_RECYCLE,                   "tcp_tw_recycle" },
-        /* NET_IPV4_ALWAYS_DEFRAG unused */
-        { NET_IPV4_TCP_KEEPALIVE_INTVL,         "tcp_keepalive_intvl" },
-        { NET_IPV4_INET_PEER_THRESHOLD,         "inet_peer_threshold" },
-        { NET_IPV4_INET_PEER_MINTTL,            "inet_peer_minttl" },
-        { NET_IPV4_INET_PEER_MAXTTL,            "inet_peer_maxttl" },
-        { NET_IPV4_INET_PEER_GC_MINTIME,        "inet_peer_gc_mintime" },
-        { NET_IPV4_INET_PEER_GC_MAXTIME,        "inet_peer_gc_maxtime" },
-        { NET_TCP_ORPHAN_RETRIES,               "tcp_orphan_retries" },
-        { NET_TCP_ABORT_ON_OVERFLOW,            "tcp_abort_on_overflow" },
-        { NET_TCP_SYNACK_RETRIES,               "tcp_synack_retries" },
-        { NET_TCP_MAX_ORPHANS,                  "tcp_max_orphans" },
-        { NET_TCP_MAX_TW_BUCKETS,               "tcp_max_tw_buckets" },
-        { NET_TCP_FACK,                         "tcp_fack" },
-        { NET_TCP_REORDERING,                   "tcp_reordering" },
-        { NET_TCP_ECN,                          "tcp_ecn" },
-        { NET_TCP_DSACK,                        "tcp_dsack" },
-        { NET_TCP_MEM,                          "tcp_mem" },
-        { NET_TCP_WMEM,                         "tcp_wmem" },
-        { NET_TCP_RMEM,                         "tcp_rmem" },
-        { NET_TCP_APP_WIN,                      "tcp_app_win" },
-        { NET_TCP_ADV_WIN_SCALE,                "tcp_adv_win_scale" },
-        { NET_IPV4_NONLOCAL_BIND,               "ip_nonlocal_bind" },
-        { NET_IPV4_ICMP_RATELIMIT,              "icmp_ratelimit" },
-        { NET_IPV4_ICMP_RATEMASK,               "icmp_ratemask" },
-        { NET_TCP_TW_REUSE,                     "tcp_tw_reuse" },
-        { NET_TCP_FRTO,                         "tcp_frto" },
-        { NET_TCP_LOW_LATENCY,                  "tcp_low_latency" },
-        { NET_IPV4_IPFRAG_SECRET_INTERVAL,      "ipfrag_secret_interval" },
-        { NET_IPV4_IGMP_MAX_MSF,                "igmp_max_msf" },
-        { NET_TCP_NO_METRICS_SAVE,              "tcp_no_metrics_save" },
-        /* NET_TCP_DEFAULT_WIN_SCALE unused */
-        { NET_TCP_MODERATE_RCVBUF,              "tcp_moderate_rcvbuf" },
-        { NET_TCP_TSO_WIN_DIVISOR,              "tcp_tso_win_divisor" },
-        /* NET_TCP_BIC_BETA unused */
-        { NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR,      "icmp_errors_use_inbound_ifaddr" },
-        { NET_TCP_CONG_CONTROL,                 "tcp_congestion_control" },
-        { NET_TCP_ABC,                          "tcp_abc" },
-        { NET_IPV4_IPFRAG_MAX_DIST,             "ipfrag_max_dist" },
-        { NET_TCP_MTU_PROBING,                  "tcp_mtu_probing" },
-        { NET_TCP_BASE_MSS,                     "tcp_base_mss" },
-        { NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS,       "tcp_workaround_signed_windows" },
-        { NET_TCP_DMA_COPYBREAK,                "tcp_dma_copybreak" },
-        { NET_TCP_SLOW_START_AFTER_IDLE,        "tcp_slow_start_after_idle" },
-        { NET_CIPSOV4_CACHE_ENABLE,             "cipso_cache_enable" },
-        { NET_CIPSOV4_CACHE_BUCKET_SIZE,        "cipso_cache_bucket_size" },
-        { NET_CIPSOV4_RBM_OPTFMT,               "cipso_rbm_optfmt" },
-        { NET_CIPSOV4_RBM_STRICTVALID,          "cipso_rbm_strictvalid" },
-        { NET_TCP_AVAIL_CONG_CONTROL,           "tcp_available_congestion_control" },
-        { NET_TCP_ALLOWED_CONG_CONTROL,         "tcp_allowed_congestion_control" },
-        { NET_TCP_MAX_SSTHRESH,                 "tcp_max_ssthresh" },
-        { NET_TCP_FRTO_RESPONSE,                "tcp_frto_response" },
-        { 2088 /* NET_IPQ_QMAX */,              "ip_queue_maxlen" },
-        {}
-};
-static const struct trans_ctl_table trans_net_ipx_table[] = {
-        { NET_IPX_PPROP_BROADCASTING,   "ipx_pprop_broadcasting" },
-        /* NET_IPX_FORWARDING unused */
-        {}
-};
-static const struct trans_ctl_table trans_net_atalk_table[] = {
-        { NET_ATALK_AARP_EXPIRY_TIME,           "aarp-expiry-time" },
-        { NET_ATALK_AARP_TICK_TIME,             "aarp-tick-time" },
-        { NET_ATALK_AARP_RETRANSMIT_LIMIT,      "aarp-retransmit-limit" },
-        { NET_ATALK_AARP_RESOLVE_TIME,          "aarp-resolve-time" },
-        {},
-};
-static const struct trans_ctl_table trans_net_netrom_table[] = {
-        { NET_NETROM_DEFAULT_PATH_QUALITY,              "default_path_quality" },
-        { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER,    "obsolescence_count_initialiser" },
-        { NET_NETROM_NETWORK_TTL_INITIALISER,           "network_ttl_initialiser" },
-        { NET_NETROM_TRANSPORT_TIMEOUT,                 "transport_timeout" },
-        { NET_NETROM_TRANSPORT_MAXIMUM_TRIES,           "transport_maximum_tries" },
-        { NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY,       "transport_acknowledge_delay" },
-        { NET_NETROM_TRANSPORT_BUSY_DELAY,              "transport_busy_delay" },
-        { NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE,   "transport_requested_window_size" },
-        { NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT,     "transport_no_activity_timeout" },
-        { NET_NETROM_ROUTING_CONTROL,                   "routing_control" },
-        { NET_NETROM_LINK_FAILS_COUNT,                  "link_fails_count" },
-        { NET_NETROM_RESET,                             "reset" },
-        {}
-};
-static const struct trans_ctl_table trans_net_ax25_param_table[] = {
-        { NET_AX25_IP_DEFAULT_MODE,     "ip_default_mode" },
-        { NET_AX25_DEFAULT_MODE,        "ax25_default_mode" },
-        { NET_AX25_BACKOFF_TYPE,        "backoff_type" },
-        { NET_AX25_CONNECT_MODE,        "connect_mode" },
-        { NET_AX25_STANDARD_WINDOW,     "standard_window_size" },
-        { NET_AX25_EXTENDED_WINDOW,     "extended_window_size" },
-        { NET_AX25_T1_TIMEOUT,          "t1_timeout" },
-        { NET_AX25_T2_TIMEOUT,          "t2_timeout" },
-        { NET_AX25_T3_TIMEOUT,          "t3_timeout" },
-        { NET_AX25_IDLE_TIMEOUT,        "idle_timeout" },
-        { NET_AX25_N2,                  "maximum_retry_count" },
-        { NET_AX25_PACLEN,              "maximum_packet_length" },
-        { NET_AX25_PROTOCOL,            "protocol" },
-        { NET_AX25_DAMA_SLAVE_TIMEOUT,  "dama_slave_timeout" },
-        {}
-};
-static const struct trans_ctl_table trans_net_ax25_table[] = {
-        { 0, NULL, trans_net_ax25_param_table },
-        {}
-};
-static const struct trans_ctl_table trans_net_bridge_table[] = {
-        { NET_BRIDGE_NF_CALL_ARPTABLES,         "bridge-nf-call-arptables" },
-        { NET_BRIDGE_NF_CALL_IPTABLES,          "bridge-nf-call-iptables" },
-        { NET_BRIDGE_NF_CALL_IP6TABLES,         "bridge-nf-call-ip6tables" },
-        { NET_BRIDGE_NF_FILTER_VLAN_TAGGED,     "bridge-nf-filter-vlan-tagged" },
-        { NET_BRIDGE_NF_FILTER_PPPOE_TAGGED,    "bridge-nf-filter-pppoe-tagged" },
-        {}
-};
-static const struct trans_ctl_table trans_net_rose_table[] = {
-        { NET_ROSE_RESTART_REQUEST_TIMEOUT,     "restart_request_timeout" },
-        { NET_ROSE_CALL_REQUEST_TIMEOUT,        "call_request_timeout" },
-        { NET_ROSE_RESET_REQUEST_TIMEOUT,       "reset_request_timeout" },
-        { NET_ROSE_CLEAR_REQUEST_TIMEOUT,       "clear_request_timeout" },
-        { NET_ROSE_ACK_HOLD_BACK_TIMEOUT,       "acknowledge_hold_back_timeout" },
-        { NET_ROSE_ROUTING_CONTROL,             "routing_control" },
-        { NET_ROSE_LINK_FAIL_TIMEOUT,           "link_fail_timeout" },
-        { NET_ROSE_MAX_VCS,                     "maximum_virtual_circuits" },
-        { NET_ROSE_WINDOW_SIZE,                 "window_size" },
-        { NET_ROSE_NO_ACTIVITY_TIMEOUT,         "no_activity_timeout" },
-        {}
-};
-static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
-        { NET_IPV6_FORWARDING,                  "forwarding" },
-        { NET_IPV6_HOP_LIMIT,                   "hop_limit" },
-        { NET_IPV6_MTU,                         "mtu" },
-        { NET_IPV6_ACCEPT_RA,                   "accept_ra" },
-        { NET_IPV6_ACCEPT_REDIRECTS,            "accept_redirects" },
-        { NET_IPV6_AUTOCONF,                    "autoconf" },
-        { NET_IPV6_DAD_TRANSMITS,               "dad_transmits" },
-        { NET_IPV6_RTR_SOLICITS,                "router_solicitations" },
-        { NET_IPV6_RTR_SOLICIT_INTERVAL,        "router_solicitation_interval" },
-        { NET_IPV6_RTR_SOLICIT_DELAY,           "router_solicitation_delay" },
-        { NET_IPV6_USE_TEMPADDR,                "use_tempaddr" },
-        { NET_IPV6_TEMP_VALID_LFT,              "temp_valid_lft" },
-        { NET_IPV6_TEMP_PREFERED_LFT,           "temp_prefered_lft" },
-        { NET_IPV6_REGEN_MAX_RETRY,             "regen_max_retry" },
-        { NET_IPV6_MAX_DESYNC_FACTOR,           "max_desync_factor" },
-        { NET_IPV6_MAX_ADDRESSES,               "max_addresses" },
-        { NET_IPV6_FORCE_MLD_VERSION,           "force_mld_version" },
-        { NET_IPV6_ACCEPT_RA_DEFRTR,            "accept_ra_defrtr" },
-        { NET_IPV6_ACCEPT_RA_PINFO,             "accept_ra_pinfo" },
-        { NET_IPV6_ACCEPT_RA_RTR_PREF,          "accept_ra_rtr_pref" },
-        { NET_IPV6_RTR_PROBE_INTERVAL,          "router_probe_interval" },
-        { NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN,  "accept_ra_rt_info_max_plen" },
-        { NET_IPV6_PROXY_NDP,                   "proxy_ndp" },
-        { NET_IPV6_ACCEPT_SOURCE_ROUTE,         "accept_source_route" },
-        {}
-};
-static const struct trans_ctl_table trans_net_ipv6_conf_table[] = {
-        { NET_PROTO_CONF_ALL,           "all",  trans_net_ipv6_conf_var_table },
-        { NET_PROTO_CONF_DEFAULT,       "default", trans_net_ipv6_conf_var_table },
-        { 0, NULL, trans_net_ipv6_conf_var_table },
-        {}
-};
-static const struct trans_ctl_table trans_net_ipv6_route_table[] = {
-        { NET_IPV6_ROUTE_FLUSH,                 "flush" },
-        { NET_IPV6_ROUTE_GC_THRESH,             "gc_thresh" },
-        { NET_IPV6_ROUTE_MAX_SIZE,              "max_size" },
-        { NET_IPV6_ROUTE_GC_MIN_INTERVAL,       "gc_min_interval" },
-        { NET_IPV6_ROUTE_GC_TIMEOUT,            "gc_timeout" },
-        { NET_IPV6_ROUTE_GC_INTERVAL,           "gc_interval" },
-        { NET_IPV6_ROUTE_GC_ELASTICITY,         "gc_elasticity" },
-        { NET_IPV6_ROUTE_MTU_EXPIRES,           "mtu_expires" },
-        { NET_IPV6_ROUTE_MIN_ADVMSS,            "min_adv_mss" },
-        { NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,    "gc_min_interval_ms" },
-        {}
-};
-static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = {
-        { NET_IPV6_ICMP_RATELIMIT,      "ratelimit" },
-        {}
-};
-static const struct trans_ctl_table trans_net_ipv6_table[] = {
-        { NET_IPV6_CONF,                "conf",         trans_net_ipv6_conf_table },
-        { NET_IPV6_NEIGH,               "neigh",        trans_net_neigh_table },
-        { NET_IPV6_ROUTE,               "route",        trans_net_ipv6_route_table },
-        { NET_IPV6_ICMP,                "icmp",         trans_net_ipv6_icmp_table },
-        { NET_IPV6_BINDV6ONLY,          "bindv6only" },
-        { NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
-        { NET_IPV6_IP6FRAG_LOW_THRESH,  "ip6frag_low_thresh" },
-        { NET_IPV6_IP6FRAG_TIME,        "ip6frag_time" },
-        { NET_IPV6_IP6FRAG_SECRET_INTERVAL,     "ip6frag_secret_interval" },
-        { NET_IPV6_MLD_MAX_MSF,         "mld_max_msf" },
-        { 2088 /* IPQ_QMAX */,          "ip6_queue_maxlen" },
-        {}
-};
-static const struct trans_ctl_table trans_net_x25_table[] = {
-        { NET_X25_RESTART_REQUEST_TIMEOUT,      "restart_request_timeout" },
-        { NET_X25_CALL_REQUEST_TIMEOUT,         "call_request_timeout" },
-        { NET_X25_RESET_REQUEST_TIMEOUT,        "reset_request_timeout" },
-        { NET_X25_CLEAR_REQUEST_TIMEOUT,        "clear_request_timeout" },
-        { NET_X25_ACK_HOLD_BACK_TIMEOUT,        "acknowledgement_hold_back_timeout" },
-        { NET_X25_FORWARD,                      "x25_forward" },
-        {}
-};
-static const struct trans_ctl_table trans_net_tr_table[] = {
-        { NET_TR_RIF_TIMEOUT,   "rif_timeout" },
-        {}
-};
-static const struct trans_ctl_table trans_net_decnet_conf_vars[] = {
-        { NET_DECNET_CONF_DEV_FORWARDING,       "forwarding" },
-        { NET_DECNET_CONF_DEV_PRIORITY,         "priority" },
-        { NET_DECNET_CONF_DEV_T2,               "t2" },
-        { NET_DECNET_CONF_DEV_T3,               "t3" },
-        {}
-};
-static const struct trans_ctl_table trans_net_decnet_conf[] = {
-        { 0, NULL, trans_net_decnet_conf_vars },
-        {}
-};
-static const struct trans_ctl_table trans_net_decnet_table[] = {
-        { NET_DECNET_CONF,              "conf", trans_net_decnet_conf },
-        { NET_DECNET_NODE_ADDRESS,      "node_address" },
-        { NET_DECNET_NODE_NAME,         "node_name" },
-        { NET_DECNET_DEFAULT_DEVICE,    "default_device" },
-        { NET_DECNET_TIME_WAIT,         "time_wait" },
-        { NET_DECNET_DN_COUNT,          "dn_count" },
-        { NET_DECNET_DI_COUNT,          "di_count" },
-        { NET_DECNET_DR_COUNT,          "dr_count" },
-        { NET_DECNET_DST_GC_INTERVAL,   "dst_gc_interval" },
-        { NET_DECNET_NO_FC_MAX_CWND,    "no_fc_max_cwnd" },
-        { NET_DECNET_MEM,               "decnet_mem" },
-        { NET_DECNET_RMEM,              "decnet_rmem" },
-        { NET_DECNET_WMEM,              "decnet_wmem" },
-        { NET_DECNET_DEBUG_LEVEL,       "debug" },
-        {}
-};
-static const struct trans_ctl_table trans_net_sctp_table[] = {
-        { NET_SCTP_RTO_INITIAL,         "rto_initial" },
-        { NET_SCTP_RTO_MIN,             "rto_min" },
-        { NET_SCTP_RTO_MAX,             "rto_max" },
-        { NET_SCTP_RTO_ALPHA,           "rto_alpha_exp_divisor" },
-        { NET_SCTP_RTO_BETA,            "rto_beta_exp_divisor" },
-        { NET_SCTP_VALID_COOKIE_LIFE,   "valid_cookie_life" },
-        { NET_SCTP_ASSOCIATION_MAX_RETRANS,     "association_max_retrans" },
-        { NET_SCTP_PATH_MAX_RETRANS,    "path_max_retrans" },
-        { NET_SCTP_MAX_INIT_RETRANSMITS,        "max_init_retransmits" },
-        { NET_SCTP_HB_INTERVAL,         "hb_interval" },
-        { NET_SCTP_PRESERVE_ENABLE,     "cookie_preserve_enable" },
-        { NET_SCTP_MAX_BURST,           "max_burst" },
-        { NET_SCTP_ADDIP_ENABLE,        "addip_enable" },
-        { NET_SCTP_PRSCTP_ENABLE,       "prsctp_enable" },
-        { NET_SCTP_SNDBUF_POLICY,       "sndbuf_policy" },
-        { NET_SCTP_SACK_TIMEOUT,        "sack_timeout" },
-        { NET_SCTP_RCVBUF_POLICY,       "rcvbuf_policy" },
-        {}
-};
-static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
-        { NET_LLC2_ACK_TIMEOUT,         "ack" },
-        { NET_LLC2_P_TIMEOUT,           "p" },
-        { NET_LLC2_REJ_TIMEOUT,         "rej" },
-        { NET_LLC2_BUSY_TIMEOUT,        "busy" },
-        {}
-};
-static const struct trans_ctl_table trans_net_llc_station_table[] = {
-        { NET_LLC_STATION_ACK_TIMEOUT,  "ack_timeout" },
-        {}
-};
-static const struct trans_ctl_table trans_net_llc_llc2_table[] = {
-        { NET_LLC2,             "timeout",      trans_net_llc_llc2_timeout_table },
-        {}
-};
-static const struct trans_ctl_table trans_net_llc_table[] = {
-        { NET_LLC2,             "llc2",         trans_net_llc_llc2_table },
-        { NET_LLC_STATION,      "station",      trans_net_llc_station_table },
-        {}
-};
-static const struct trans_ctl_table trans_net_netfilter_table[] = {
-        { NET_NF_CONNTRACK_MAX,                         "nf_conntrack_max" },
-        { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,        "nf_conntrack_tcp_timeout_syn_sent" },
-        { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV,        "nf_conntrack_tcp_timeout_syn_recv" },
-        { NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED,     "nf_conntrack_tcp_timeout_established" },
-        { NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT,        "nf_conntrack_tcp_timeout_fin_wait" },
-        { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT,      "nf_conntrack_tcp_timeout_close_wait" },
-        { NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK,        "nf_conntrack_tcp_timeout_last_ack" },
-        { NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT,       "nf_conntrack_tcp_timeout_time_wait" },
-        { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE,           "nf_conntrack_tcp_timeout_close" },
-        { NET_NF_CONNTRACK_UDP_TIMEOUT,                 "nf_conntrack_udp_timeout" },
-        { NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM,          "nf_conntrack_udp_timeout_stream" },
-        { NET_NF_CONNTRACK_ICMP_TIMEOUT,        "nf_conntrack_icmp_timeout" },
-        { NET_NF_CONNTRACK_GENERIC_TIMEOUT,             "nf_conntrack_generic_timeout" },
-        { NET_NF_CONNTRACK_BUCKETS,                     "nf_conntrack_buckets" },
-        { NET_NF_CONNTRACK_LOG_INVALID,                 "nf_conntrack_log_invalid" },
-        { NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS,     "nf_conntrack_tcp_timeout_max_retrans" },
-        { NET_NF_CONNTRACK_TCP_LOOSE,                   "nf_conntrack_tcp_loose" },
-        { NET_NF_CONNTRACK_TCP_BE_LIBERAL,              "nf_conntrack_tcp_be_liberal" },
-        { NET_NF_CONNTRACK_TCP_MAX_RETRANS,             "nf_conntrack_tcp_max_retrans" },
-        { NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED,         "nf_conntrack_sctp_timeout_closed" },
-        { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT,    "nf_conntrack_sctp_timeout_cookie_wait" },
-        { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED,  "nf_conntrack_sctp_timeout_cookie_echoed" },
-        { NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED,    "nf_conntrack_sctp_timeout_established" },
-        { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT,  "nf_conntrack_sctp_timeout_shutdown_sent" },
-        { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD,  "nf_conntrack_sctp_timeout_shutdown_recd" },
-        { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT,      "nf_conntrack_sctp_timeout_shutdown_ack_sent" },
-        { NET_NF_CONNTRACK_COUNT,                       "nf_conntrack_count" },
-        { NET_NF_CONNTRACK_ICMPV6_TIMEOUT,      "nf_conntrack_icmpv6_timeout" },
-        { NET_NF_CONNTRACK_FRAG6_TIMEOUT,               "nf_conntrack_frag6_timeout" },
-        { NET_NF_CONNTRACK_FRAG6_LOW_THRESH,            "nf_conntrack_frag6_low_thresh" },
-        { NET_NF_CONNTRACK_FRAG6_HIGH_THRESH,           "nf_conntrack_frag6_high_thresh" },
-        { NET_NF_CONNTRACK_CHECKSUM,                    "nf_conntrack_checksum" },
-        {}
-};
-static const struct trans_ctl_table trans_net_dccp_table[] = {
-        { NET_DCCP_DEFAULT,     "default" },
-        {}
-};
-static const struct trans_ctl_table trans_net_irda_table[] = {
-        { NET_IRDA_DISCOVERY,           "discovery" },
-        { NET_IRDA_DEVNAME,             "devname" },
-        { NET_IRDA_DEBUG,               "debug" },
-        { NET_IRDA_FAST_POLL,           "fast_poll_increase" },
-        { NET_IRDA_DISCOVERY_SLOTS,     "discovery_slots" },
-        { NET_IRDA_DISCOVERY_TIMEOUT,   "discovery_timeout" },
-        { NET_IRDA_SLOT_TIMEOUT,        "slot_timeout" },
-        { NET_IRDA_MAX_BAUD_RATE,       "max_baud_rate" },
-        { NET_IRDA_MIN_TX_TURN_TIME,    "min_tx_turn_time" },
-        { NET_IRDA_MAX_TX_DATA_SIZE,    "max_tx_data_size" },
-        { NET_IRDA_MAX_TX_WINDOW,       "max_tx_window" },
-        { NET_IRDA_MAX_NOREPLY_TIME,    "max_noreply_time" },
-        { NET_IRDA_WARN_NOREPLY_TIME,   "warn_noreply_time" },
-        { NET_IRDA_LAP_KEEPALIVE_TIME,  "lap_keepalive_time" },
-        {}
-};
-static const struct trans_ctl_table trans_net_table[] = {
-        { NET_CORE,             "core",         trans_net_core_table },
-        /* NET_ETHER not used */
-        /* NET_802 not used */
-        { NET_UNIX,             "unix",         trans_net_unix_table },
-        { NET_IPV4,             "ipv4",         trans_net_ipv4_table },
-        { NET_IPX,              "ipx",          trans_net_ipx_table },
-        { NET_ATALK,            "appletalk",    trans_net_atalk_table },
-        { NET_NETROM,           "netrom",       trans_net_netrom_table },
-        { NET_AX25,             "ax25",         trans_net_ax25_table },
-        { NET_BRIDGE,           "bridge",       trans_net_bridge_table },
-        { NET_ROSE,             "rose",         trans_net_rose_table },
-        { NET_IPV6,             "ipv6",         trans_net_ipv6_table },
-        { NET_X25,              "x25",          trans_net_x25_table },
-        { NET_TR,               "token-ring",   trans_net_tr_table },
-        { NET_DECNET,           "decnet",       trans_net_decnet_table },
-        /*  NET_ECONET not used */
-        { NET_SCTP,             "sctp",         trans_net_sctp_table },
-        { NET_LLC,              "llc",          trans_net_llc_table },
-        { NET_NETFILTER,        "netfilter",    trans_net_netfilter_table },
-        { NET_DCCP,             "dccp",         trans_net_dccp_table },
-        { NET_IRDA,             "irda",         trans_net_irda_table },
-        { 2089,                 "nf_conntrack_max" },
-        {}
-};
-static const struct trans_ctl_table trans_fs_quota_table[] = {
-        { FS_DQ_LOOKUPS,        "lookups" },
-        { FS_DQ_DROPS,          "drops" },
-        { FS_DQ_READS,          "reads" },
-        { FS_DQ_WRITES,         "writes" },
-        { FS_DQ_CACHE_HITS,     "cache_hits" },
-        { FS_DQ_ALLOCATED,      "allocated_dquots" },
-        { FS_DQ_FREE,           "free_dquots" },
-        { FS_DQ_SYNCS,          "syncs" },
-        { FS_DQ_WARNINGS,       "warnings" },
-        {}
-};
-static const struct trans_ctl_table trans_fs_xfs_table[] = {
-        { XFS_SGID_INHERIT,     "irix_sgid_inherit" },
-        { XFS_SYMLINK_MODE,     "irix_symlink_mode" },
-        { XFS_PANIC_MASK,       "panic_mask" },
-        { XFS_ERRLEVEL,         "error_level" },
-        { XFS_SYNCD_TIMER,      "xfssyncd_centisecs" },
-        { XFS_INHERIT_SYNC,     "inherit_sync" },
-        { XFS_INHERIT_NODUMP,   "inherit_nodump" },
-        { XFS_INHERIT_NOATIME,  "inherit_noatime" },
-        { XFS_BUF_TIMER,        "xfsbufd_centisecs" },
-        { XFS_BUF_AGE,          "age_buffer_centisecs" },
-        { XFS_INHERIT_NOSYM,    "inherit_nosymlinks" },
-        { XFS_ROTORSTEP,        "rotorstep" },
-        { XFS_INHERIT_NODFRG,   "inherit_nodefrag" },
-        { XFS_FILESTREAM_TIMER, "filestream_centisecs" },
-        { XFS_STATS_CLEAR,      "stats_clear" },
-        {}
-};
-static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = {
-        { 1, "hb_ctl_path" },
-        {}
-};
-static const struct trans_ctl_table trans_fs_ocfs2_table[] = {
-        { 1,    "nm",   trans_fs_ocfs2_nm_table },
-        {}
-};
-static const struct trans_ctl_table trans_inotify_table[] = {
-        { INOTIFY_MAX_USER_INSTANCES,   "max_user_instances" },
-        { INOTIFY_MAX_USER_WATCHES,     "max_user_watches" },
-        { INOTIFY_MAX_QUEUED_EVENTS,    "max_queued_events" },
-        {}
-};
-static const struct trans_ctl_table trans_fs_table[] = {
-        { FS_NRINODE,           "inode-nr" },
-        { FS_STATINODE,         "inode-state" },
-        /* FS_MAXINODE unused */
-        /* FS_NRDQUOT unused */
-        /* FS_MAXDQUOT unused */
-        { FS_NRFILE,            "file-nr" },
-        { FS_MAXFILE,           "file-max" },
-        { FS_DENTRY,            "dentry-state" },
-        /* FS_NRSUPER unused */
-        /* FS_MAXUPSER unused */
-        { FS_OVERFLOWUID,       "overflowuid" },
-        { FS_OVERFLOWGID,       "overflowgid" },
-        { FS_LEASES,            "leases-enable" },
-        { FS_DIR_NOTIFY,        "dir-notify-enable" },
-        { FS_LEASE_TIME,        "lease-break-time" },
-        { FS_DQSTATS,           "quota",                trans_fs_quota_table },
-        { FS_XFS,               "xfs",                  trans_fs_xfs_table },
-        { FS_AIO_NR,            "aio-nr" },
-        { FS_AIO_MAX_NR,        "aio-max-nr" },
-        { FS_INOTIFY,           "inotify",              trans_inotify_table },
-        { FS_OCFS2,             "ocfs2",                trans_fs_ocfs2_table },
-        { KERN_SETUID_DUMPABLE, "suid_dumpable" },
-        {}
-};
-static const struct trans_ctl_table trans_debug_table[] = {
-        {}
-};
-static const struct trans_ctl_table trans_cdrom_table[] = {
-        { DEV_CDROM_INFO,               "info" },
-        { DEV_CDROM_AUTOCLOSE,          "autoclose" },
-        { DEV_CDROM_AUTOEJECT,          "autoeject" },
-        { DEV_CDROM_DEBUG,              "debug" },
-        { DEV_CDROM_LOCK,               "lock" },
-        { DEV_CDROM_CHECK_MEDIA,        "check_media" },
-        {}
-};
-static const struct trans_ctl_table trans_ipmi_table[] = {
-        { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
-        {}
-};
-static const struct trans_ctl_table trans_mac_hid_files[] = {
-        /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
-        /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
-        { DEV_MAC_HID_MOUSE_BUTTON_EMULATION,   "mouse_button_emulation" },
-        { DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE,    "mouse_button2_keycode" },
-        { DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE,    "mouse_button3_keycode" },
-        /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
-        {}
-};
-static const struct trans_ctl_table trans_raid_table[] = {
-        { DEV_RAID_SPEED_LIMIT_MIN,     "speed_limit_min" },
-        { DEV_RAID_SPEED_LIMIT_MAX,     "speed_limit_max" },
-        {}
-};
-static const struct trans_ctl_table trans_scsi_table[] = {
-        { DEV_SCSI_LOGGING_LEVEL, "logging_level" },
-        {}
-};
-static const struct trans_ctl_table trans_parport_default_table[] = {
-        { DEV_PARPORT_DEFAULT_TIMESLICE,        "timeslice" },
-        { DEV_PARPORT_DEFAULT_SPINTIME,         "spintime" },
-        {}
-};
-static const struct trans_ctl_table trans_parport_device_table[] = {
-        { DEV_PARPORT_DEVICE_TIMESLICE,         "timeslice" },
-        {}
-};
-static const struct trans_ctl_table trans_parport_devices_table[] = {
-        { DEV_PARPORT_DEVICES_ACTIVE,           "active" },
-        { 0, NULL, trans_parport_device_table },
-        {}
-};
-static const struct trans_ctl_table trans_parport_parport_table[] = {
-        { DEV_PARPORT_SPINTIME,         "spintime" },
-        { DEV_PARPORT_BASE_ADDR,        "base-addr" },
-        { DEV_PARPORT_IRQ,              "irq" },
-        { DEV_PARPORT_DMA,              "dma" },
-        { DEV_PARPORT_MODES,            "modes" },
-        { DEV_PARPORT_DEVICES,          "devices",      trans_parport_devices_table },
-        { DEV_PARPORT_AUTOPROBE,        "autoprobe" },
-        { DEV_PARPORT_AUTOPROBE + 1,    "autoprobe0" },
-        { DEV_PARPORT_AUTOPROBE + 2,    "autoprobe1" },
-        { DEV_PARPORT_AUTOPROBE + 3,    "autoprobe2" },
-        { DEV_PARPORT_AUTOPROBE + 4,    "autoprobe3" },
-        {}
-};
-static const struct trans_ctl_table trans_parport_table[] = {
-        { DEV_PARPORT_DEFAULT,  "default",      trans_parport_default_table },
-        { 0, NULL, trans_parport_parport_table },
-        {}
-};
-static const struct trans_ctl_table trans_dev_table[] = {
-        { DEV_CDROM,    "cdrom",        trans_cdrom_table },
-        /* DEV_HWMON unused */
-        { DEV_PARPORT,  "parport",      trans_parport_table },
-        { DEV_RAID,     "raid",         trans_raid_table },
-        { DEV_MAC_HID,  "mac_hid",      trans_mac_hid_files },
-        { DEV_SCSI,     "scsi",         trans_scsi_table },
-        { DEV_IPMI,     "ipmi",         trans_ipmi_table },
-        {}
-};
-static const struct trans_ctl_table trans_bus_isa_table[] = {
-        { BUS_ISA_MEM_BASE,     "membase" },
-        { BUS_ISA_PORT_BASE,    "portbase" },
-        { BUS_ISA_PORT_SHIFT,   "portshift" },
-        {}
-};
-static const struct trans_ctl_table trans_bus_table[] = {
-        { CTL_BUS_ISA,  "isa",  trans_bus_isa_table },
-        {}
-};
-static const struct trans_ctl_table trans_arlan_conf_table0[] = {
-        { 1,    "spreadingCode" },
-        { 2,    "channelNumber" },
-        { 3,    "scramblingDisable" },
-        { 4,    "txAttenuation" },
-        { 5,    "systemId" },
-        { 6,    "maxDatagramSize" },
-        { 7,    "maxFrameSize" },
-        { 8,    "maxRetries" },
-        { 9,    "receiveMode" },
-        { 10,   "priority" },
-        { 11,   "rootOrRepeater" },
-        { 12,   "SID" },
-        { 13,   "registrationMode" },
-        { 14,   "registrationFill" },
-        { 15,   "localTalkAddress" },
-        { 16,   "codeFormat" },
-        { 17,   "numChannels" },
-        { 18,   "channel1" },
-        { 19,   "channel2" },
-        { 20,   "channel3" },
-        { 21,   "channel4" },
-        { 22,   "txClear" },
-        { 23,   "txRetries" },
-        { 24,   "txRouting" },
-        { 25,   "txScrambled" },
-        { 26,   "rxParameter" },
-        { 27,   "txTimeoutMs" },
-        { 28,   "waitCardTimeout" },
-        { 29,   "channelSet" },
-        { 30,   "name" },
-        { 31,   "waitTime" },
-        { 32,   "lParameter" },
-        { 33,   "_15" },
-        { 34,   "headerSize" },
-        { 36,   "tx_delay_ms" },
-        { 37,   "retries" },
-        { 38,   "ReTransmitPacketMaxSize" },
-        { 39,   "waitReTransmitPacketMaxSize" },
-        { 40,   "fastReTransCount" },
-        { 41,   "driverRetransmissions" },
-        { 42,   "txAckTimeoutMs" },
-        { 43,   "registrationInterrupts" },
-        { 44,   "hardwareType" },
-        { 45,   "radioType" },
-        { 46,   "writeEEPROM" },
-        { 47,   "writeRadioType" },
-        { 48,   "entry_exit_debug" },
-        { 49,   "debug" },
-        { 50,   "in_speed" },
-        { 51,   "out_speed" },
-        { 52,   "in_speed10" },
-        { 53,   "out_speed10" },
-        { 54,   "in_speed_max" },
-        { 55,   "out_speed_max" },
-        { 56,   "measure_rate" },
-        { 57,   "pre_Command_Wait" },
-        { 58,   "rx_tweak1" },
-        { 59,   "rx_tweak2" },
-        { 60,   "tx_queue_len" },
-        { 150,  "arlan0-txRing" },
-        { 151,  "arlan0-rxRing" },
-        { 152,  "arlan0-18" },
-        { 153,  "arlan0-ring" },
-        { 154,  "arlan0-shm-cpy" },
-        { 155,  "config0" },
-        { 156,  "reset0" },
-        {}
-};
-static const struct trans_ctl_table trans_arlan_conf_table1[] = {
-        { 1,    "spreadingCode" },
-        { 2,    "channelNumber" },
-        { 3,    "scramblingDisable" },
-        { 4,    "txAttenuation" },
-        { 5,    "systemId" },
-        { 6,    "maxDatagramSize" },
-        { 7,    "maxFrameSize" },
-        { 8,    "maxRetries" },
-        { 9,    "receiveMode" },
-        { 10,   "priority" },
-        { 11,   "rootOrRepeater" },
-        { 12,   "SID" },
-        { 13,   "registrationMode" },
-        { 14,   "registrationFill" },
-        { 15,   "localTalkAddress" },
-        { 16,   "codeFormat" },
-        { 17,   "numChannels" },
-        { 18,   "channel1" },
-        { 19,   "channel2" },
-        { 20,   "channel3" },
-        { 21,   "channel4" },
-        { 22,   "txClear" },
-        { 23,   "txRetries" },
-        { 24,   "txRouting" },
-        { 25,   "txScrambled" },
-        { 26,   "rxParameter" },
-        { 27,   "txTimeoutMs" },
-        { 28,   "waitCardTimeout" },
-        { 29,   "channelSet" },
-        { 30,   "name" },
-        { 31,   "waitTime" },
-        { 32,   "lParameter" },
-        { 33,   "_15" },
-        { 34,   "headerSize" },
-        { 36,   "tx_delay_ms" },
-        { 37,   "retries" },
-        { 38,   "ReTransmitPacketMaxSize" },
-        { 39,   "waitReTransmitPacketMaxSize" },
-        { 40,   "fastReTransCount" },
-        { 41,   "driverRetransmissions" },
-        { 42,   "txAckTimeoutMs" },
-        { 43,   "registrationInterrupts" },
-        { 44,   "hardwareType" },
-        { 45,   "radioType" },
-        { 46,   "writeEEPROM" },
-        { 47,   "writeRadioType" },
-        { 48,   "entry_exit_debug" },
-        { 49,   "debug" },
-        { 50,   "in_speed" },
-        { 51,   "out_speed" },
-        { 52,   "in_speed10" },
-        { 53,   "out_speed10" },
-        { 54,   "in_speed_max" },
-        { 55,   "out_speed_max" },
-        { 56,   "measure_rate" },
-        { 57,   "pre_Command_Wait" },
-        { 58,   "rx_tweak1" },
-        { 59,   "rx_tweak2" },
-        { 60,   "tx_queue_len" },
-        { 150,  "arlan1-txRing" },
-        { 151,  "arlan1-rxRing" },
-        { 152,  "arlan1-18" },
-        { 153,  "arlan1-ring" },
-        { 154,  "arlan1-shm-cpy" },
-        { 155,  "config1" },
-        { 156,  "reset1" },
-        {}
-};
-static const struct trans_ctl_table trans_arlan_conf_table2[] = {
-        { 1,    "spreadingCode" },
-        { 2,    "channelNumber" },
-        { 3,    "scramblingDisable" },
-        { 4,    "txAttenuation" },
-        { 5,    "systemId" },
-        { 6,    "maxDatagramSize" },
-        { 7,    "maxFrameSize" },
-        { 8,    "maxRetries" },
-        { 9,    "receiveMode" },
-        { 10,   "priority" },
-        { 11,   "rootOrRepeater" },
-        { 12,   "SID" },
-        { 13,   "registrationMode" },
-        { 14,   "registrationFill" },
-        { 15,   "localTalkAddress" },
-        { 16,   "codeFormat" },
-        { 17,   "numChannels" },
-        { 18,   "channel1" },
-        { 19,   "channel2" },
-        { 20,   "channel3" },
-        { 21,   "channel4" },
-        { 22,   "txClear" },
-        { 23,   "txRetries" },
-        { 24,   "txRouting" },
-        { 25,   "txScrambled" },
-        { 26,   "rxParameter" },
-        { 27,   "txTimeoutMs" },
-        { 28,   "waitCardTimeout" },
-        { 29,   "channelSet" },
-        { 30,   "name" },
-        { 31,   "waitTime" },
-        { 32,   "lParameter" },
-        { 33,   "_15" },
-        { 34,   "headerSize" },
-        { 36,   "tx_delay_ms" },
-        { 37,   "retries" },
-        { 38,   "ReTransmitPacketMaxSize" },
-        { 39,   "waitReTransmitPacketMaxSize" },
-        { 40,   "fastReTransCount" },
-        { 41,   "driverRetransmissions" },
-        { 42,   "txAckTimeoutMs" },
-        { 43,   "registrationInterrupts" },
-        { 44,   "hardwareType" },
-        { 45,   "radioType" },
-        { 46,   "writeEEPROM" },
-        { 47,   "writeRadioType" },
-        { 48,   "entry_exit_debug" },
-        { 49,   "debug" },
-        { 50,   "in_speed" },
-        { 51,   "out_speed" },
-        { 52,   "in_speed10" },
-        { 53,   "out_speed10" },
-        { 54,   "in_speed_max" },
-        { 55,   "out_speed_max" },
-        { 56,   "measure_rate" },
-        { 57,   "pre_Command_Wait" },
-        { 58,   "rx_tweak1" },
-        { 59,   "rx_tweak2" },
-        { 60,   "tx_queue_len" },
-        { 150,  "arlan2-txRing" },
-        { 151,  "arlan2-rxRing" },
-        { 152,  "arlan2-18" },
-        { 153,  "arlan2-ring" },
-        { 154,  "arlan2-shm-cpy" },
-        { 155,  "config2" },
-        { 156,  "reset2" },
-        {}
-};
-static const struct trans_ctl_table trans_arlan_conf_table3[] = {
-        { 1,    "spreadingCode" },
-        { 2,    "channelNumber" },
-        { 3,    "scramblingDisable" },
-        { 4,    "txAttenuation" },
-        { 5,    "systemId" },
-        { 6,    "maxDatagramSize" },
-        { 7,    "maxFrameSize" },
-        { 8,    "maxRetries" },
-        { 9,    "receiveMode" },
-        { 10,   "priority" },
-        { 11,   "rootOrRepeater" },
-        { 12,   "SID" },
-        { 13,   "registrationMode" },
-        { 14,   "registrationFill" },
-        { 15,   "localTalkAddress" },
-        { 16,   "codeFormat" },
-        { 17,   "numChannels" },
-        { 18,   "channel1" },
-        { 19,   "channel2" },
-        { 20,   "channel3" },
-        { 21,   "channel4" },
-        { 22,   "txClear" },
-        { 23,   "txRetries" },
-        { 24,   "txRouting" },
-        { 25,   "txScrambled" },
-        { 26,   "rxParameter" },
-        { 27,   "txTimeoutMs" },
-        { 28,   "waitCardTimeout" },
-        { 29,   "channelSet" },
-        { 30,   "name" },
-        { 31,   "waitTime" },
-        { 32,   "lParameter" },
-        { 33,   "_15" },
-        { 34,   "headerSize" },
-        { 36,   "tx_delay_ms" },
-        { 37,   "retries" },
-        { 38,   "ReTransmitPacketMaxSize" },
-        { 39,   "waitReTransmitPacketMaxSize" },
-        { 40,   "fastReTransCount" },
-        { 41,   "driverRetransmissions" },
-        { 42,   "txAckTimeoutMs" },
-        { 43,   "registrationInterrupts" },
-        { 44,   "hardwareType" },
-        { 45,   "radioType" },
-        { 46,   "writeEEPROM" },
-        { 47,   "writeRadioType" },
-        { 48,   "entry_exit_debug" },
-        { 49,   "debug" },
-        { 50,   "in_speed" },
-        { 51,   "out_speed" },
-        { 52,   "in_speed10" },
-        { 53,   "out_speed10" },
-        { 54,   "in_speed_max" },
-        { 55,   "out_speed_max" },
-        { 56,   "measure_rate" },
-        { 57,   "pre_Command_Wait" },
-        { 58,   "rx_tweak1" },
-        { 59,   "rx_tweak2" },
-        { 60,   "tx_queue_len" },
-        { 150,  "arlan3-txRing" },
-        { 151,  "arlan3-rxRing" },
-        { 152,  "arlan3-18" },
-        { 153,  "arlan3-ring" },
-        { 154,  "arlan3-shm-cpy" },
-        { 155,  "config3" },
-        { 156,  "reset3" },
-        {}
-};
-static const struct trans_ctl_table trans_arlan_table[] = {
-        { 1,            "arlan0",       trans_arlan_conf_table0 },
-        { 2,            "arlan1",       trans_arlan_conf_table1 },
-        { 3,            "arlan2",       trans_arlan_conf_table2 },
-        { 4,            "arlan3",       trans_arlan_conf_table3 },
-        {}
-};
-static const struct trans_ctl_table trans_s390dbf_table[] = {
-        { 5678 /* CTL_S390DBF_STOPPABLE */,     "debug_stoppable" },
-        { 5679 /* CTL_S390DBF_ACTIVE */,        "debug_active" },
-        {}
-};
-static const struct trans_ctl_table trans_sunrpc_table[] = {
-        { CTL_RPCDEBUG,         "rpc_debug" },
-        { CTL_NFSDEBUG,         "nfs_debug" },
-        { CTL_NFSDDEBUG,        "nfsd_debug" },
-        { CTL_NLMDEBUG,         "nlm_debug" },
-        { CTL_SLOTTABLE_UDP,    "udp_slot_table_entries" },
-        { CTL_SLOTTABLE_TCP,    "tcp_slot_table_entries" },
-        { CTL_MIN_RESVPORT,     "min_resvport" },
-        { CTL_MAX_RESVPORT,     "max_resvport" },
-        {}
-};
-static const struct trans_ctl_table trans_pm_table[] = {
-        { 1 /* CTL_PM_SUSPEND */,       "suspend" },
-        { 2 /* CTL_PM_CMODE */,         "cmode" },
-        { 3 /* CTL_PM_P0 */,            "p0" },
-        { 4 /* CTL_PM_CM */,            "cm" },
-        {}
-};
-static const struct trans_ctl_table trans_frv_table[] = {
-        { 1,    "cache-mode" },
-        { 2,    "pin-cxnr" },
-        {}
-};
-static const struct trans_ctl_table trans_root_table[] = {
-        { CTL_KERN,     "kernel",       trans_kern_table },
-        { CTL_VM,       "vm",           trans_vm_table },
-        { CTL_NET,      "net",          trans_net_table },
-        /* CTL_PROC not used */
-        { CTL_FS,       "fs",           trans_fs_table },
-        { CTL_DEBUG,    "debug",        trans_debug_table },
-        { CTL_DEV,      "dev",          trans_dev_table },
-        { CTL_BUS,      "bus",          trans_bus_table },
-        { CTL_ABI,      "abi" },
-        /* CTL_CPU not used */
-        { CTL_ARLAN,    "arlan",        trans_arlan_table },
-        { CTL_S390DBF,  "s390dbf",      trans_s390dbf_table },
-        { CTL_SUNRPC,   "sunrpc",       trans_sunrpc_table },
-        { CTL_PM,       "pm",           trans_pm_table },
-        { CTL_FRV,      "frv",          trans_frv_table },
-        {}
-};
 static int sysctl_depth(struct ctl_table *table)
 {
@@ -1261,47 +28,6 @@ static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
        return table;
 }
-static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table)
-{
-        struct ctl_table *test;
-        const struct trans_ctl_table *ref;
-        int cur_depth;
-        cur_depth = sysctl_depth(table);
-        ref = trans_root_table;
-repeat:
-        test = sysctl_parent(table, cur_depth);
-        for (; ref->ctl_name || ref->procname || ref->child; ref++) {
-                int match = 0;
-                if (cur_depth && !ref->child)
-                        continue;
-                if (test->procname && ref->procname &&
-                        (strcmp(test->procname, ref->procname) == 0))
-                        match++;
-                if (test->ctl_name && ref->ctl_name &&
-                        (test->ctl_name == ref->ctl_name))
-                        match++;
-                if (!ref->ctl_name && !ref->procname)
-                        match++;
-                if (match) {
-                        if (cur_depth != 0) {
-                                cur_depth--;
-                                ref = ref->child;
-                                goto repeat;
-                        }
-                        goto out;
-                }
-        }
-        ref = NULL;
-out:
-        return ref;
-}
 static void sysctl_print_path(struct ctl_table *table)
 {
@@ -1315,26 +41,6 @@ static void sysctl_print_path(struct ctl_table *table)
                }
        }
        printk(" ");
-        if (table->ctl_name) {
-                for (i = depth; i >= 0; i--) {
-                        tmp = sysctl_parent(table, i);
-                        printk(".%d", tmp->ctl_name);
-                }
-        }
-}
-static void sysctl_repair_table(struct ctl_table *table)
-{
-        /* Don't complain about the classic default
-         * sysctl strategy routine.  Maybe later we
-         * can get the tables fixed and complain about
-         * this.
-         */
-        if (table->ctl_name && table->procname &&
-                (table->proc_handler == proc_dointvec) &&
-                (!table->strategy)) {
-                table->strategy = sysctl_data;
-        }
 }
 static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
@@ -1352,7 +58,7 @@ static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
                ref = head->ctl_table;
 repeat:
                test = sysctl_parent(table, cur_depth);
-                for (; ref->ctl_name || ref->procname; ref++) {
+                for (; ref->procname; ref++) {
                        int match = 0;
                        if (cur_depth && !ref->child)
                                continue;
@@ -1361,10 +67,6 @@ repeat:
                            (strcmp(test->procname, ref->procname) == 0))
                                        match++;
-                        if (test->ctl_name && ref->ctl_name &&
-                            (test->ctl_name == ref->ctl_name))
-                                match++;
                        if (match) {
                                if (cur_depth != 0) {
                                        cur_depth--;
@@ -1392,38 +94,6 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str
        *fail = str;
 }
-static int sysctl_check_dir(struct nsproxy *namespaces,
-                                struct ctl_table *table)
-{
-        struct ctl_table *ref;
-        int error;
-        error = 0;
-        ref = sysctl_check_lookup(namespaces, table);
-        if (ref) {
-                int match = 0;
-                if ((!table->procname && !ref->procname) ||
-                    (table->procname && ref->procname &&
-                     (strcmp(table->procname, ref->procname) == 0)))
-                        match++;
-                if ((!table->ctl_name && !ref->ctl_name) ||
-                    (table->ctl_name && ref->ctl_name &&
-                     (table->ctl_name == ref->ctl_name)))
-                        match++;
-                if (match != 2) {
-                        printk(KERN_ERR "%s: failed: ", __func__);
-                        sysctl_print_path(table);
-                        printk(" ref: ");
-                        sysctl_print_path(ref);
-                        printk("\n");
-                        error = -EINVAL;
-                }
-        }
-        return error;
-}
 static void sysctl_check_leaf(struct nsproxy *namespaces,
                                struct ctl_table *table, const char **fail)
 {
@@ -1434,37 +104,15 @@ static void sysctl_check_leaf(struct nsproxy *namespaces,
                set_fail(fail, table, "Sysctl already exists");
 }
-static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
-{
-        const struct trans_ctl_table *ref;
-        ref = sysctl_binary_lookup(table);
-        if (table->ctl_name && !ref)
-                set_fail(fail, table, "Unknown sysctl binary path");
-        if (ref) {
-                if (ref->procname &&
-                    (!table->procname ||
-                     (strcmp(table->procname, ref->procname) != 0)))
-                        set_fail(fail, table, "procname does not match binary path procname");
-                if (ref->ctl_name && table->ctl_name &&
-                    (table->ctl_name != ref->ctl_name))
-                        set_fail(fail, table, "ctl_name does not match binary path ctl_name");
-        }
-}
 int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
 {
        int error = 0;
-        for (; table->ctl_name || table->procname; table++) {
+        for (; table->procname; table++) {
                const char *fail = NULL;
-                sysctl_repair_table(table);
                if (table->parent) {
                        if (table->procname && !table->parent->procname)
                                set_fail(&fail, table, "Parent without procname");
-                        if (table->ctl_name && !table->parent->ctl_name)
-                                set_fail(&fail, table, "Parent without ctl_name");
                }
                if (!table->procname)
                        set_fail(&fail, table, "No procname");
@@ -1477,21 +125,12 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
                                set_fail(&fail, table, "Writable sysctl directory");
                        if (table->proc_handler)
                                set_fail(&fail, table, "Directory with proc_handler");
-                        if (table->strategy)
-                                set_fail(&fail, table, "Directory with strategy");
                        if (table->extra1)
                                set_fail(&fail, table, "Directory with extra1");
                        if (table->extra2)
                                set_fail(&fail, table, "Directory with extra2");
-                        if (sysctl_check_dir(namespaces, table))
-                                set_fail(&fail, table, "Inconsistent directory names");
                } else {
-                        if ((table->strategy == sysctl_data) ||
+                        if ((table->proc_handler == proc_dostring) ||
-                            (table->strategy == sysctl_string) ||
-                            (table->strategy == sysctl_intvec) ||
-                            (table->strategy == sysctl_jiffies) ||
-                            (table->strategy == sysctl_ms_jiffies) ||
-                            (table->proc_handler == proc_dostring) ||
                            (table->proc_handler == proc_dointvec) ||
                            (table->proc_handler == proc_dointvec_minmax) ||
                            (table->proc_handler == proc_dointvec_jiffies) ||
@@ -1513,14 +152,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
                                                set_fail(&fail, table, "No max");
                                }
                        }
-#ifdef CONFIG_SYSCTL_SYSCALL
-                        if (table->ctl_name && !table->strategy)
-                                set_fail(&fail, table, "Missing strategy");
-#endif
-#if 0
-                        if (!table->ctl_name && table->strategy)
-                                set_fail(&fail, table, "Strategy without ctl_name");
-#endif
 #ifdef CONFIG_PROC_SYSCTL
                        if (table->procname && !table->proc_handler)
                                set_fail(&fail, table, "No proc_handler");
@@ -1531,7 +162,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
 #endif
                        sysctl_check_leaf(namespaces, table, &fail);
                }
-                sysctl_check_bin_path(table, &fail);
                if (table->mode > 0777)
                        set_fail(&fail, table, "bogus .mode");
                if (fail) {
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea8384d3caa7..11281d5792bd 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -22,6 +22,7 @@
 #include <linux/delayacct.h>
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
+#include <linux/slab.h>
 #include <linux/cgroupstats.h>
 #include <linux/cgroup.h>
 #include <linux/fs.h>
@@ -46,15 +47,13 @@ static struct genl_family family = {
        .maxattr        = TASKSTATS_CMD_ATTR_MAX,
 };
-static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
+static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
-__read_mostly = {
        [TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
        [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
        [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
        [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
-static struct nla_policy
+static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
-cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
        [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
 };
diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469a7fec..656dccfe1cbb 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -35,7 +35,6 @@
 #include <linux/syscalls.h>
 #include <linux/security.h>
 #include <linux/fs.h>
-#include <linux/slab.h>
 #include <linux/math64.h>
 #include <linux/ptrace.h>
@@ -662,6 +661,36 @@ u64 nsec_to_clock_t(u64 x)
 #endif
 }
+/**
+ * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ *
+ * @n:  nsecs in u64
+ *
+ * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
+ * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
+ * for scheduler, not for use in device drivers to calculate timeout value.
+ *
+ * The conversion formula is selected at compile time from the value of HZ.
+ *
+ * note:
+ *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
+ *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ */
+unsigned long nsecs_to_jiffies(u64 n)
+{
+#if (NSEC_PER_SEC % HZ) == 0
+        /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
+        return div_u64(n, NSEC_PER_SEC / HZ);
+#elif (HZ % 512) == 0
+        /*
+         * HZ shares the factor 512 with NSEC_PER_SEC (see note above), so
+         * both the multiplier and the divisor are scaled down by 512.
+         * overflow after 292 years if HZ = 1024
+         */
+        return div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
+#else
+        /*
+         * Generic case - optimized for cases where HZ is a multiple of 3.
+         * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
+         * The "+ HZ / 2" term rounds the divisor to the nearest integer.
+         */
+        return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
+#endif
+}
 #if (BITS_PER_LONG < 64)
 u64 get_jiffies_64(void)
 {
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 620b58abdc32..d7395fdfb9f3 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -20,6 +20,8 @@
 #include <linux/sysdev.h>
 #include <linux/tick.h>
+#include "tick-internal.h"
 /* The registered clock event devices */
 static LIST_HEAD(clockevent_devices);
 static LIST_HEAD(clockevents_released);
@@ -28,7 +30,7 @@ static LIST_HEAD(clockevents_released);
 static RAW_NOTIFIER_HEAD(clockevents_chain);
 /* Protection for the above */
-static DEFINE_SPINLOCK(clockevents_lock);
+static DEFINE_RAW_SPINLOCK(clockevents_lock);
 /**
 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
@@ -37,10 +39,9 @@ static DEFINE_SPINLOCK(clockevents_lock);
 *
 * Math helper, returns latch value converted to nanoseconds (bound checked)
 */
-unsigned long clockevent_delta2ns(unsigned long latch,
+u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
-                                  struct clock_event_device *evt)
 {
-        u64 clc = ((u64) latch << evt->shift);
+        u64 clc = (u64) latch << evt->shift;
        if (unlikely(!evt->mult)) {
                evt->mult = 1;
@@ -50,10 +51,10 @@ unsigned long clockevent_delta2ns(unsigned long latch,
        do_div(clc, evt->mult);
        if (clc < 1000)
                clc = 1000;
-        if (clc > LONG_MAX)
+        if (clc > KTIME_MAX)
-                clc = LONG_MAX;
+                clc = KTIME_MAX;
-        return (unsigned long) clc;
+        return clc;
 }
 EXPORT_SYMBOL_GPL(clockevent_delta2ns);
@@ -140,9 +141,9 @@ int clockevents_register_notifier(struct notifier_block *nb)
        unsigned long flags;
        int ret;
-        spin_lock_irqsave(&clockevents_lock, flags);
+        raw_spin_lock_irqsave(&clockevents_lock, flags);
        ret = raw_notifier_chain_register(&clockevents_chain, nb);
-        spin_unlock_irqrestore(&clockevents_lock, flags);
+        raw_spin_unlock_irqrestore(&clockevents_lock, flags);
        return ret;
 }
@@ -184,13 +185,13 @@ void clockevents_register_device(struct clock_event_device *dev)
        BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
        BUG_ON(!dev->cpumask);
-        spin_lock_irqsave(&clockevents_lock, flags);
+        raw_spin_lock_irqsave(&clockevents_lock, flags);
        list_add(&dev->list, &clockevent_devices);
        clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
        clockevents_notify_released();
-        spin_unlock_irqrestore(&clockevents_lock, flags);
+        raw_spin_unlock_irqrestore(&clockevents_lock, flags);
 }
 EXPORT_SYMBOL_GPL(clockevents_register_device);
@@ -237,10 +238,11 @@ void clockevents_exchange_device(struct clock_event_device *old,
 */
 void clockevents_notify(unsigned long reason, void *arg)
 {
-        struct list_head *node, *tmp;
+        struct clock_event_device *dev, *tmp;
        unsigned long flags;
+        int cpu;
-        spin_lock_irqsave(&clockevents_lock, flags);
+        raw_spin_lock_irqsave(&clockevents_lock, flags);
        clockevents_do_notify(reason, arg);
        switch (reason) {
@@ -249,13 +251,25 @@ void clockevents_notify(unsigned long reason, void *arg)
                 * Unregister the clock event devices which were
                 * released from the users in the notify chain.
                 */
-                list_for_each_safe(node, tmp, &clockevents_released)
+                list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
-                        list_del(node);
+                        list_del(&dev->list);
+                /*
+                 * Now check whether the CPU has left unused per cpu devices
+                 */
+                cpu = *((int *)arg);
+                list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
+                        if (cpumask_test_cpu(cpu, dev->cpumask) &&
+                            cpumask_weight(dev->cpumask) == 1 &&
+                            !tick_is_broadcast_device(dev)) {
+                                BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+                                list_del(&dev->list);
+                        }
+                }
                break;
        default:
                break;
        }
-        spin_unlock_irqrestore(&clockevents_lock, flags);
+        raw_spin_unlock_irqrestore(&clockevents_lock, flags);
 }
 EXPORT_SYMBOL_GPL(clockevents_notify);
 #endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 5e18c6ab2c6a..1f5dde637457 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -39,7 +39,7 @@ void timecounter_init(struct timecounter *tc,
        tc->cycle_last = cc->read(cc);
        tc->nsec = start_tstamp;
 }
-EXPORT_SYMBOL(timecounter_init);
+EXPORT_SYMBOL_GPL(timecounter_init);
 /**
 * timecounter_read_delta - get nanoseconds since last call of this function
@@ -83,7 +83,7 @@ u64 timecounter_read(struct timecounter *tc)
        return nsec;
 }
-EXPORT_SYMBOL(timecounter_read);
+EXPORT_SYMBOL_GPL(timecounter_read);
 u64 timecounter_cyc2time(struct timecounter *tc,
                         cycle_t cycle_tstamp)
@@ -105,7 +105,60 @@ u64 timecounter_cyc2time(struct timecounter *tc,
        return nsec;
 }
-EXPORT_SYMBOL(timecounter_cyc2time);
+EXPORT_SYMBOL_GPL(timecounter_cyc2time);
+/**
+ * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
+ * @mult:       pointer to mult variable
+ * @shift:      pointer to shift variable
+ * @from:       frequency to convert from
+ * @to:         frequency to convert to
+ * @minsec:     guaranteed runtime conversion range in seconds
+ *
+ * The function evaluates the shift/mult pair for the scaled math
+ * operations of clocksources and clockevents.
+ *
+ * @to and @from are frequency values in HZ. For clock sources @to is
+ * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
+ * event @to is the counter frequency and @from is NSEC_PER_SEC.
+ *
+ * The @minsec conversion range argument controls the time frame in
+ * seconds which must be covered by the runtime conversion with the
+ * calculated mult and shift factors. This guarantees that no 64bit
+ * overflow happens when the input value of the conversion is
+ * multiplied with the calculated mult factor. Larger ranges may
+ * reduce the conversion accuracy by choosing smaller mult and shift
+ * factors.
+ */
+void
+clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
+{
+        u64 tmp;
+        u32 sft, sftacc= 32;
+        /*
+         * Calculate the shift factor which is limiting the conversion
+         * range:
+         */
+        tmp = ((u64)minsec * from) >> 32;
+        while (tmp) {
+                tmp >>=1;
+                sftacc--;
+        }
+        /*
+         * Find the conversion shift/mult pair which has the best
+         * accuracy and fits the minsec conversion range:
+         */
+        for (sft = 32; sft > 0; sft--) {
+                tmp = (u64) to << sft;
+                do_div(tmp, from);
+                if ((tmp >> sftacc) == 0)
+                        break;
+        }
+        *mult = tmp;
+        *shift = sft;
+}
 /*[Clocksource internal variables]---------
 * curr_clocksource:
@@ -290,7 +343,19 @@ static void clocksource_resume_watchdog(void)
 {
        unsigned long flags;
-        spin_lock_irqsave(&watchdog_lock, flags);
+        /*
+         * We use trylock here to avoid a potential dead lock when
+         * kgdb calls this code after the kernel has been stopped with
+         * watchdog_lock held. When watchdog_lock is held we just
+         * return and accept, that the watchdog might trigger and mark
+         * the monitored clock source (usually TSC) unstable.
+         *
+         * This does not affect the other caller clocksource_resume()
+         * because at this point the kernel is UP, interrupts are
+         * disabled and nothing can hold watchdog_lock.
+         */
+        if (!spin_trylock_irqsave(&watchdog_lock, flags))
+                return;
        clocksource_reset_watchdog();
        spin_unlock_irqrestore(&watchdog_lock, flags);
 }
@@ -388,6 +453,18 @@ static inline int clocksource_watchdog_kthread(void *data) { return 0; }
 #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
 /**
+ * clocksource_suspend - suspend the clocksource(s)
+ *
+ * Walks clocksource_list in reverse order and invokes the ->suspend()
+ * callback of every clocksource that provides one.
+ */
+void clocksource_suspend(void)
+{
+        struct clocksource *cs;
+        list_for_each_entry_reverse(cs, &clocksource_list, list)
+                if (cs->suspend)
+                        cs->suspend(cs);
+}
+/**
 * clocksource_resume - resume the clocksource(s)
 */
 void clocksource_resume(void)
@@ -396,7 +473,7 @@ void clocksource_resume(void)
        list_for_each_entry(cs, &clocksource_list, list)
                if (cs->resume)
-                        cs->resume();
+                        cs->resume(cs);
        clocksource_resume_watchdog();
 }
@@ -405,14 +482,55 @@ void clocksource_resume(void)
 * clocksource_touch_watchdog - Update watchdog
 *
 * Update the watchdog after exception contexts such as kgdb so as not
- * to incorrectly trip the watchdog.
+ * to incorrectly trip the watchdog. This might fail when the kernel
- *
+ * was stopped in code which holds watchdog_lock.
 */
 void clocksource_touch_watchdog(void)
 {
        clocksource_resume_watchdog();
 }
+/**
+ * clocksource_max_deferment - Returns max time the clocksource can be deferred
+ * @cs:         Pointer to clocksource
+ *
+ * Returns the deferment limit in nanoseconds, reduced by a 12.5% margin.
+ */
+static u64 clocksource_max_deferment(struct clocksource *cs)
+{
+        u64 max_nsecs, max_cycles;
+        /*
+         * Calculate the maximum number of cycles that we can pass to the
+         * cyc2ns function without overflowing a 64-bit signed result. The
+         * maximum number of cycles is bounded by 2^63/cs->mult which
+         * is equivalent to the below.
+         * max_cycles < (2^63)/cs->mult
+         * max_cycles < 2^(log2((2^63)/cs->mult))
+         * max_cycles < 2^(log2(2^63) - log2(cs->mult))
+         * max_cycles < 2^(63 - log2(cs->mult))
+         * max_cycles < 1 << (63 - log2(cs->mult))
+         * Please note that we add 1 to the result of the log2 to account for
+         * any rounding errors, ensure the above inequality is satisfied and
+         * no overflow will occur.
+         */
+        max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
+        /*
+         * The actual maximum number of cycles we can defer the clocksource is
+         * determined by the minimum of max_cycles and cs->mask.
+         */
+        max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
+        max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
+        /*
+         * To ensure that the clocksource does not wrap whilst we are idle,
+         * limit the time the clocksource can be deferred by 12.5%. Please
+         * note a margin of 12.5% is used because this can be computed with
+         * a shift, versus say 10% which would require division.
+         * 12.5% is 1/8, hence the shift by 3 (a shift by 5 would only
+         * leave a 1/32 = 3.125% margin).
+         */
+        return max_nsecs - (max_nsecs >> 3);
+}
 #ifdef CONFIG_GENERIC_TIME
 /**
@@ -474,6 +592,10 @@ static inline void clocksource_select(void) { }
 */
 static int __init clocksource_done_booting(void)
 {
+        mutex_lock(&clocksource_mutex);
+        curr_clocksource = clocksource_default_clock();
+        mutex_unlock(&clocksource_mutex);
        finished_booting = 1;
        /*
@@ -511,6 +633,9 @@ static void clocksource_enqueue(struct clocksource *cs)
 */
 int clocksource_register(struct clocksource *cs)
 {
+        /* calculate max idle time permitted for this clocksource */
+        cs->max_idle_ns = clocksource_max_deferment(cs);
        mutex_lock(&clocksource_mutex);
        clocksource_enqueue(cs);
        clocksource_select();
@@ -580,7 +705,7 @@ sysfs_show_current_clocksources(struct sys_device *dev,
 * @count:      length of buffer
 *
 * Takes input from sysfs interface for manually overriding the default
- * clocksource selction.
+ * clocksource selection.
 */
 static ssize_t sysfs_override_clocksource(struct sys_device *dev,
                                          struct sysdev_attribute *attr,
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4800f933910e..7c0f180d6e9d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -58,10 +58,10 @@ static s64			time_offset;
 static long                     time_constant = 2;
 /* maximum error (usecs):                                               */
-long                            time_maxerror = NTP_PHASE_LIMIT;
+static long                     time_maxerror = NTP_PHASE_LIMIT;
 /* estimated error (usecs):                                             */
-long                            time_esterror = NTP_PHASE_LIMIT;
+static long                     time_esterror = NTP_PHASE_LIMIT;
 /* frequency offset (scaled nsecs/secs):                                */
 static s64                      time_freq;
@@ -142,11 +142,11 @@ static void ntp_update_offset(long offset)
         * Select how the frequency is to be controlled
         * and in which mode (PLL or FLL).
         */
-        secs = xtime.tv_sec - time_reftime;
+        secs = get_seconds() - time_reftime;
        if (unlikely(time_status & STA_FREQHOLD))
                secs = 0;
-        time_reftime = xtime.tv_sec;
+        time_reftime = get_seconds();
        offset64    = offset;
        freq_adj    = (offset64 * secs) <<
@@ -368,7 +368,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
         * reference time to current time.
         */
        if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
-                time_reftime = xtime.tv_sec;
+                time_reftime = get_seconds();
        /* only set allowed bits */
        time_status &= STA_RONLY;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index c2ec25087a35..b3bafd5fc66d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -31,7 +31,7 @@ static struct tick_device tick_broadcast_device;
 /* FIXME: Use cpumask_var_t. */
 static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
 static DECLARE_BITMAP(tmpmask, NR_CPUS);
-static DEFINE_SPINLOCK(tick_broadcast_lock);
+static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
 static int tick_broadcast_force;
 #ifdef CONFIG_TICK_ONESHOT
@@ -96,7 +96,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
        unsigned long flags;
        int ret = 0;
-        spin_lock_irqsave(&tick_broadcast_lock, flags);
+        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
        /*
         * Devices might be registered with both periodic and oneshot
@@ -122,7 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
                        tick_broadcast_clear_oneshot(cpu);
                }
        }
-        spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
        return ret;
 }
@@ -161,13 +161,13 @@ static void tick_do_broadcast(struct cpumask *mask)
 */
 static void tick_do_periodic_broadcast(void)
 {
-        spin_lock(&tick_broadcast_lock);
+        raw_spin_lock(&tick_broadcast_lock);
        cpumask_and(to_cpumask(tmpmask),
                    cpu_online_mask, tick_get_broadcast_mask());
        tick_do_broadcast(to_cpumask(tmpmask));
-        spin_unlock(&tick_broadcast_lock);
+        raw_spin_unlock(&tick_broadcast_lock);
 }
 /*
@@ -212,7 +212,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
        unsigned long flags;
        int cpu, bc_stopped;
-        spin_lock_irqsave(&tick_broadcast_lock, flags);
+        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
        cpu = smp_processor_id();
        td = &per_cpu(tick_cpu_device, cpu);
@@ -263,7 +263,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
                        tick_broadcast_setup_oneshot(bc);
        }
 out:
-        spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 /*
@@ -299,7 +299,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
        unsigned long flags;
        unsigned int cpu = *cpup;
-        spin_lock_irqsave(&tick_broadcast_lock, flags);
+        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
        bc = tick_broadcast_device.evtdev;
        cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
@@ -309,7 +309,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
                        clockevents_shutdown(bc);
        }
-        spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 void tick_suspend_broadcast(void)
@@ -317,13 +317,13 @@ void tick_suspend_broadcast(void)
        struct clock_event_device *bc;
        unsigned long flags;
-        spin_lock_irqsave(&tick_broadcast_lock, flags);
+        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
        bc = tick_broadcast_device.evtdev;
        if (bc)
                clockevents_shutdown(bc);
-        spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 int tick_resume_broadcast(void)
@@ -332,7 +332,7 @@ int tick_resume_broadcast(void)
        unsigned long flags;
        int broadcast = 0;
-        spin_lock_irqsave(&tick_broadcast_lock, flags);
+        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
        bc = tick_broadcast_device.evtdev;
@@ -351,7 +351,7 @@ int tick_resume_broadcast(void)
                        break;
                }
        }
-        spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
        return broadcast;
 }
@@ -405,7 +405,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
        ktime_t now, next_event;
        int cpu;
-        spin_lock(&tick_broadcast_lock);
+        raw_spin_lock(&tick_broadcast_lock);
 again:
        dev->next_event.tv64 = KTIME_MAX;
        next_event.tv64 = KTIME_MAX;
@@ -443,7 +443,7 @@ again:
                if (tick_broadcast_set_event(next_event, 0))
                        goto again;
        }
-        spin_unlock(&tick_broadcast_lock);
+        raw_spin_unlock(&tick_broadcast_lock);
 }
 /*
@@ -457,7 +457,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
        unsigned long flags;
        int cpu;
-        spin_lock_irqsave(&tick_broadcast_lock, flags);
+        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
        /*
         * Periodic mode does not care about the enter/exit of power
@@ -492,7 +492,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
        }
 out:
-        spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 /*
@@ -563,13 +563,13 @@ void tick_broadcast_switch_to_oneshot(void)
        struct clock_event_device *bc;
        unsigned long flags;
-        spin_lock_irqsave(&tick_broadcast_lock, flags);
+        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
        tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
        bc = tick_broadcast_device.evtdev;
        if (bc)
                tick_broadcast_setup_oneshot(bc);
-        spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
@@ -581,7 +581,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
        unsigned long flags;
        unsigned int cpu = *cpup;
-        spin_lock_irqsave(&tick_broadcast_lock, flags);
+        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
        /*
         * Clear the broadcast mask flag for the dead cpu, but do not
@@ -589,7 +589,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
         */
        cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
-        spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 /*
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 83c4417b6a3c..b6b898d2eeef 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -34,7 +34,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
 ktime_t tick_next_period;
 ktime_t tick_period;
 int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
-DEFINE_SPINLOCK(tick_device_lock);
+static DEFINE_RAW_SPINLOCK(tick_device_lock);
 /*
 * Debugging: see timer_list.c
@@ -209,7 +209,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
        int cpu, ret = NOTIFY_OK;
        unsigned long flags;
-        spin_lock_irqsave(&tick_device_lock, flags);
+        raw_spin_lock_irqsave(&tick_device_lock, flags);
        cpu = smp_processor_id();
        if (!cpumask_test_cpu(cpu, newdev->cpumask))
@@ -268,7 +268,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
        if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
                tick_oneshot_notify();
-        spin_unlock_irqrestore(&tick_device_lock, flags);
+        raw_spin_unlock_irqrestore(&tick_device_lock, flags);
        return NOTIFY_STOP;
 out_bc:
@@ -278,7 +278,7 @@ out_bc:
        if (tick_check_broadcast_device(newdev))
                ret = NOTIFY_STOP;
-        spin_unlock_irqrestore(&tick_device_lock, flags);
+        raw_spin_unlock_irqrestore(&tick_device_lock, flags);
        return ret;
 }
@@ -311,7 +311,7 @@ static void tick_shutdown(unsigned int *cpup)
        struct clock_event_device *dev = td->evtdev;
        unsigned long flags;
-        spin_lock_irqsave(&tick_device_lock, flags);
+        raw_spin_lock_irqsave(&tick_device_lock, flags);
        td->mode = TICKDEV_MODE_PERIODIC;
        if (dev) {
                /*
@@ -322,7 +322,7 @@ static void tick_shutdown(unsigned int *cpup)
                clockevents_exchange_device(dev, NULL);
                td->evtdev = NULL;
        }
-        spin_unlock_irqrestore(&tick_device_lock, flags);
+        raw_spin_unlock_irqrestore(&tick_device_lock, flags);
 }
 static void tick_suspend(void)
@@ -330,9 +330,9 @@ static void tick_suspend(void)
        struct tick_device *td = &__get_cpu_var(tick_cpu_device);
        unsigned long flags;
-        spin_lock_irqsave(&tick_device_lock, flags);
+        raw_spin_lock_irqsave(&tick_device_lock, flags);
        clockevents_shutdown(td->evtdev);
-        spin_unlock_irqrestore(&tick_device_lock, flags);
+        raw_spin_unlock_irqrestore(&tick_device_lock, flags);
 }
 static void tick_resume(void)
@@ -341,7 +341,7 @@ static void tick_resume(void)
        unsigned long flags;
        int broadcast = tick_resume_broadcast();
-        spin_lock_irqsave(&tick_device_lock, flags);
+        raw_spin_lock_irqsave(&tick_device_lock, flags);
        clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
        if (!broadcast) {
@@ -350,7 +350,7 @@ static void tick_resume(void)
                else
                        tick_resume_oneshot();
        }
-        spin_unlock_irqrestore(&tick_device_lock, flags);
+        raw_spin_unlock_irqrestore(&tick_device_lock, flags);
 }
 /*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index b1c05bf75ee0..290eefbc1f60 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -6,7 +6,6 @@
 #define TICK_DO_TIMER_BOOT      -2
 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
-extern spinlock_t tick_device_lock;
 extern ktime_t tick_next_period;
 extern ktime_t tick_period;
 extern int tick_do_timer_cpu __read_mostly;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index a96c0e2b89cf..aada0e52680a 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -22,6 +22,29 @@
 #include "tick-internal.h"
+/* Limit min_delta to a jiffie */
+#define MIN_DELTA_LIMIT         (NSEC_PER_SEC / HZ)
+/*
+ * Raise dev->min_delta_ns: a value below 5000 is set to 5000, any other
+ * value grows by 50%, and the result is capped at MIN_DELTA_LIMIT. The
+ * new value is reported with a warning.
+ *
+ * Returns 0 after raising the value, or -ETIME when min_delta_ns had
+ * already reached MIN_DELTA_LIMIT and nothing was changed.
+ */
+static int tick_increase_min_delta(struct clock_event_device *dev)
+{
+        /* Nothing to do if we already reached the limit */
+        if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
+                return -ETIME;
+        if (dev->min_delta_ns < 5000)
+                dev->min_delta_ns = 5000;
+        else
+                dev->min_delta_ns += dev->min_delta_ns >> 1;
+        /* Never let the minimum delta exceed one tick period */
+        if (dev->min_delta_ns > MIN_DELTA_LIMIT)
+                dev->min_delta_ns = MIN_DELTA_LIMIT;
+        printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
+               dev->name ? dev->name : "?",
+               (unsigned long long) dev->min_delta_ns);
+        return 0;
+}
 /**
 * tick_program_event internal worker function
 */
@@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
                if (!ret || !force)
                        return ret;
+                dev->retries++;
                /*
-                 * We tried 2 times to program the device with the given
+                 * We tried 3 times to program the device with the given
-                 * min_delta_ns. If that's not working then we double it
+                 * min_delta_ns. If that's not working then we increase it
                 * and emit a warning.
                 */
                if (++i > 2) {
                        /* Increase the min. delta and try again */
-                        if (!dev->min_delta_ns)
+                        if (tick_increase_min_delta(dev)) {
-                                dev->min_delta_ns = 5000;
+                                /*
-                        else
+                                 * Get out of the loop if min_delta_ns
-                                dev->min_delta_ns += dev->min_delta_ns >> 1;
+                                 * hit the limit already. That's
+                                 * better than staying here forever.
-                        printk(KERN_WARNING
+                                 *
-                               "CE: %s increasing min_delta_ns to %lu nsec\n",
+                                 * We clear next_event so we have a
-                               dev->name ? dev->name : "?",
+                                 * chance that the box survives.
-                               dev->min_delta_ns << 1);
+                                 */
+                                printk(KERN_WARNING
+                                       "CE: Reprogramming failure. Giving up\n");
+                                dev->next_event.tv64 = KTIME_MAX;
+                                return -ETIME;
+                        }
                        i = 0;
                }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index dcbff7515489..0adc54bd7c7c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz);
 * value. We do this unconditionally on any cpu, as we don't know whether the
 * cpu, which has the update task assigned is in a long sleep.
 */
-static void tick_nohz_update_jiffies(void)
+static void tick_nohz_update_jiffies(ktime_t now)
 {
        int cpu = smp_processor_id();
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
        unsigned long flags;
-        ktime_t now;
-        if (!ts->tick_stopped)
-                return;
        cpumask_clear_cpu(cpu, nohz_cpu_mask);
-        now = ktime_get();
        ts->idle_waketime = now;
        local_irq_save(flags);
@@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(void)
        touch_softlockup_watchdog();
 }
-static void tick_nohz_stop_idle(int cpu)
+static void tick_nohz_stop_idle(int cpu, ktime_t now)
 {
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+        ktime_t delta;
-        if (ts->idle_active) {
+        delta = ktime_sub(now, ts->idle_entrytime);
-                ktime_t now, delta;
+        ts->idle_lastupdate = now;
-                now = ktime_get();
+        ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-                delta = ktime_sub(now, ts->idle_entrytime);
+        ts->idle_active = 0;
-                ts->idle_lastupdate = now;
-                ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-                ts->idle_active = 0;
-                sched_clock_idle_wakeup_event(0);
+        sched_clock_idle_wakeup_event(0);
-        }
 }
 static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -216,6 +208,7 @@ void tick_nohz_stop_sched_tick(int inidle)
        struct tick_sched *ts;
        ktime_t last_update, expires, now;
        struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+        u64 time_delta;
        int cpu;
        local_irq_save(flags);
@@ -263,7 +256,7 @@ void tick_nohz_stop_sched_tick(int inidle)
                if (ratelimit < 10) {
                        printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
-                               local_softirq_pending());
+                               (unsigned int) local_softirq_pending());
                        ratelimit++;
                }
                goto end;
@@ -275,14 +268,18 @@ void tick_nohz_stop_sched_tick(int inidle)
                seq = read_seqbegin(&xtime_lock);
                last_update = last_jiffies_update;
                last_jiffies = jiffies;
+                time_delta = timekeeping_max_deferment();
        } while (read_seqretry(&xtime_lock, seq));
-        /* Get the next timer wheel timer */
+        if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
-        next_jiffies = get_next_timer_interrupt(last_jiffies);
+            arch_needs_cpu(cpu)) {
-        delta_jiffies = next_jiffies - last_jiffies;
+                next_jiffies = last_jiffies + 1;
-        if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
                delta_jiffies = 1;
+        } else {
+                /* Get the next timer wheel timer */
+                next_jiffies = get_next_timer_interrupt(last_jiffies);
+                delta_jiffies = next_jiffies - last_jiffies;
+        }
        /*
         * Do not stop the tick, if we are only one off
         * or if the cpu is required for rcu
@@ -294,22 +291,51 @@ void tick_nohz_stop_sched_tick(int inidle)
        if ((long)delta_jiffies >= 1) {
                /*
-                * calculate the expiry time for the next timer wheel
-                * timer
-                */
-                expires = ktime_add_ns(last_update, tick_period.tv64 *
-                                   delta_jiffies);
-                /*
                 * If this cpu is the one which updates jiffies, then
                 * give up the assignment and let it be taken by the
                 * cpu which runs the tick timer next, which might be
                 * this cpu as well. If we don't drop this here the
                 * jiffies might be stale and do_timer() never
-                 * invoked.
+                 * invoked. Keep track of the fact that it was the one
+                 * which had the do_timer() duty last. If this cpu is
+                 * the one which had the do_timer() duty last, we
+                 * limit the sleep time to the timekeeping
+                 * max_deferment value which we retrieved
+                 * above. Otherwise we can sleep as long as we want.
                 */
-                if (cpu == tick_do_timer_cpu)
+                if (cpu == tick_do_timer_cpu) {
                        tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+                        ts->do_timer_last = 1;
+                } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
+                        time_delta = KTIME_MAX;
+                        ts->do_timer_last = 0;
+                } else if (!ts->do_timer_last) {
+                        time_delta = KTIME_MAX;
+                }
+                /*
+                 * calculate the expiry time for the next timer wheel
+                 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
+                 * that there is no timer pending or at least extremely
+                 * far into the future (12 days for HZ=1000). In this
+                 * case we set the expiry to the end of time.
+                 */
+                if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
+                        /*
+                         * Calculate the time delta for the next timer event.
+                         * If the time delta exceeds the maximum time delta
+                         * permitted by the current clocksource then adjust
+                         * the time delta accordingly to ensure the
+                         * clocksource does not wrap.
+                         */
+                        time_delta = min_t(u64, time_delta,
+                                           tick_period.tv64 * delta_jiffies);
+                }
+                if (time_delta < KTIME_MAX)
+                        expires = ktime_add_ns(last_update, time_delta);
+                else
+                        expires.tv64 = KTIME_MAX;
                if (delta_jiffies > 1)
                        cpumask_set_cpu(cpu, nohz_cpu_mask);
@@ -342,22 +368,19 @@ void tick_nohz_stop_sched_tick(int inidle)
                ts->idle_sleeps++;
+                /* Mark expires */
+                ts->idle_expires = expires;
                /*
-                 * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that
+                 * If the expiration time == KTIME_MAX, then
-                 * there is no timer pending or at least extremly far
+                 * we simply stop the tick timer.
-                 * into the future (12 days for HZ=1000). In this case
-                 * we simply stop the tick timer:
                 */
-                if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) {
+                 if (unlikely(expires.tv64 == KTIME_MAX)) {
-                        ts->idle_expires.tv64 = KTIME_MAX;
                        if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
                                hrtimer_cancel(&ts->sched_timer);
                        goto out;
                }
-                /* Mark expiries */
-                ts->idle_expires = expires;
                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
                        hrtimer_start(&ts->sched_timer, expires,
                                      HRTIMER_MODE_ABS_PINNED);
@@ -436,7 +459,11 @@ void tick_nohz_restart_sched_tick(void)
        ktime_t now;
        local_irq_disable();
-        tick_nohz_stop_idle(cpu);
+        if (ts->idle_active || (ts->inidle && ts->tick_stopped))
+                now = ktime_get();
+        if (ts->idle_active)
+                tick_nohz_stop_idle(cpu, now);
        if (!ts->inidle || !ts->tick_stopped) {
                ts->inidle = 0;
@@ -450,7 +477,6 @@ void tick_nohz_restart_sched_tick(void)
        /* Update jiffies first */
        select_nohz_load_balancer(0);
-        now = ktime_get();
        tick_do_update_jiffies64(now);
        cpumask_clear_cpu(cpu, nohz_cpu_mask);
@@ -584,22 +610,18 @@ static void tick_nohz_switch_to_nohz(void)
 * timer and do not touch the other magic bits which need to be done
 * when idle is left.
 */
-static void tick_nohz_kick_tick(int cpu)
+static void tick_nohz_kick_tick(int cpu, ktime_t now)
 {
 #if 0
        /* Switch back to 2.6.27 behaviour */
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-        ktime_t delta, now;
+        ktime_t delta;
-        if (!ts->tick_stopped)
-                return;
        /*
         * Do not touch the tick device, when the next expiry is either
         * already reached or less/equal than the tick period.
         */
-        now = ktime_get();
        delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
        if (delta.tv64 <= tick_period.tv64)
                return;
@@ -608,9 +630,26 @@ static void tick_nohz_kick_tick(int cpu)
 #endif
 }
+static inline void tick_check_nohz(int cpu)
+{
+        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+        ktime_t now;
+        if (!ts->idle_active && !ts->tick_stopped)
+                return;
+        now = ktime_get();
+        if (ts->idle_active)
+                tick_nohz_stop_idle(cpu, now);
+        if (ts->tick_stopped) {
+                tick_nohz_update_jiffies(now);
+                tick_nohz_kick_tick(cpu, now);
+        }
+}
 #else
 static inline void tick_nohz_switch_to_nohz(void) { }
+static inline void tick_check_nohz(int cpu) { }
 #endif /* NO_HZ */
@@ -620,11 +659,7 @@ static inline void tick_nohz_switch_to_nohz(void) { }
 void tick_check_idle(int cpu)
 {
        tick_check_oneshot_broadcast(cpu);
-#ifdef CONFIG_NO_HZ
+        tick_check_nohz(cpu);
-        tick_nohz_stop_idle(cpu);
-        tick_nohz_update_jiffies();
-        tick_nohz_kick_tick(cpu);
-#endif
 }
 /*
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 71e7f1a19156..ac38fbb176cc 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -19,6 +19,7 @@
 #include <linux/timecompare.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/math64.h>
 /*
@@ -40,7 +41,7 @@ ktime_t timecompare_transform(struct timecompare *sync,
        return ns_to_ktime(nsec);
 }
-EXPORT_SYMBOL(timecompare_transform);
+EXPORT_SYMBOL_GPL(timecompare_transform);
 int timecompare_offset(struct timecompare *sync,
                       s64 *offset,
@@ -89,7 +90,7 @@ int timecompare_offset(struct timecompare *sync,
                         * source time
                         */
                        sample.offset =
-                                ktime_to_ns(ktime_add(end, start)) / 2 -
+                                (ktime_to_ns(end) + ktime_to_ns(start)) / 2 -
                                ts;
                        /* simple insertion sort based on duration */
@@ -131,7 +132,7 @@ int timecompare_offset(struct timecompare *sync,
        return used;
 }
-EXPORT_SYMBOL(timecompare_offset);
+EXPORT_SYMBOL_GPL(timecompare_offset);
 void __timecompare_update(struct timecompare *sync,
                          u64 source_tstamp)
@@ -188,4 +189,4 @@ void __timecompare_update(struct timecompare *sync,
                }
        }
 }
-EXPORT_SYMBOL(__timecompare_update);
+EXPORT_SYMBOL_GPL(__timecompare_update);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c3a4e2907eaa..39f6177fafac 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -177,7 +177,7 @@ void timekeeping_leap_insert(int leapsecond)
 {
        xtime.tv_sec += leapsecond;
        wall_to_monotonic.tv_sec -= leapsecond;
-        update_vsyscall(&xtime, timekeeper.clock);
+        update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
 }
 #ifdef CONFIG_GENERIC_TIME
@@ -337,7 +337,7 @@ int do_settimeofday(struct timespec *tv)
        timekeeper.ntp_error = 0;
        ntp_clear();
-        update_vsyscall(&xtime, timekeeper.clock);
+        update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
        write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -488,6 +488,17 @@ int timekeeping_valid_for_hres(void)
 }
 /**
+ * timekeeping_max_deferment - Returns max time the clocksource can be deferred
+ *
+ * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
+ * ensure that the clocksource does not change!
+ */
+u64 timekeeping_max_deferment(void)
+{
+        return timekeeper.clock->max_idle_ns;
+}
+/**
 * read_persistent_clock -  Return time from the persistent clock.
 *
 * Weak dummy function for arches that do not yet support it.
@@ -611,6 +622,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
        write_sequnlock_irqrestore(&xtime_lock, flags);
        clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+        clocksource_suspend();
        return 0;
 }
@@ -722,6 +734,51 @@ static void timekeeping_adjust(s64 offset)
                                timekeeper.ntp_error_shift;
 }
+/**
+ * logarithmic_accumulation - shifted accumulation of cycles
+ * @offset:	number of clocksource cycles that still have to be accumulated
+ * @shift:	log2 of the number of cycle_intervals to consume in one step
+ *
+ * This function accumulates a shifted interval of cycles into
+ * a shifted interval of nanoseconds. Allows for an O(log)
+ * accumulation loop.
+ *
+ * Returns the unconsumed cycles.
+ */
+static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
+{
+        u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
+        /* If the offset is smaller than a shifted interval, do nothing */
+        if (offset < timekeeper.cycle_interval << shift)
+                return offset;
+        /* Accumulate one shifted interval */
+        offset -= timekeeper.cycle_interval << shift;
+        timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
+        timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
+        /* A shifted interval can carry more than one second, hence the loop */
+        while (timekeeper.xtime_nsec >= nsecps) {
+                timekeeper.xtime_nsec -= nsecps;
+                xtime.tv_sec++;
+                second_overflow();
+        }
+        /* Accumulate into raw time */
+        raw_time.tv_nsec += timekeeper.raw_interval << shift;
+        while (raw_time.tv_nsec >= NSEC_PER_SEC) {
+                raw_time.tv_nsec -= NSEC_PER_SEC;
+                raw_time.tv_sec++;
+        }
+        /* Accumulate error between NTP and clock interval */
+        timekeeper.ntp_error += tick_length << shift;
+        timekeeper.ntp_error -= timekeeper.xtime_interval <<
+                                (timekeeper.ntp_error_shift + shift);
+        return offset;
+}
 /**
 * update_wall_time - Uses the current clocksource to increment the wall time
 *
@@ -732,6 +789,7 @@ void update_wall_time(void)
        struct clocksource *clock;
        cycle_t offset;
        u64 nsecs;
+        int shift = 0, maxshift;
        /* Make sure we're fully resumed: */
        if (unlikely(timekeeping_suspended))
@@ -745,33 +803,23 @@ void update_wall_time(void)
 #endif
        timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
-        /* normally this loop will run just once, however in the
+        /*
-         * case of lost or late ticks, it will accumulate correctly.
+         * With NO_HZ we may have to accumulate many cycle_intervals
+         * (think "ticks") worth of time at once. To do this efficiently,
+         * we calculate the largest doubling multiple of cycle_intervals
+         * that is smaller than the offset. We then accumulate that
+         * chunk in one go, and then try to consume the next smaller
+         * doubled multiple.
         */
+        shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
+        shift = max(0, shift);
+        /* Bound shift to one less than what overflows tick_length */
+        maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
+        shift = min(shift, maxshift);
        while (offset >= timekeeper.cycle_interval) {
-                u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
+                offset = logarithmic_accumulation(offset, shift);
+                if(offset < timekeeper.cycle_interval<<shift)
-                /* accumulate one interval */
+                        shift--;
-                offset -= timekeeper.cycle_interval;
-                clock->cycle_last += timekeeper.cycle_interval;
-                timekeeper.xtime_nsec += timekeeper.xtime_interval;
-                if (timekeeper.xtime_nsec >= nsecps) {
-                        timekeeper.xtime_nsec -= nsecps;
-                        xtime.tv_sec++;
-                        second_overflow();
-                }
-                raw_time.tv_nsec += timekeeper.raw_interval;
-                if (raw_time.tv_nsec >= NSEC_PER_SEC) {
-                        raw_time.tv_nsec -= NSEC_PER_SEC;
-                        raw_time.tv_sec++;
-                }
-                /* accumulate error between NTP and clock interval */
-                timekeeper.ntp_error += tick_length;
-                timekeeper.ntp_error -= timekeeper.xtime_interval <<
-                                        timekeeper.ntp_error_shift;
        }
        /* correct the clock when NTP error is too big */
@@ -811,7 +859,7 @@ void update_wall_time(void)
        update_xtime_cache(nsecs);
        /* check to see if there is a new clocksource to use */
-        update_vsyscall(&xtime, timekeeper.clock);
+        update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
 }
 /**
@@ -834,6 +882,7 @@ void getboottime(struct timespec *ts)
        set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
 }
+EXPORT_SYMBOL_GPL(getboottime);
 /**
 * monotonic_to_bootbased - Convert the monotonic time to boot based.
@@ -843,6 +892,7 @@ void monotonic_to_bootbased(struct timespec *ts)
 {
        *ts = timespec_add_safe(*ts, total_sleep_time);
 }
+EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
 unsigned long get_seconds(void)
 {
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1b5b7aa2fdfd..1a4a7dd78777 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -84,7 +84,7 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
 next_one:
        i = 0;
-        spin_lock_irqsave(&base->cpu_base->lock, flags);
+        raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
        curr = base->first;
        /*
@@ -100,13 +100,13 @@ next_one:
                timer = rb_entry(curr, struct hrtimer, node);
                tmp = *timer;
-                spin_unlock_irqrestore(&base->cpu_base->lock, flags);
+                raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
                print_timer(m, timer, &tmp, i, now);
                next++;
                goto next_one;
        }
-        spin_unlock_irqrestore(&base->cpu_base->lock, flags);
+        raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
 }
 static void
@@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
        P_ns(expires_next);
        P(hres_active);
        P(nr_events);
+        P(nr_retries);
+        P(nr_hangs);
+        P_ns(max_hang_time);
 #endif
 #undef P
 #undef P_ns
@@ -204,10 +207,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
                return;
        }
        SEQ_printf(m, "%s\n", dev->name);
-        SEQ_printf(m, " max_delta_ns:   %lu\n", dev->max_delta_ns);
+        SEQ_printf(m, " max_delta_ns:   %llu\n",
-        SEQ_printf(m, " min_delta_ns:   %lu\n", dev->min_delta_ns);
+                   (unsigned long long) dev->max_delta_ns);
-        SEQ_printf(m, " mult:           %lu\n", dev->mult);
+        SEQ_printf(m, " min_delta_ns:   %llu\n",
-        SEQ_printf(m, " shift:          %d\n", dev->shift);
+                   (unsigned long long) dev->min_delta_ns);
+        SEQ_printf(m, " mult:           %u\n", dev->mult);
+        SEQ_printf(m, " shift:          %u\n", dev->shift);
        SEQ_printf(m, " mode:           %d\n", dev->mode);
        SEQ_printf(m, " next_event:     %Ld nsecs\n",
                   (unsigned long long) ktime_to_ns(dev->next_event));
@@ -223,6 +228,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
        SEQ_printf(m, " event_handler:  ");
        print_name_offset(m, dev->event_handler);
        SEQ_printf(m, "\n");
+        SEQ_printf(m, " retries:        %lu\n", dev->retries);
 }
 static void timer_list_show_tickdevices(struct seq_file *m)
@@ -232,10 +238,10 @@ static void timer_list_show_tickdevices(struct seq_file *m)
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
        print_tickdevice(m, tick_get_broadcast_device(), -1);
        SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
-                   tick_get_broadcast_mask()->bits[0]);
+                   cpumask_bits(tick_get_broadcast_mask())[0]);
 #ifdef CONFIG_TICK_ONESHOT
        SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
-                   tick_get_broadcast_oneshot_mask()->bits[0]);
+                   cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
 #endif
        SEQ_printf(m, "\n");
 #endif
@@ -252,7 +258,7 @@ static int timer_list_show(struct seq_file *m, void *v)
        u64 now = ktime_to_ns(ktime_get());
        int cpu;
-        SEQ_printf(m, "Timer List Version: v0.4\n");
+        SEQ_printf(m, "Timer List Version: v0.6\n");
        SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
        SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index ee5681f8d7ec..2f3b585b8d7d 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -86,7 +86,7 @@ static DEFINE_SPINLOCK(table_lock);
 /*
 * Per-CPU lookup locks for fast hash lookup:
 */
-static DEFINE_PER_CPU(spinlock_t, lookup_lock);
+static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock);
 /*
 * Mutex to serialize state changes with show-stats activities:
@@ -238,14 +238,14 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
        /*
         * It doesnt matter which lock we take:
         */
-        spinlock_t *lock;
+        raw_spinlock_t *lock;
        struct entry *entry, input;
        unsigned long flags;
        if (likely(!timer_stats_active))
                return;
-        lock = &per_cpu(lookup_lock, raw_smp_processor_id());
+        lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id());
        input.timer = timer;
        input.start_func = startf;
@@ -253,7 +253,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
        input.pid = pid;
        input.timer_flag = timer_flag;
-        spin_lock_irqsave(lock, flags);
+        raw_spin_lock_irqsave(lock, flags);
        if (!timer_stats_active)
                goto out_unlock;
@@ -264,7 +264,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
                atomic_inc(&overflow_count);
 out_unlock:
-        spin_unlock_irqrestore(lock, flags);
+        raw_spin_unlock_irqrestore(lock, flags);
 }
 static void print_name_offset(struct seq_file *m, unsigned long addr)
@@ -348,9 +348,11 @@ static void sync_access(void)
        int cpu;
        for_each_online_cpu(cpu) {
-                spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags);
+                raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu);
+                raw_spin_lock_irqsave(lock, flags);
                /* nothing */
-                spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags);
+                raw_spin_unlock_irqrestore(lock, flags);
        }
 }
@@ -408,7 +410,7 @@ void __init init_timer_stats(void)
        int cpu;
        for_each_possible_cpu(cpu)
-                spin_lock_init(&per_cpu(lookup_lock, cpu));
+                raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu));
 }
 static int __init init_tstats_procfs(void)
diff --git a/kernel/timer.c b/kernel/timer.c
index 5db5a8d26811..aeb6a54f2771 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
 #include <linux/kallsyms.h>
 #include <linux/perf_event.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -656,8 +657,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
        debug_activate(timer, expires);
-        new_base = __get_cpu_var(tvec_bases);
        cpu = smp_processor_id();
 #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
@@ -882,6 +881,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
        if (base->running_timer == timer)
                goto out;
+        timer_stats_timer_clear_start_info(timer);
        ret = 0;
        if (timer_pending(timer)) {
                detach_timer(timer, 1);
@@ -1200,6 +1200,7 @@ void update_process_times(int user_tick)
        run_local_timers();
        rcu_check_callbacks(cpu, user_tick);
        printk_tick();
+        perf_event_do_pending();
        scheduler_tick();
        run_posix_cpu_timers(p);
 }
@@ -1211,8 +1212,6 @@ static void run_timer_softirq(struct softirq_action *h)
 {
        struct tvec_base *base = __get_cpu_var(tvec_bases);
-        perf_event_do_pending();
        hrtimer_run_pending();
        if (time_after_eq(jiffies, base->timer_jiffies))
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b416512ad17f..13e13d428cd3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -12,39 +12,37 @@ config NOP_TRACER
 config HAVE_FTRACE_NMI_ENTER
        bool
        help
-          See Documentation/trace/ftrace-implementation.txt
+          See Documentation/trace/ftrace-design.txt
 config HAVE_FUNCTION_TRACER
        bool
        help
-          See Documentation/trace/ftrace-implementation.txt
+          See Documentation/trace/ftrace-design.txt
 config HAVE_FUNCTION_GRAPH_TRACER
        bool
        help
-          See Documentation/trace/ftrace-implementation.txt
+          See Documentation/trace/ftrace-design.txt
 config HAVE_FUNCTION_GRAPH_FP_TEST
        bool
        help
-         An arch may pass in a unique value (frame pointer) to both the
+          See Documentation/trace/ftrace-design.txt
-         entering and exiting of a function. On exit, the value is compared
-         and if it does not match, then it will panic the kernel.
 config HAVE_FUNCTION_TRACE_MCOUNT_TEST
        bool
        help
-          See Documentation/trace/ftrace-implementation.txt
+          See Documentation/trace/ftrace-design.txt
 config HAVE_DYNAMIC_FTRACE
        bool
        help
-          See Documentation/trace/ftrace-implementation.txt
+          See Documentation/trace/ftrace-design.txt
 config HAVE_FTRACE_MCOUNT_RECORD
        bool
        help
-          See Documentation/trace/ftrace-implementation.txt
+          See Documentation/trace/ftrace-design.txt
 config HAVE_HW_BRANCH_TRACER
        bool
@@ -52,7 +50,7 @@ config HAVE_HW_BRANCH_TRACER
 config HAVE_SYSCALL_TRACEPOINTS
        bool
        help
-          See Documentation/trace/ftrace-implementation.txt
+          See Documentation/trace/ftrace-design.txt
 config TRACER_MAX_TRACE
        bool
@@ -83,7 +81,7 @@ config RING_BUFFER_ALLOW_SWAP
 # This allows those options to appear when no other tracer is selected. But the
 # options do not appear when something else selects it. We need the two options
 # GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
-# hidding of the automatic options.
+# hiding of the automatic options.
 config TRACING
        bool
@@ -119,7 +117,7 @@ menuconfig FTRACE
        bool "Tracers"
        default y if DEBUG_KERNEL
        help
-         Enable the kernel tracing infrastructure.
+          Enable the kernel tracing infrastructure.
 if FTRACE
@@ -133,7 +131,7 @@ config FUNCTION_TRACER
        help
          Enable the kernel to trace every kernel function. This is done
          by using a compiler feature to insert a small, 5-byte No-Operation
-          instruction to the beginning of every kernel function, which NOP
+          instruction at the beginning of every kernel function, which NOP
          sequence is then dynamically patched into a tracer call when
          tracing is enabled by the administrator. If it's runtime disabled
          (the bootup default), then the overhead of the instructions is very
@@ -150,7 +148,7 @@ config FUNCTION_GRAPH_TRACER
          and its entry.
          Its first purpose is to trace the duration of functions and
          draw a call graph for each thread with some information like
-          the return value. This is done by setting the current return 
+          the return value. This is done by setting the current return
          address on the current task structure into a stack of calls.
@@ -173,7 +171,7 @@ config IRQSOFF_TRACER
              echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
-          (Note that kernel size and overhead increases with this option
+          (Note that kernel size and overhead increase with this option
          enabled. This option and the preempt-off timing option can be
          used together or separately.)
@@ -186,7 +184,7 @@ config PREEMPT_TRACER
        select TRACER_MAX_TRACE
        select RING_BUFFER_ALLOW_SWAP
        help
-          This option measures the time spent in preemption off critical
+          This option measures the time spent in preemption-off critical
          sections, with microsecond accuracy.
          The default measurement method is a maximum search, which is
@@ -195,7 +193,7 @@ config PREEMPT_TRACER
              echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
-          (Note that kernel size and overhead increases with this option
+          (Note that kernel size and overhead increase with this option
          enabled. This option and the irqs-off timing option can be
          used together or separately.)
@@ -222,7 +220,7 @@ config ENABLE_DEFAULT_TRACERS
        depends on !GENERIC_TRACER
        select TRACING
        help
-          This tracer hooks to various trace points in the kernel
+          This tracer hooks to various trace points in the kernel,
          allowing the user to pick and choose which trace point they
          want to trace. It also includes the sched_switch tracer plugin.
@@ -265,19 +263,19 @@ choice
         The likely/unlikely profiler only looks at the conditions that
         are annotated with a likely or unlikely macro.
-         The "all branch" profiler will profile every if statement in the
+         The "all branch" profiler will profile every if-statement in the
         kernel. This profiler will also enable the likely/unlikely
-         profiler as well.
+         profiler.
-         Either of the above profilers add a bit of overhead to the system.
+         Either of the above profilers adds a bit of overhead to the system.
-         If unsure choose "No branch profiling".
+         If unsure, choose "No branch profiling".
 config BRANCH_PROFILE_NONE
        bool "No branch profiling"
        help
-         No branch profiling. Branch profiling adds a bit of overhead.
+          No branch profiling. Branch profiling adds a bit of overhead.
-         Only enable it if you want to analyse the branching behavior.
+          Only enable it if you want to analyse the branching behavior.
-         Otherwise keep it disabled.
+          Otherwise keep it disabled.
 config PROFILE_ANNOTATED_BRANCHES
        bool "Trace likely/unlikely profiler"
@@ -288,7 +286,7 @@ config PROFILE_ANNOTATED_BRANCHES
          /sys/kernel/debug/tracing/profile_annotated_branch
-          Note: this will add a significant overhead, only turn this
+          Note: this will add a significant overhead; only turn this
          on if you need to profile the system's use of these macros.
 config PROFILE_ALL_BRANCHES
@@ -305,7 +303,7 @@ config PROFILE_ALL_BRANCHES
          This configuration, when enabled, will impose a great overhead
          on the system. This should only be enabled when the system
-          is to be analyzed
+          is to be analyzed in much detail.
 endchoice
 config TRACING_BRANCHES
@@ -330,15 +328,27 @@ config BRANCH_TRACER
          Say N if unsure.
-config POWER_TRACER
+config KSYM_TRACER
-        bool "Trace power consumption behavior"
+        bool "Trace read and write access on kernel memory locations"
-        depends on X86
+        depends on HAVE_HW_BREAKPOINT
-        select GENERIC_TRACER
+        select TRACING
+        help
+          This tracer helps find read and write operations on any given kernel
+          symbol, i.e. any symbol listed in /proc/kallsyms.
+config PROFILE_KSYM_TRACER
+        bool "Profile all kernel memory accesses on 'watched' variables"
+        depends on KSYM_TRACER
        help
-          This tracer helps developers to analyze and optimize the kernels
+          This tracer profiles kernel accesses on variables watched through the
-          power management decisions, specifically the C-state and P-state
+          ksym tracer ftrace plugin. Depending upon the hardware, all read
-          behavior.
+          and write operations on kernel variables can be monitored for
+          accesses.
+          The results will be displayed in:
+          /debugfs/tracing/profile_ksym
+          Say N if unsure.
 config STACK_TRACER
        bool "Trace max stack"
@@ -370,14 +380,14 @@ config HW_BRANCH_TRACER
        select GENERIC_TRACER
        help
          This tracer records all branches on the system in a circular
-          buffer giving access to the last N branches for each cpu.
+          buffer, giving access to the last N branches for each cpu.
 config KMEMTRACE
        bool "Trace SLAB allocations"
        select GENERIC_TRACER
        help
          kmemtrace provides tracing for slab allocator functions, such as
-          kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
+          kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
          data is then fed to the userspace application in order to analyse
          allocation hotspots, internal fragmentation and so on, making it
          possible to see how well an allocator performs, as well as debug
@@ -396,15 +406,15 @@ config WORKQUEUE_TRACER
        bool "Trace workqueues"
        select GENERIC_TRACER
        help
-          The workqueue tracer provides some statistical informations
+          The workqueue tracer provides some statistical information
          about each cpu workqueue thread such as the number of the
          works inserted and executed since their creation. It can help
-          to evaluate the amount of work each of them have to perform.
+          to evaluate the amount of work each of them has to perform.
          For example it can help a developer to decide whether he should
-          choose a per cpu workqueue instead of a singlethreaded one.
+          choose a per-cpu workqueue instead of a singlethreaded one.
 config BLK_DEV_IO_TRACE
-        bool "Support for tracing block io actions"
+        bool "Support for tracing block IO actions"
        depends on SYSFS
        depends on BLOCK
        select RELAY
@@ -428,38 +438,55 @@ config BLK_DEV_IO_TRACE
          If unsure, say N.
+config KPROBE_EVENT
+        depends on KPROBES
+        depends on HAVE_REGS_AND_STACK_ACCESS_API
+        bool "Enable kprobes-based dynamic events"
+        select TRACING
+        default y
+        help
+          This allows the user to add tracing events (similar to tracepoints)
+          on the fly via the ftrace interface. See
+          Documentation/trace/kprobetrace.txt for more details.
+          Those events can be inserted wherever kprobes can probe, and record
+          various register and memory values.
+          This option is also required by the perf-probe subcommand of perf
+          tools. If you want to use perf tools, this option is strongly
+          recommended.
 config DYNAMIC_FTRACE
        bool "enable/disable ftrace tracepoints dynamically"
        depends on FUNCTION_TRACER
        depends on HAVE_DYNAMIC_FTRACE
        default y
        help
-         This option will modify all the calls to ftrace dynamically
+          This option will modify all the calls to ftrace dynamically
-         (will patch them out of the binary image and replaces them
+          (will patch them out of the binary image and replace them
-         with a No-Op instruction) as they are called. A table is
+          with a No-Op instruction) as they are called. A table is
-         created to dynamically enable them again.
+          created to dynamically enable them again.
-         This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise
+          This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
-         has native performance as long as no tracing is active.
+          otherwise has native performance as long as no tracing is active.
-         The changes to the code are done by a kernel thread that
+          The changes to the code are done by a kernel thread that
-         wakes up once a second and checks to see if any ftrace calls
+          wakes up once a second and checks to see if any ftrace calls
-         were made. If so, it runs stop_machine (stops all CPUS)
+          were made. If so, it runs stop_machine (stops all CPUS)
-         and modifies the code to jump over the call to ftrace.
+          and modifies the code to jump over the call to ftrace.
 config FUNCTION_PROFILER
        bool "Kernel function profiler"
        depends on FUNCTION_TRACER
        default n
        help
-         This option enables the kernel function profiler. A file is created
+          This option enables the kernel function profiler. A file is created
-         in debugfs called function_profile_enabled which defaults to zero.
+          in debugfs called function_profile_enabled which defaults to zero.
-         When a 1 is echoed into this file profiling begins, and when a
+          When a 1 is echoed into this file profiling begins, and when a
-         zero is entered, profiling stops. A file in the trace_stats
+          zero is entered, profiling stops. A "functions" file is created in
-         directory called functions, that show the list of functions that
+          the trace_stats directory; this file shows the list of functions that
-         have been hit and their counters.
+          have been hit and their counters.
-         If in doubt, say N
+          If in doubt, say N.
 config FTRACE_MCOUNT_RECORD
        def_bool y
@@ -518,8 +545,8 @@ config RING_BUFFER_BENCHMARK
        tristate "Ring buffer benchmark stress tester"
        depends on RING_BUFFER
        help
-          This option creates a test to stress the ring buffer and bench mark it.
+          This option creates a test to stress the ring buffer and benchmark it.
-          It creates its own ring buffer such that it will not interfer with
+          It creates its own ring buffer such that it will not interfere with
          any other users of the ring buffer (such as ftrace). It then creates
          a producer and consumer that will run for 10 seconds and sleep for
          10 seconds. Each interval it will print out the number of events
@@ -528,7 +555,7 @@ config RING_BUFFER_BENCHMARK
          It does not disable interrupts or raise its priority, so it may be
          affected by processes that are running.
-          If unsure, say N
+          If unsure, say N.
 endif # FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 26f03ac07c2b..78edc6490038 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -51,8 +51,12 @@ endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
-obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
+ifeq ($(CONFIG_PERF_EVENTS),y)
+obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
+endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
+obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
+obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
 obj-$(CONFIG_EVENT_TRACING) += power-traces.o
 libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d9d6206e0b14..b3bc91a3f510 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -21,6 +21,7 @@
 #include <linux/percpu.h>
 #include <linux/init.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include <linux/debugfs.h>
 #include <linux/smp_lock.h>
 #include <linux/time.h>
@@ -540,9 +541,10 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
        if (ret)
                return ret;
-        if (copy_to_user(arg, &buts, sizeof(buts)))
+        if (copy_to_user(arg, &buts, sizeof(buts))) {
+                blk_trace_remove(q);
                return -EFAULT;
+        }
        return 0;
 }
 EXPORT_SYMBOL_GPL(blk_trace_setup);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6dc4e5ef7a01..2404b59b3097 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,12 +22,13 @@
 #include <linux/hardirq.h>
 #include <linux/kthread.h>
 #include <linux/uaccess.h>
-#include <linux/kprobes.h>
 #include <linux/ftrace.h>
 #include <linux/sysctl.h>
+#include <linux/slab.h>
 #include <linux/ctype.h>
 #include <linux/list.h>
 #include <linux/hash.h>
+#include <linux/rcupdate.h>
 #include <trace/events/sched.h>
@@ -60,6 +61,13 @@ static int last_ftrace_enabled;
 /* Quick disabling of function tracer. */
 int function_trace_stop;
+/* List for set_ftrace_pid's pids. */
+LIST_HEAD(ftrace_pids);
+struct ftrace_pid {
+        struct list_head list;
+        struct pid *pid;
+};
 /*
 * ftrace_disabled is set when an anomaly is discovered.
 * ftrace_disabled is much stronger than ftrace_enabled.
@@ -78,18 +86,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
 ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
 ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
+/*
+ * Traverse the ftrace_list, invoking all entries.  The reason that we
+ * can use rcu_dereference_raw() is that elements removed from this list
+ * are simply leaked, so there is no need to interact with a grace-period
+ * mechanism.  The rcu_dereference_raw() calls are needed to handle
+ * concurrent insertions into the ftrace_list.
+ *
+ * Silly Alpha and silly pointer-speculation compiler optimizations!
+ */
 static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 {
-        struct ftrace_ops *op = ftrace_list;
+        struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/
-        /* in case someone actually ports this to alpha! */
-        read_barrier_depends();
        while (op != &ftrace_list_end) {
-                /* silly alpha */
-                read_barrier_depends();
                op->func(ip, parent_ip);
-                op = op->next;
+                op = rcu_dereference_raw(op->next); /*see above*/
        };
 }
@@ -144,8 +156,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
         * the ops->next pointer is valid before another CPU sees
         * the ops pointer included into the ftrace_list.
         */
-        smp_wmb();
+        rcu_assign_pointer(ftrace_list, ops);
-        ftrace_list = ops;
        if (ftrace_enabled) {
                ftrace_func_t func;
@@ -155,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
                else
                        func = ftrace_list_func;
-                if (ftrace_pid_trace) {
+                if (!list_empty(&ftrace_pids)) {
                        set_ftrace_pid_function(func);
                        func = ftrace_pid_func;
                }
@@ -203,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
                if (ftrace_list->next == &ftrace_list_end) {
                        ftrace_func_t func = ftrace_list->func;
-                        if (ftrace_pid_trace) {
+                        if (!list_empty(&ftrace_pids)) {
                                set_ftrace_pid_function(func);
                                func = ftrace_pid_func;
                        }
@@ -231,7 +242,7 @@ static void ftrace_update_pid_func(void)
        func = __ftrace_trace_function;
 #endif
-        if (ftrace_pid_trace) {
+        if (!list_empty(&ftrace_pids)) {
                set_ftrace_pid_function(func);
                func = ftrace_pid_func;
        } else {
@@ -821,8 +832,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
 }
 #endif /* CONFIG_FUNCTION_PROFILER */
-/* set when tracing only a pid */
-struct pid *ftrace_pid_trace;
 static struct pid * const ftrace_swapper_pid = &init_struct_pid;
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -889,36 +898,6 @@ static struct dyn_ftrace *ftrace_free_records;
                }                               \
        }
-#ifdef CONFIG_KPROBES
-static int frozen_record_count;
-static inline void freeze_record(struct dyn_ftrace *rec)
-{
-        if (!(rec->flags & FTRACE_FL_FROZEN)) {
-                rec->flags |= FTRACE_FL_FROZEN;
-                frozen_record_count++;
-        }
-}
-static inline void unfreeze_record(struct dyn_ftrace *rec)
-{
-        if (rec->flags & FTRACE_FL_FROZEN) {
-                rec->flags &= ~FTRACE_FL_FROZEN;
-                frozen_record_count--;
-        }
-}
-static inline int record_frozen(struct dyn_ftrace *rec)
-{
-        return rec->flags & FTRACE_FL_FROZEN;
-}
-#else
-# define freeze_record(rec)                     ({ 0; })
-# define unfreeze_record(rec)                   ({ 0; })
-# define record_frozen(rec)                     ({ 0; })
-#endif /* CONFIG_KPROBES */
 static void ftrace_free_rec(struct dyn_ftrace *rec)
 {
        rec->freelist = ftrace_free_records;
@@ -1016,6 +995,21 @@ static void ftrace_bug(int failed, unsigned long ip)
 }
+/* Return 1 if the address range is reserved for ftrace */
+int ftrace_text_reserved(void *start, void *end)
+{
+        struct dyn_ftrace *rec;
+        struct ftrace_page *pg;
+        do_for_each_ftrace_rec(pg, rec) {
+                if (rec->ip <= (unsigned long)end &&
+                    rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
+                        return 1;
+        } while_for_each_ftrace_rec();
+        return 0;
+}
 static int
 __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 {
@@ -1067,14 +1061,6 @@ static void ftrace_replace_code(int enable)
                    !(rec->flags & FTRACE_FL_CONVERTED))
                        continue;
-                /* ignore updates to this record's mcount site */
-                if (get_kprobe((void *)rec->ip)) {
-                        freeze_record(rec);
-                        continue;
-                } else {
-                        unfreeze_record(rec);
-                }
                failed = __ftrace_replace_code(rec, enable);
                if (failed) {
                        rec->flags |= FTRACE_FL_FAILED;
@@ -1261,12 +1247,34 @@ static int ftrace_update_code(struct module *mod)
                ftrace_new_addrs = p->newlist;
                p->flags = 0L;
-                /* convert record (i.e, patch mcount-call with NOP) */
+                /*
-                if (ftrace_code_disable(mod, p)) {
+                 * Do the initial record conversion from mcount jump
-                        p->flags |= FTRACE_FL_CONVERTED;
+                 * to the NOP instructions.
-                        ftrace_update_cnt++;
+                 */
-                } else
+                if (!ftrace_code_disable(mod, p)) {
                        ftrace_free_rec(p);
+                        continue;
+                }
+                p->flags |= FTRACE_FL_CONVERTED;
+                ftrace_update_cnt++;
+                /*
+                 * If the tracing is enabled, go ahead and enable the record.
+                 *
+                 * The reason not to enable the record immediately is the
+                 * inherent check of ftrace_make_nop/ftrace_make_call for
+                 * correct previous instructions.  Making first the NOP
+                 * conversion puts the module to the correct state, thus
+                 * passing the ftrace_make_call check.
+                 */
+                if (ftrace_start_up) {
+                        int failed = __ftrace_replace_code(p, 1);
+                        if (failed) {
+                                ftrace_bug(failed, p->ip);
+                                ftrace_free_rec(p);
+                        }
+                }
        }
        stop = ftrace_now(raw_smp_processor_id());
@@ -1656,64 +1664,10 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
        return ret;
 }
-enum {
-        MATCH_FULL,
-        MATCH_FRONT_ONLY,
-        MATCH_MIDDLE_ONLY,
-        MATCH_END_ONLY,
-};
-/*
- * (static function - no need for kernel doc)
- *
- * Pass in a buffer containing a glob and this function will
- * set search to point to the search part of the buffer and
- * return the type of search it is (see enum above).
- * This does modify buff.
- *
- * Returns enum type.
- *  search returns the pointer to use for comparison.
- *  not returns 1 if buff started with a '!'
- *     0 otherwise.
- */
-static int
-ftrace_setup_glob(char *buff, int len, char **search, int *not)
-{
-        int type = MATCH_FULL;
-        int i;
-        if (buff[0] == '!') {
-                *not = 1;
-                buff++;
-                len--;
-        } else
-                *not = 0;
-        *search = buff;
-        for (i = 0; i < len; i++) {
-                if (buff[i] == '*') {
-                        if (!i) {
-                                *search = buff + 1;
-                                type = MATCH_END_ONLY;
-                        } else {
-                                if (type == MATCH_END_ONLY)
-                                        type = MATCH_MIDDLE_ONLY;
-                                else
-                                        type = MATCH_FRONT_ONLY;
-                                buff[i] = 0;
-                                break;
-                        }
-                }
-        }
-        return type;
-}
 static int ftrace_match(char *str, char *regex, int len, int type)
 {
        int matched = 0;
-        char *ptr;
+        int slen;
        switch (type) {
        case MATCH_FULL:
@@ -1729,8 +1683,8 @@ static int ftrace_match(char *str, char *regex, int len, int type)
                        matched = 1;
                break;
        case MATCH_END_ONLY:
-                ptr = strstr(str, regex);
+                slen = strlen(str);
-                if (ptr && (ptr[len] == 0))
+                if (slen >= len && memcmp(str + slen - len, regex, len) == 0)
                        matched = 1;
                break;
        }
@@ -1747,7 +1701,7 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
        return ftrace_match(str, regex, len, type);
 }
-static void ftrace_match_records(char *buff, int len, int enable)
+static int ftrace_match_records(char *buff, int len, int enable)
 {
        unsigned int search_len;
        struct ftrace_page *pg;
@@ -1756,9 +1710,10 @@ static void ftrace_match_records(char *buff, int len, int enable)
        char *search;
        int type;
        int not;
+        int found = 0;
        flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
-        type = ftrace_setup_glob(buff, len, &search, &not);
+        type = filter_parse_regex(buff, len, &search, &not);
        search_len = strlen(search);
@@ -1773,6 +1728,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
                                rec->flags &= ~flag;
                        else
                                rec->flags |= flag;
+                        found = 1;
                }
                /*
                 * Only enable filtering if we have a function that
@@ -1782,6 +1738,8 @@ static void ftrace_match_records(char *buff, int len, int enable)
                        ftrace_filtered = 1;
        } while_for_each_ftrace_rec();
        mutex_unlock(&ftrace_lock);
+        return found;
 }
 static int
@@ -1803,7 +1761,7 @@ ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
                return 1;
 }
-static void ftrace_match_module_records(char *buff, char *mod, int enable)
+static int ftrace_match_module_records(char *buff, char *mod, int enable)
 {
        unsigned search_len = 0;
        struct ftrace_page *pg;
@@ -1812,6 +1770,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
        char *search = buff;
        unsigned long flag;
        int not = 0;
+        int found = 0;
        flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
@@ -1826,7 +1785,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
        }
        if (strlen(buff)) {
-                type = ftrace_setup_glob(buff, strlen(buff), &search, &not);
+                type = filter_parse_regex(buff, strlen(buff), &search, &not);
                search_len = strlen(search);
        }
@@ -1842,12 +1801,15 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
                                rec->flags &= ~flag;
                        else
                                rec->flags |= flag;
+                        found = 1;
                }
                if (enable && (rec->flags & FTRACE_FL_FILTER))
                        ftrace_filtered = 1;
        } while_for_each_ftrace_rec();
        mutex_unlock(&ftrace_lock);
+        return found;
 }
 /*
@@ -1876,8 +1838,9 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
        if (!strlen(mod))
                return -EINVAL;
-        ftrace_match_module_records(func, mod, enable);
+        if (ftrace_match_module_records(func, mod, enable))
-        return 0;
+                return 0;
+        return -EINVAL;
 }
 static struct ftrace_func_command ftrace_mod_cmd = {
@@ -1991,7 +1954,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
        int count = 0;
        char *search;
-        type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
+        type = filter_parse_regex(glob, strlen(glob), &search, &not);
        len = strlen(search);
        /* we do not support '!' for function probes */
@@ -2068,7 +2031,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
        else if (glob) {
                int not;
-                type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
+                type = filter_parse_regex(glob, strlen(glob), &search, &not);
                len = strlen(search);
                /* we do not support '!' for function probes */
@@ -2174,8 +2137,9 @@ static int ftrace_process_regex(char *buff, int len, int enable)
        func = strsep(&next, ":");
        if (!next) {
-                ftrace_match_records(func, len, enable);
+                if (ftrace_match_records(func, len, enable))
-                return 0;
+                        return 0;
+                return ret;
        }
        /* command found */
@@ -2221,10 +2185,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
            !trace_parser_cont(parser)) {
                ret = ftrace_process_regex(parser->buffer,
                                           parser->idx, enable);
+                trace_parser_clear(parser);
                if (ret)
                        goto out_unlock;
-                trace_parser_clear(parser);
        }
        ret = read;
@@ -2312,6 +2275,34 @@ static int __init set_ftrace_filter(char *str)
 }
 __setup("ftrace_filter=", set_ftrace_filter);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
+static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
+static int __init set_graph_function(char *str)
+{
+        strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
+        return 1;
+}
+__setup("ftrace_graph_filter=", set_graph_function);
+static void __init set_ftrace_early_graph(char *buf)
+{
+        int ret;
+        char *func;
+        while (buf) {
+                func = strsep(&buf, ",");
+                /* we allow only one expression at a time */
+                ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
+                                      func);
+                if (ret)
+                        printk(KERN_DEBUG "ftrace: function %s not "
+                                          "traceable\n", func);
+        }
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 static void __init set_ftrace_early_filter(char *buf, int enable)
 {
        char *func;
@@ -2328,6 +2319,10 @@ static void __init set_ftrace_early_filters(void)
                set_ftrace_early_filter(ftrace_filter_buf, 1);
        if (ftrace_notrace_buf[0])
                set_ftrace_early_filter(ftrace_notrace_buf, 0);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+        if (ftrace_graph_buf[0])
+                set_ftrace_early_graph(ftrace_graph_buf);
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 }
 static int
@@ -2410,6 +2405,7 @@ static const struct file_operations ftrace_notrace_fops = {
 static DEFINE_MUTEX(graph_lock);
 int ftrace_graph_count;
+int ftrace_graph_filter_enabled;
 unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
 static void *
@@ -2432,7 +2428,7 @@ static void *g_start(struct seq_file *m, loff_t *pos)
        mutex_lock(&graph_lock);
        /* Nothing, tell g_show to print all functions are enabled */
-        if (!ftrace_graph_count && !*pos)
+        if (!ftrace_graph_filter_enabled && !*pos)
                return (void *)1;
        return __g_next(m, pos);
@@ -2478,6 +2474,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
        mutex_lock(&graph_lock);
        if ((file->f_mode & FMODE_WRITE) &&
            (file->f_flags & O_TRUNC)) {
+                ftrace_graph_filter_enabled = 0;
                ftrace_graph_count = 0;
                memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
        }
@@ -2503,7 +2500,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
        struct dyn_ftrace *rec;
        struct ftrace_page *pg;
        int search_len;
-        int found = 0;
+        int fail = 1;
        int type, not;
        char *search;
        bool exists;
@@ -2513,39 +2510,52 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
                return -ENODEV;
        /* decode regex */
-        type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not);
+        type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
-        if (not)
+        if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
-                return -EINVAL;
+                return -EBUSY;
        search_len = strlen(search);
        mutex_lock(&ftrace_lock);
        do_for_each_ftrace_rec(pg, rec) {
-                if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
-                        break;
                if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
                        continue;
                if (ftrace_match_record(rec, search, search_len, type)) {
-                        /* ensure it is not already in the array */
+                        /* if it is in the array */
                        exists = false;
-                        for (i = 0; i < *idx; i++)
+                        for (i = 0; i < *idx; i++) {
                                if (array[i] == rec->ip) {
                                        exists = true;
                                        break;
                                }
-                        if (!exists) {
+                        }
-                                array[(*idx)++] = rec->ip;
-                                found = 1;
+                        if (!not) {
+                                fail = 0;
+                                if (!exists) {
+                                        array[(*idx)++] = rec->ip;
+                                        if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
+                                                goto out;
+                                }
+                        } else {
+                                if (exists) {
+                                        array[i] = array[--(*idx)];
+                                        array[*idx] = 0;
+                                        fail = 0;
+                                }
                        }
                }
        } while_for_each_ftrace_rec();
+out:
        mutex_unlock(&ftrace_lock);
-        return found ? 0 : -EINVAL;
+        if (fail)
+                return -EINVAL;
+        ftrace_graph_filter_enabled = 1;
+        return 0;
 }
 static ssize_t
@@ -2555,16 +2565,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
        struct trace_parser parser;
        ssize_t read, ret;
-        if (!cnt || cnt < 0)
+        if (!cnt)
                return 0;
        mutex_lock(&graph_lock);
-        if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
-                ret = -EBUSY;
-                goto out_unlock;
-        }
        if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
                ret = -ENOMEM;
                goto out_unlock;
@@ -2624,7 +2629,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
        return 0;
 }
-static int ftrace_convert_nops(struct module *mod,
+static int ftrace_process_locs(struct module *mod,
                               unsigned long *start,
                               unsigned long *end)
 {
@@ -2684,7 +2689,7 @@ static void ftrace_init_module(struct module *mod,
 {
        if (ftrace_disabled || start == end)
                return;
-        ftrace_convert_nops(mod, start, end);
+        ftrace_process_locs(mod, start, end);
 }
 static int ftrace_module_notify(struct notifier_block *self,
@@ -2745,7 +2750,7 @@ void __init ftrace_init(void)
        last_ftrace_enabled = ftrace_enabled = 1;
-        ret = ftrace_convert_nops(NULL,
+        ret = ftrace_process_locs(NULL,
                                  __start_mcount_loc,
                                  __stop_mcount_loc);
@@ -2778,23 +2783,6 @@ static inline void ftrace_startup_enable(int command) { }
 # define ftrace_shutdown_sysctl()       do { } while (0)
 #endif /* CONFIG_DYNAMIC_FTRACE */
-static ssize_t
-ftrace_pid_read(struct file *file, char __user *ubuf,
-                       size_t cnt, loff_t *ppos)
-{
-        char buf[64];
-        int r;
-        if (ftrace_pid_trace == ftrace_swapper_pid)
-                r = sprintf(buf, "swapper tasks\n");
-        else if (ftrace_pid_trace)
-                r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace));
-        else
-                r = sprintf(buf, "no pid\n");
-        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
 static void clear_ftrace_swapper(void)
 {
        struct task_struct *p;
@@ -2845,14 +2833,12 @@ static void set_ftrace_pid(struct pid *pid)
        rcu_read_unlock();
 }
-static void clear_ftrace_pid_task(struct pid **pid)
+static void clear_ftrace_pid_task(struct pid *pid)
 {
-        if (*pid == ftrace_swapper_pid)
+        if (pid == ftrace_swapper_pid)
                clear_ftrace_swapper();
        else
-                clear_ftrace_pid(*pid);
+                clear_ftrace_pid(pid);
-        *pid = NULL;
 }
 static void set_ftrace_pid_task(struct pid *pid)
@@ -2863,74 +2849,184 @@ static void set_ftrace_pid_task(struct pid *pid)
                set_ftrace_pid(pid);
 }
-static ssize_t
+/*
+ * Add pid number @p to the list of traced pids (ftrace_pids).
+ *
+ * @p == 0 selects ftrace_swapper_pid; any other value is looked up with
+ * find_get_pid().  The whole operation runs under ftrace_lock.
+ *
+ * Returns 0 on success or when the pid is already on the list (the
+ * reference just taken is dropped again in that case), -EINVAL when no
+ * pid could be found for @p, and -ENOMEM when the list entry cannot be
+ * allocated.
+ */
+static int ftrace_pid_add(int p)
-ftrace_pid_write(struct file *filp, const char __user *ubuf,
-                   size_t cnt, loff_t *ppos)
 {
        struct pid *pid;
-        char buf[64];
+        struct ftrace_pid *fpid;
-        long val;
+        int ret = -EINVAL;
-        int ret;
-        if (cnt >= sizeof(buf))
+        mutex_lock(&ftrace_lock);
-                return -EINVAL;
-        if (copy_from_user(&buf, ubuf, cnt))
+        /* pid number 0 stands for the swapper tasks */
+        if (!p)
-                return -EFAULT;
+                pid = ftrace_swapper_pid;
+        else
+                pid = find_get_pid(p);
-        buf[cnt] = 0;
+        if (!pid)
+                goto out;
-        ret = strict_strtol(buf, 10, &val);
+        ret = 0;
-        if (ret < 0)
-                return ret;
-        mutex_lock(&ftrace_lock);
+        /* Already being traced: nothing to add, report success. */
+        list_for_each_entry(fpid, &ftrace_pids, list)
-        if (val < 0) {
+                if (fpid->pid == pid)
-                /* disable pid tracing */
+                        goto out_put;
-                if (!ftrace_pid_trace)
-                        goto out;
-                clear_ftrace_pid_task(&ftrace_pid_trace);
+        ret = -ENOMEM;
-        } else {
+        fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
-                /* swapper task is special */
+        if (!fpid)
-                if (!val) {
+                goto out_put;
-                        pid = ftrace_swapper_pid;
-                        if (pid == ftrace_pid_trace)
-                                goto out;
-                } else {
-                        pid = find_get_pid(val);
-                        if (pid == ftrace_pid_trace) {
+        list_add(&fpid->list, &ftrace_pids);
-                                put_pid(pid);
+        fpid->pid = pid;
-                                goto out;
-                        }
-                }
-                if (ftrace_pid_trace)
+        set_ftrace_pid_task(pid);
-                        clear_ftrace_pid_task(&ftrace_pid_trace);
-                if (!pid)
+        ftrace_update_pid_func();
-                        goto out;
+        ftrace_startup_enable(0);
+        mutex_unlock(&ftrace_lock);
+        return 0;
+out_put:
+        /* ftrace_swapper_pid was not obtained via find_get_pid(): no put */
+        if (pid != ftrace_swapper_pid)
+                put_pid(pid);
+out:
+        mutex_unlock(&ftrace_lock);
+        return ret;
+}
+/*
+ * Empty the list of traced pids: under ftrace_lock, every entry on
+ * ftrace_pids is passed to clear_ftrace_pid_task(), unlinked and freed,
+ * then the pid function is updated and ftrace_startup_enable(0) is called.
+ */
+static void ftrace_pid_reset(void)
+{
+        struct ftrace_pid *fpid, *safe;
+        mutex_lock(&ftrace_lock);
+        /* _safe variant: entries are deleted while walking the list */
+        list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
+                struct pid *pid = fpid->pid;
-                ftrace_pid_trace = pid;
+                clear_ftrace_pid_task(pid);
-                set_ftrace_pid_task(ftrace_pid_trace);
+                list_del(&fpid->list);
+                kfree(fpid);
        }
-        /* update the function call */
        ftrace_update_pid_func();
        ftrace_startup_enable(0);
- out:
        mutex_unlock(&ftrace_lock);
+}
-        return cnt;
+/*
+ * seq_file ->start: take ftrace_lock (released again in fpid_stop()) and
+ * position the iterator at *pos in ftrace_pids.  When the list is empty
+ * and reading starts at offset 0, the sentinel (void *)1 is returned so
+ * that fpid_show() can print "no pid".
+ */
+static void *fpid_start(struct seq_file *m, loff_t *pos)
+{
+        mutex_lock(&ftrace_lock);
+        if (*pos == 0 && list_empty(&ftrace_pids))
+                return (void *)1;
+        return seq_list_start(&ftrace_pids, *pos);
+}
+/*
+ * seq_file ->next: advance to the following ftrace_pids entry.  The
+ * "no pid" sentinel (void *)1 is a lone record, so iteration ends there.
+ */
+static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
+{
+        return (v == (void *)1) ? NULL : seq_list_next(v, &ftrace_pids, pos);
+}
+/* seq_file ->stop: release ftrace_lock, which fpid_start() acquired. */
+static void fpid_stop(struct seq_file *m, void *p)
+{
+        mutex_unlock(&ftrace_lock);
+}
+/*
+ * seq_file ->show: print one record of set_ftrace_pid.
+ *
+ * @v is either the sentinel (void *)1 handed out by fpid_start() for an
+ * empty list ("no pid"), or the list_head embedded in a struct ftrace_pid.
+ * The swapper pid is printed as "swapper tasks", any other pid as its
+ * pid_vnr() number.  Always returns 0.
+ */
+static int fpid_show(struct seq_file *m, void *v)
+{
+        const struct ftrace_pid *fpid;
+        if (v == (void *)1) {
+                seq_printf(m, "no pid\n");
+                return 0;
+        }
+        /* Only a real list node may be converted to its container. */
+        fpid = list_entry(v, struct ftrace_pid, list);
+        if (fpid->pid == ftrace_swapper_pid)
+                seq_printf(m, "swapper tasks\n");
+        else
+                seq_printf(m, "%u\n", pid_vnr(fpid->pid));
+        return 0;
+}
+/* seq_file iterator over ftrace_pids; installed by ftrace_pid_open(). */
+static const struct seq_operations ftrace_pid_sops = {
+        .start = fpid_start,
+        .next = fpid_next,
+        .stop = fpid_stop,
+        .show = fpid_show,
+};
+/*
+ * open() handler: a writer that opens with O_TRUNC clears the whole pid
+ * list first; a reader gets a seq_file bound to ftrace_pid_sops.
+ * Returns the seq_open() result for readers and 0 otherwise.
+ */
+static int
+ftrace_pid_open(struct inode *inode, struct file *file)
+{
+        if ((file->f_flags & O_TRUNC) && (file->f_mode & FMODE_WRITE))
+                ftrace_pid_reset();
+        if (!(file->f_mode & FMODE_READ))
+                return 0;
+        return seq_open(file, &ftrace_pid_sops);
+}
+/*
+ * write() handler: parse one decimal pid number from user space and add
+ * it to the traced pids via ftrace_pid_add().
+ *
+ * Returns @cnt on success, -EINVAL if the input does not fit the local
+ * buffer, -EFAULT if it cannot be copied, the strict_strtol() error for
+ * malformed numbers, or the ftrace_pid_add() error.  Input that is empty
+ * after stripping whitespace is accepted and reported as 1 byte written.
+ */
+static ssize_t
+ftrace_pid_write(struct file *filp, const char __user *ubuf,
+                   size_t cnt, loff_t *ppos)
+{
+        char kbuf[64];
+        char *text;
+        long pid_nr;
+        int err;
+        if (cnt >= sizeof(kbuf))
+                return -EINVAL;
+        if (copy_from_user(kbuf, ubuf, cnt))
+                return -EFAULT;
+        kbuf[cnt] = '\0';
+        /*
+         * "echo > set_ftrace_pid" and "echo -n '' > set_ftrace_pid" carry
+         * no pid: the O_TRUNC open already cleared the filter, so just
+         * accept the write quietly.
+         */
+        text = strstrip(kbuf);
+        if (!*text)
+                return 1;
+        err = strict_strtol(text, 10, &pid_nr);
+        if (err < 0)
+                return err;
+        err = ftrace_pid_add(pid_nr);
+        if (err)
+                return err;
+        return cnt;
+}
+/*
+ * release() handler: seq_open() was only called for readers (see
+ * ftrace_pid_open()), so only readers have a seq_file to release.
+ */
+static int
+ftrace_pid_release(struct inode *inode, struct file *file)
+{
+        if (file->f_mode & FMODE_READ)
+                seq_release(inode, file);
+        return 0;
 }
+/* File operations for the pid filter file: reads go through seq_file. */
 static const struct file_operations ftrace_pid_fops = {
-        .read = ftrace_pid_read,
+        .open           = ftrace_pid_open,
-        .write = ftrace_pid_write,
+        .write          = ftrace_pid_write,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = ftrace_pid_release,
 };
 static __init int ftrace_init_debugfs(void)
@@ -3258,6 +3354,7 @@ void ftrace_graph_init_task(struct task_struct *t)
 {
        /* Make sure we do not use the parent ret_stack */
        t->ret_stack = NULL;
+        t->curr_ret_stack = -1;
        if (ftrace_graph_active) {
                struct ftrace_ret_stack *ret_stack;
@@ -3267,7 +3364,6 @@ void ftrace_graph_init_task(struct task_struct *t)
                                GFP_KERNEL);
                if (!ret_stack)
                        return;
-                t->curr_ret_stack = -1;
                atomic_set(&t->tracing_graph_pause, 0);
                atomic_set(&t->trace_overrun, 0);
                t->ftrace_timestamp = 0;
@@ -3293,4 +3389,3 @@ void ftrace_graph_stop(void)
        ftrace_stop();
 }
 #endif
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index e06c6e3d56a3..a22582a06161 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -9,12 +9,9 @@
 #include <linux/workqueue.h>
 #include <linux/sched.h>
 #include <linux/module.h>
-#include <linux/slab.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/power.h>
-EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
-EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
 EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5dd017fea6f5..41ca394feb22 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -14,12 +14,14 @@
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/list.h>
 #include <linux/cpu.h>
 #include <linux/fs.h>
+#include <asm/local.h>
 #include "trace.h"
 /*
@@ -206,6 +208,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
 #define RB_MAX_SMALL_DATA       (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 #define RB_EVNT_MIN_SIZE        8U      /* two 32bit words */
+/*
+ * On 64-bit builds without CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the
+ * event layout is forced to 8-byte alignment: with
+ * RB_FORCE_8BYTE_ALIGNMENT set, the length is always stored in array[0]
+ * and event lengths are rounded up to RB_ARCH_ALIGNMENT (8) instead of
+ * RB_ALIGNMENT.  All other builds keep the RB_ALIGNMENT behaviour.
+ */
+#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+# define RB_FORCE_8BYTE_ALIGNMENT       0
+# define RB_ARCH_ALIGNMENT              RB_ALIGNMENT
+#else
+# define RB_FORCE_8BYTE_ALIGNMENT       1
+# define RB_ARCH_ALIGNMENT              8U
+#endif
 /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
 #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -397,18 +407,21 @@ int ring_buffer_print_page_header(struct trace_seq *s)
        int ret;
        ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
-                               "offset:0;\tsize:%u;\n",
+                               "offset:0;\tsize:%u;\tsigned:%u;\n",
-                               (unsigned int)sizeof(field.time_stamp));
+                               (unsigned int)sizeof(field.time_stamp),
+                               (unsigned int)is_signed_type(u64));
        ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
-                               "offset:%u;\tsize:%u;\n",
+                               "offset:%u;\tsize:%u;\tsigned:%u;\n",
                               (unsigned int)offsetof(typeof(field), commit),
-                               (unsigned int)sizeof(field.commit));
+                               (unsigned int)sizeof(field.commit),
+                               (unsigned int)is_signed_type(long));
        ret = trace_seq_printf(s, "\tfield: char data;\t"
-                               "offset:%u;\tsize:%u;\n",
+                               "offset:%u;\tsize:%u;\tsigned:%u;\n",
                               (unsigned int)offsetof(typeof(field), data),
-                               (unsigned int)BUF_PAGE_SIZE);
+                               (unsigned int)BUF_PAGE_SIZE,
+                               (unsigned int)is_signed_type(char));
        return ret;
 }
@@ -420,7 +433,7 @@ struct ring_buffer_per_cpu {
        int                             cpu;
        struct ring_buffer              *buffer;
        spinlock_t                      reader_lock;    /* serialize readers */
-        raw_spinlock_t                  lock;
+        arch_spinlock_t                 lock;
        struct lock_class_key           lock_key;
        struct list_head                *pages;
        struct buffer_page              *head_page;     /* read from head */
@@ -461,6 +474,8 @@ struct ring_buffer_iter {
        struct ring_buffer_per_cpu      *cpu_buffer;
        unsigned long                   head;
        struct buffer_page              *head_page;
+        struct buffer_page              *cache_reader_page;
+        unsigned long                   cache_read;
        u64                             read_stamp;
 };
@@ -995,7 +1010,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
        cpu_buffer->buffer = buffer;
        spin_lock_init(&cpu_buffer->reader_lock);
        lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
-        cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+        cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
        bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
                            GFP_KERNEL, cpu_to_node(cpu));
@@ -1190,30 +1205,25 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
        struct list_head *p;
        unsigned i;
-        atomic_inc(&cpu_buffer->record_disabled);
-        synchronize_sched();
        spin_lock_irq(&cpu_buffer->reader_lock);
        rb_head_page_deactivate(cpu_buffer);
        for (i = 0; i < nr_pages; i++) {
                if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
-                        return;
+                        goto out;
                p = cpu_buffer->pages->next;
                bpage = list_entry(p, struct buffer_page, list);
                list_del_init(&bpage->list);
                free_buffer_page(bpage);
        }
        if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
-                return;
+                goto out;
        rb_reset_cpu(cpu_buffer);
-        spin_unlock_irq(&cpu_buffer->reader_lock);
        rb_check_pages(cpu_buffer);
-        atomic_dec(&cpu_buffer->record_disabled);
+out:
+        spin_unlock_irq(&cpu_buffer->reader_lock);
 }
 static void
@@ -1224,26 +1234,22 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
        struct list_head *p;
        unsigned i;
-        atomic_inc(&cpu_buffer->record_disabled);
-        synchronize_sched();
        spin_lock_irq(&cpu_buffer->reader_lock);
        rb_head_page_deactivate(cpu_buffer);
        for (i = 0; i < nr_pages; i++) {
                if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
-                        return;
+                        goto out;
                p = pages->next;
                bpage = list_entry(p, struct buffer_page, list);
                list_del_init(&bpage->list);
                list_add_tail(&bpage->list, cpu_buffer->pages);
        }
        rb_reset_cpu(cpu_buffer);
-        spin_unlock_irq(&cpu_buffer->reader_lock);
        rb_check_pages(cpu_buffer);
-        atomic_dec(&cpu_buffer->record_disabled);
+out:
+        spin_unlock_irq(&cpu_buffer->reader_lock);
 }
 /**
@@ -1251,11 +1257,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
 * @buffer: the buffer to resize.
 * @size: the new size.
 *
- * The tracer is responsible for making sure that the buffer is
- * not being used while changing the size.
- * Note: We may be able to change the above requirement by using
- *  RCU synchronizations.
- *
 * Minimum size is 2 * BUF_PAGE_SIZE.
 *
 * Returns -1 on failure.
@@ -1287,6 +1288,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
        if (size == buffer_size)
                return size;
+        atomic_inc(&buffer->record_disabled);
+        /* Make sure all writers are done with this buffer. */
+        synchronize_sched();
        mutex_lock(&buffer->mutex);
        get_online_cpus();
@@ -1349,6 +1355,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
        put_online_cpus();
        mutex_unlock(&buffer->mutex);
+        atomic_dec(&buffer->record_disabled);
        return size;
 free_pages:
@@ -1358,6 +1366,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
        }
        put_online_cpus();
        mutex_unlock(&buffer->mutex);
+        atomic_dec(&buffer->record_disabled);
        return -ENOMEM;
        /*
@@ -1367,6 +1376,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 out_fail:
        put_online_cpus();
        mutex_unlock(&buffer->mutex);
+        atomic_dec(&buffer->record_disabled);
        return -1;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_resize);
@@ -1548,7 +1558,7 @@ rb_update_event(struct ring_buffer_event *event,
        case 0:
                length -= RB_EVNT_HDR_SIZE;
-                if (length > RB_MAX_SMALL_DATA)
+                if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
                        event->array[0] = length;
                else
                        event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
@@ -1723,11 +1733,11 @@ static unsigned rb_calculate_event_length(unsigned length)
        if (!length)
                length = 1;
-        if (length > RB_MAX_SMALL_DATA)
+        if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
                length += sizeof(event.array[0]);
        length += RB_EVNT_HDR_SIZE;
-        length = ALIGN(length, RB_ALIGNMENT);
+        length = ALIGN(length, RB_ARCH_ALIGNMENT);
        return length;
 }
@@ -1787,9 +1797,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 static struct ring_buffer_event *
 rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
             unsigned long length, unsigned long tail,
-             struct buffer_page *commit_page,
             struct buffer_page *tail_page, u64 *ts)
 {
+        struct buffer_page *commit_page = cpu_buffer->commit_page;
        struct ring_buffer *buffer = cpu_buffer->buffer;
        struct buffer_page *next_page;
        int ret;
@@ -1892,13 +1902,10 @@ static struct ring_buffer_event *
 __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                  unsigned type, unsigned long length, u64 *ts)
 {
-        struct buffer_page *tail_page, *commit_page;
+        struct buffer_page *tail_page;
        struct ring_buffer_event *event;
        unsigned long tail, write;
-        commit_page = cpu_buffer->commit_page;
-        /* we just need to protect against interrupts */
-        barrier();
        tail_page = cpu_buffer->tail_page;
        write = local_add_return(length, &tail_page->write);
@@ -1909,7 +1916,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
        /* See if we shot pass the end of this buffer page */
        if (write > BUF_PAGE_SIZE)
                return rb_move_tail(cpu_buffer, length, tail,
-                                    commit_page, tail_page, ts);
+                                    tail_page, ts);
        /* We reserved something on the buffer */
@@ -2237,12 +2244,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
        if (ring_buffer_flags != RB_BUFFERS_ON)
                return NULL;
-        if (atomic_read(&buffer->record_disabled))
-                return NULL;
        /* If we are tracing schedule, we don't want to recurse */
        resched = ftrace_preempt_disable();
+        if (atomic_read(&buffer->record_disabled))
+                goto out_nocheck;
        if (trace_recursive_lock())
                goto out_nocheck;
@@ -2474,11 +2481,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
        if (ring_buffer_flags != RB_BUFFERS_ON)
                return -EBUSY;
-        if (atomic_read(&buffer->record_disabled))
-                return -EBUSY;
        resched = ftrace_preempt_disable();
+        if (atomic_read(&buffer->record_disabled))
+                goto out;
        cpu = raw_smp_processor_id();
        if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -2546,7 +2553,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
 * @buffer: The ring buffer to enable writes
 *
 * Note, multiple disables will need the same number of enables
- * to truely enable the writing (much like preempt_disable).
+ * to truly enable the writing (much like preempt_disable).
 */
 void ring_buffer_record_enable(struct ring_buffer *buffer)
 {
@@ -2582,7 +2589,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
 * @cpu: The CPU to enable.
 *
 * Note, multiple disables will need the same number of enables
- * to truely enable the writing (much like preempt_disable).
+ * to truly enable the writing (much like preempt_disable).
 */
 void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
 {
@@ -2723,6 +2730,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
                iter->read_stamp = cpu_buffer->read_stamp;
        else
                iter->read_stamp = iter->head_page->page->time_stamp;
+        iter->cache_reader_page = cpu_buffer->reader_page;
+        iter->cache_read = cpu_buffer->read;
 }
 /**
@@ -2834,7 +2843,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
        int ret;
        local_irq_save(flags);
-        __raw_spin_lock(&cpu_buffer->lock);
+        arch_spin_lock(&cpu_buffer->lock);
 again:
        /*
@@ -2876,7 +2885,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
         * Splice the empty reader page into the list around the head.
         */
        reader = rb_set_head_page(cpu_buffer);
-        cpu_buffer->reader_page->list.next = reader->list.next;
+        cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
        cpu_buffer->reader_page->list.prev = reader->list.prev;
        /*
@@ -2913,7 +2922,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
         *
         * Now make the new head point back to the reader page.
         */
-        reader->list.next->prev = &cpu_buffer->reader_page->list;
+        rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
        rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
        /* Finally update the reader page to the new head */
@@ -2923,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
        goto again;
 out:
-        __raw_spin_unlock(&cpu_buffer->lock);
+        arch_spin_unlock(&cpu_buffer->lock);
        local_irq_restore(flags);
        return reader;
@@ -3067,13 +3076,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
        struct ring_buffer_event *event;
        int nr_loops = 0;
-        if (ring_buffer_iter_empty(iter))
-                return NULL;
        cpu_buffer = iter->cpu_buffer;
        buffer = cpu_buffer->buffer;
+        /*
+         * Check if someone performed a consuming read to
+         * the buffer. A consuming read invalidates the iterator
+         * and we need to reset the iterator in this case.
+         */
+        if (unlikely(iter->cache_read != cpu_buffer->read ||
+                     iter->cache_reader_page != cpu_buffer->reader_page))
+                rb_iter_reset(iter);
 again:
+        if (ring_buffer_iter_empty(iter))
+                return NULL;
        /*
         * We repeat when a timestamp is encountered.
         * We can get multiple timestamps by nested interrupts or also
@@ -3088,6 +3106,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
        if (rb_per_cpu_empty(cpu_buffer))
                return NULL;
+        if (iter->head >= local_read(&iter->head_page->page->commit)) {
+                rb_inc_iter(iter);
+                goto again;
+        }
        event = rb_iter_head_event(iter);
        switch (event->type_len) {
@@ -3286,9 +3309,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
        synchronize_sched();
        spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
-        __raw_spin_lock(&cpu_buffer->lock);
+        arch_spin_lock(&cpu_buffer->lock);
        rb_iter_reset(iter);
-        __raw_spin_unlock(&cpu_buffer->lock);
+        arch_spin_unlock(&cpu_buffer->lock);
        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
        return iter;
@@ -3408,11 +3431,11 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
        if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
                goto out;
-        __raw_spin_lock(&cpu_buffer->lock);
+        arch_spin_lock(&cpu_buffer->lock);
        rb_reset_cpu(cpu_buffer);
-        __raw_spin_unlock(&cpu_buffer->lock);
+        arch_spin_unlock(&cpu_buffer->lock);
 out:
        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 573d3cc762c3..df74c7982255 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -8,6 +8,7 @@
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/time.h>
+#include <asm/local.h>
 struct rb_page {
        u64             ts;
@@ -35,6 +36,28 @@ static int disable_reader;
 module_param(disable_reader, uint, 0644);
 MODULE_PARM_DESC(disable_reader, "only run producer");
+/*
+ * Tunables of the benchmark.  The module_param() type has to match the C
+ * type of the variable: all of these are plain ints, and the *_fifo knobs
+ * use a negative default (-1) to mean "do not use SCHED_FIFO", so they
+ * are registered as "int" rather than "uint".
+ */
+static int write_iteration = 50;
+module_param(write_iteration, int, 0644);
+MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
+static int producer_nice = 19;
+static int consumer_nice = 19;
+static int producer_fifo = -1;
+static int consumer_fifo = -1;
+module_param(producer_nice, int, 0644);
+MODULE_PARM_DESC(producer_nice, "nice prio for producer");
+module_param(consumer_nice, int, 0644);
+MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
+module_param(producer_fifo, int, 0644);
+MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
+module_param(consumer_fifo, int, 0644);
+MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
 static int read_events;
 static int kill_test;
@@ -208,15 +231,18 @@ static void ring_buffer_producer(void)
        do {
                struct ring_buffer_event *event;
                int *entry;
+                int i;
-                event = ring_buffer_lock_reserve(buffer, 10);
-                if (!event) {
+                for (i = 0; i < write_iteration; i++) {
-                        missed++;
+                        event = ring_buffer_lock_reserve(buffer, 10);
-                } else {
+                        if (!event) {
-                        hit++;
+                                missed++;
-                        entry = ring_buffer_event_data(event);
+                        } else {
-                        *entry = smp_processor_id();
+                                hit++;
-                        ring_buffer_unlock_commit(buffer, event);
+                                entry = ring_buffer_event_data(event);
+                                *entry = smp_processor_id();
+                                ring_buffer_unlock_commit(buffer, event);
+                        }
                }
                do_gettimeofday(&end_tv);
@@ -263,6 +289,27 @@ static void ring_buffer_producer(void)
        if (kill_test)
                trace_printk("ERROR!\n");
+        if (!disable_reader) {
+                if (consumer_fifo < 0)
+                        trace_printk("Running Consumer at nice: %d\n",
+                                     consumer_nice);
+                else
+                        trace_printk("Running Consumer at SCHED_FIFO %d\n",
+                                     consumer_fifo);
+        }
+        if (producer_fifo < 0)
+                trace_printk("Running Producer at nice: %d\n",
+                             producer_nice);
+        else
+                trace_printk("Running Producer at SCHED_FIFO %d\n",
+                             producer_fifo);
+        /* Let the user know that the test is running at low priority */
+        if (producer_fifo < 0 && consumer_fifo < 0 &&
+            producer_nice == 19 && consumer_nice == 19)
+                trace_printk("WARNING!!! This test is running at lowest priority.\n");
        trace_printk("Time:     %lld (usecs)\n", time);
        trace_printk("Overruns: %lld\n", overruns);
        if (disable_reader)
@@ -392,6 +439,27 @@ static int __init ring_buffer_benchmark_init(void)
        if (IS_ERR(producer))
                goto out_kill;
+        /*
+         * Run them as low-prio background tasks by default:
+         */
+        if (!disable_reader) {
+                if (consumer_fifo >= 0) {
+                        struct sched_param param = {
+                                .sched_priority = consumer_fifo
+                        };
+                        sched_setscheduler(consumer, SCHED_FIFO, &param);
+                } else
+                        set_user_nice(consumer, consumer_nice);
+        }
+        /* The producer runs at its own FIFO priority, not the consumer's. */
+        if (producer_fifo >= 0) {
+                struct sched_param param = {
+                        .sched_priority = producer_fifo
+                };
+                sched_setscheduler(producer, SCHED_FIFO, &param);
+        } else
+                set_user_nice(producer, producer_nice);
        return 0;
 out_kill:
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b20d3ec75de9..44f916a04065 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -12,7 +12,7 @@
 *  Copyright (C) 2004 William Lee Irwin III
 */
 #include <linux/ring_buffer.h>
-#include <linux/utsrelease.h>
+#include <generated/utsrelease.h>
 #include <linux/stacktrace.h>
 #include <linux/writeback.h>
 #include <linux/kallsyms.h>
@@ -32,10 +32,11 @@
 #include <linux/splice.h>
 #include <linux/kdebug.h>
 #include <linux/string.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/poll.h>
-#include <linux/gfp.h>
 #include <linux/fs.h>
 #include "trace.h"
@@ -86,25 +87,22 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
 */
 static int tracing_disabled = 1;
-DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
+DEFINE_PER_CPU(int, ftrace_cpu_disabled);
 static inline void ftrace_disable_cpu(void)
 {
        preempt_disable();
-        local_inc(&__get_cpu_var(ftrace_cpu_disabled));
+        __this_cpu_inc(ftrace_cpu_disabled);
 }
 static inline void ftrace_enable_cpu(void)
 {
-        local_dec(&__get_cpu_var(ftrace_cpu_disabled));
+        __this_cpu_dec(ftrace_cpu_disabled);
        preempt_enable();
 }
 static cpumask_var_t __read_mostly      tracing_buffer_mask;
-/* Define which cpu buffers are currently read in trace_pipe */
-static cpumask_var_t                    tracing_reader_cpumask;
 #define for_each_tracing_cpu(cpu)       \
        for_each_cpu(cpu, tracing_buffer_mask)
@@ -129,7 +127,7 @@ static int tracing_set_tracer(const char *buf);
 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
 static char *default_bootup_tracer;
-static int __init set_ftrace(char *str)
+static int __init set_cmdline_ftrace(char *str)
 {
        strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
        default_bootup_tracer = bootup_tracer_buf;
@@ -137,7 +135,7 @@ static int __init set_ftrace(char *str)
        ring_buffer_expanded = 1;
        return 1;
 }
-__setup("ftrace=", set_ftrace);
+__setup("ftrace=", set_cmdline_ftrace);
 static int __init set_ftrace_dump_on_oops(char *str)
 {
@@ -203,7 +201,7 @@ cycle_t ftrace_now(int cpu)
 */
 static struct trace_array       max_tr;
-static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
+static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
 /* tracer_enabled is used to toggle activation of a tracer */
 static int                      tracer_enabled = 1;
@@ -243,12 +241,91 @@ static struct tracer		*current_trace __read_mostly;
 /*
 * trace_types_lock is used to protect the trace_types list.
- * This lock is also used to keep user access serialized.
- * Accesses from userspace will grab this lock while userspace
- * activities happen inside the kernel.
 */
 static DEFINE_MUTEX(trace_types_lock);
+/*
+ * Serialize access to the ring buffer.
+ *
+ * The ring buffer serializes readers, but that is only low-level protection.
+ * The validity of the events (as returned by ring_buffer_peek() etc.)
+ * is not protected by the ring buffer.
+ *
+ * The content of events may become garbage if we allow another process to
+ * consume these events concurrently:
+ *   A) the page of the consumed events may become a normal page
+ *      (not the reader page) in the ring buffer, and this page will be
+ *      rewritten by the event producer.
+ *   B) the page of the consumed events may become a page for splice_read,
+ *      and this page will be returned to the system.
+ *
+ * These primitives allow multiple processes to access different cpu ring
+ * buffers concurrently.
+ *
+ * These primitives don't distinguish read-only and read-consume access.
+ * Multiple read-only accesses are also serialized.
+ */
+#ifdef CONFIG_SMP
+static DECLARE_RWSEM(all_cpu_access_lock);
+static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
+/*
+ * Acquire access to the ring buffer of @cpu, or to the whole ring buffer
+ * when @cpu is TRACE_PIPE_ALL_CPU.  Pair with trace_access_unlock().
+ */
+static inline void trace_access_lock(int cpu)
+{
+        if (cpu != TRACE_PIPE_ALL_CPU) {
+                /*
+                 * Single-cpu access: the read side of the rwsem keeps
+                 * out whole-buffer users, the per-cpu mutex keeps out
+                 * other users of this @cpu's buffer.
+                 */
+                down_read(&all_cpu_access_lock);
+                mutex_lock(&per_cpu(cpu_access_lock, cpu));
+                return;
+        }
+        /* Whole-buffer access: take the rwsem exclusively. */
+        down_write(&all_cpu_access_lock);
+}
+/*
+ * Release what trace_access_lock() took for the same @cpu argument; the
+ * single-cpu path drops its two locks in reverse order of acquisition.
+ */
+static inline void trace_access_unlock(int cpu)
+{
+        if (cpu != TRACE_PIPE_ALL_CPU) {
+                mutex_unlock(&per_cpu(cpu_access_lock, cpu));
+                up_read(&all_cpu_access_lock);
+                return;
+        }
+        up_write(&all_cpu_access_lock);
+}
+/* Initialise the per-cpu access mutex of every possible cpu. */
+static inline void trace_access_lock_init(void)
+{
+        int cpu;
+        for_each_possible_cpu(cpu)
+                mutex_init(&per_cpu(cpu_access_lock, cpu));
+}
+#else
+static DEFINE_MUTEX(access_lock);
+/* !CONFIG_SMP: one mutex covers every access; @cpu is ignored. */
+static inline void trace_access_lock(int cpu)
+{
+        (void)cpu;
+        mutex_lock(&access_lock);
+}
+/* !CONFIG_SMP: release the single access mutex; @cpu is ignored. */
+static inline void trace_access_unlock(int cpu)
+{
+        (void)cpu;
+        mutex_unlock(&access_lock);
+}
+/* !CONFIG_SMP: access_lock is set up by DEFINE_MUTEX(); nothing to do. */
+static inline void trace_access_lock_init(void)
+{
+}
+#endif
 /* trace_wait is a waitqueue for tasks blocked on trace_poll */
 static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
@@ -297,6 +374,21 @@ static int __init set_buf_size(char *str)
 }
 __setup("trace_buf_size=", set_buf_size);
+static int __init set_tracing_thresh(char *str)
+{
+        unsigned long threshhold;
+        int ret;
+        if (!str)
+                return 0;
+        ret = strict_strtoul(str, 0, &threshhold);
+        if (ret < 0)
+                return 0;
+        tracing_thresh = threshhold * 1000;
+        return 1;
+}
+__setup("tracing_thresh=", set_tracing_thresh);
 unsigned long nsecs_to_usecs(unsigned long nsecs)
 {
        return nsecs / 1000;
@@ -313,7 +405,6 @@ static const char *trace_options[] = {
        "bin",
        "block",
        "stacktrace",
-        "sched-tree",
        "trace_printk",
        "ftrace_preempt",
        "branch",
@@ -493,19 +584,20 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
 * protected by per_cpu spinlocks. But the action of the swap
 * needs its own lock.
 *
- * This is defined as a raw_spinlock_t in order to help
+ * This is defined as an arch_spinlock_t in order to help
 * with performance when lockdep debugging is enabled.
 *
 * It is also used in other places outside the update_max_tr
 * so it needs to be defined outside of the
 * CONFIG_TRACER_MAX_TRACE.
 */
-static raw_spinlock_t ftrace_max_lock =
+static arch_spinlock_t ftrace_max_lock =
-        (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+        (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+unsigned long __read_mostly     tracing_thresh;
 #ifdef CONFIG_TRACER_MAX_TRACE
 unsigned long __read_mostly     tracing_max_latency;
-unsigned long __read_mostly     tracing_thresh;
 /*
 * Copy the new maximum trace into the separate maximum-trace
@@ -516,7 +608,7 @@ static void
 __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 {
        struct trace_array_cpu *data = tr->data[cpu];
-        struct trace_array_cpu *max_data = tr->data[cpu];
+        struct trace_array_cpu *max_data;
        max_tr.cpu = cpu;
        max_tr.time_start = data->preempt_timestamp;
@@ -526,7 +618,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
        max_data->critical_start = data->critical_start;
        max_data->critical_end = data->critical_end;
-        memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
+        memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
        max_data->pid = tsk->pid;
        max_data->uid = task_uid(tsk);
        max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
@@ -555,13 +647,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
                return;
        WARN_ON_ONCE(!irqs_disabled());
-        __raw_spin_lock(&ftrace_max_lock);
+        arch_spin_lock(&ftrace_max_lock);
        tr->buffer = max_tr.buffer;
        max_tr.buffer = buf;
        __update_max_tr(tr, tsk, cpu);
-        __raw_spin_unlock(&ftrace_max_lock);
+        arch_spin_unlock(&ftrace_max_lock);
 }
 /**
@@ -581,7 +673,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
                return;
        WARN_ON_ONCE(!irqs_disabled());
-        __raw_spin_lock(&ftrace_max_lock);
+        arch_spin_lock(&ftrace_max_lock);
        ftrace_disable_cpu();
@@ -603,7 +695,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
        WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
        __update_max_tr(tr, tsk, cpu);
-        __raw_spin_unlock(&ftrace_max_lock);
+        arch_spin_unlock(&ftrace_max_lock);
 }
 #endif /* CONFIG_TRACER_MAX_TRACE */
@@ -748,10 +840,10 @@ out:
        mutex_unlock(&trace_types_lock);
 }
-static void __tracing_reset(struct trace_array *tr, int cpu)
+static void __tracing_reset(struct ring_buffer *buffer, int cpu)
 {
        ftrace_disable_cpu();
-        ring_buffer_reset_cpu(tr->buffer, cpu);
+        ring_buffer_reset_cpu(buffer, cpu);
        ftrace_enable_cpu();
 }
@@ -763,7 +855,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
        /* Make sure all commits have finished */
        synchronize_sched();
-        __tracing_reset(tr, cpu);
+        __tracing_reset(buffer, cpu);
        ring_buffer_record_enable(buffer);
 }
@@ -781,7 +873,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
        tr->time_start = ftrace_now(tr->cpu);
        for_each_online_cpu(cpu)
-                __tracing_reset(tr, cpu);
+                __tracing_reset(buffer, cpu);
        ring_buffer_record_enable(buffer);
 }
@@ -802,7 +894,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
 static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
 static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
 static int cmdline_idx;
-static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
 /* temporary disable recording */
 static atomic_t trace_record_cmdline_disabled __read_mostly;
@@ -858,6 +950,8 @@ void tracing_start(void)
                goto out;
        }
+        /* Prevent the buffers from switching */
+        arch_spin_lock(&ftrace_max_lock);
        buffer = global_trace.buffer;
        if (buffer)
@@ -867,6 +961,8 @@ void tracing_start(void)
        if (buffer)
                ring_buffer_record_enable(buffer);
+        arch_spin_unlock(&ftrace_max_lock);
        ftrace_start();
 out:
        spin_unlock_irqrestore(&tracing_start_lock, flags);
@@ -888,6 +984,9 @@ void tracing_stop(void)
        if (trace_stop_count++)
                goto out;
+        /* Prevent the buffers from switching */
+        arch_spin_lock(&ftrace_max_lock);
        buffer = global_trace.buffer;
        if (buffer)
                ring_buffer_record_disable(buffer);
@@ -896,6 +995,8 @@ void tracing_stop(void)
        if (buffer)
                ring_buffer_record_disable(buffer);
+        arch_spin_unlock(&ftrace_max_lock);
 out:
        spin_unlock_irqrestore(&tracing_start_lock, flags);
 }
@@ -915,7 +1016,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
         * nor do we want to disable interrupts,
         * so if we miss here, then better luck next time.
         */
-        if (!__raw_spin_trylock(&trace_cmdline_lock))
+        if (!arch_spin_trylock(&trace_cmdline_lock))
                return;
        idx = map_pid_to_cmdline[tsk->pid];
@@ -940,7 +1041,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
        memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
-        __raw_spin_unlock(&trace_cmdline_lock);
+        arch_spin_unlock(&trace_cmdline_lock);
 }
 void trace_find_cmdline(int pid, char comm[])
@@ -952,20 +1053,25 @@ void trace_find_cmdline(int pid, char comm[])
                return;
        }
+        if (WARN_ON_ONCE(pid < 0)) {
+                strcpy(comm, "<XXX>");
+                return;
+        }
        if (pid > PID_MAX_DEFAULT) {
                strcpy(comm, "<...>");
                return;
        }
        preempt_disable();
-        __raw_spin_lock(&trace_cmdline_lock);
+        arch_spin_lock(&trace_cmdline_lock);
        map = map_pid_to_cmdline[pid];
        if (map != NO_CMDLINE_MAP)
                strcpy(comm, saved_cmdlines[map]);
        else
                strcpy(comm, "<...>");
-        __raw_spin_unlock(&trace_cmdline_lock);
+        arch_spin_unlock(&trace_cmdline_lock);
        preempt_enable();
 }
@@ -1085,7 +1191,7 @@ trace_function(struct trace_array *tr,
        struct ftrace_entry *entry;
        /* If we are reading the ring buffer, don't trace */
-        if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+        if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
                return;
        event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1151,6 +1257,22 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
        __ftrace_trace_stack(tr->buffer, flags, skip, pc);
 }
+/**
+ * trace_dump_stack - record a stack back trace in the global trace buffer
+ */
+void trace_dump_stack(void)
+{
+        unsigned long flags;
+        if (tracing_disabled || tracing_selftest_running)
+                return;
+        local_save_flags(flags);
+        /* Skip 3 frames; that seems to land on the caller of this function. */
+        __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
+}
 void
 ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 {
@@ -1162,6 +1284,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
        if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
                return;
+        /*
+         * NMIs can not handle page faults, even with fix ups.
+         * The save user stack can (and often does) fault.
+         */
+        if (unlikely(in_nmi()))
+                return;
        event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
                                          sizeof(*entry), flags, pc);
        if (!event)
@@ -1251,8 +1380,8 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
 */
 int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 {
-        static raw_spinlock_t trace_buf_lock =
+        static arch_spinlock_t trace_buf_lock =
-                (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+                (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
        static u32 trace_buf[TRACE_BUF_SIZE];
        struct ftrace_event_call *call = &event_bprint;
@@ -1283,7 +1412,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
        /* Lockdep uses trace_printk for lock tracing */
        local_irq_save(flags);
-        __raw_spin_lock(&trace_buf_lock);
+        arch_spin_lock(&trace_buf_lock);
        len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
        if (len > TRACE_BUF_SIZE || len < 0)
@@ -1300,11 +1429,13 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
        entry->fmt                      = fmt;
        memcpy(entry->buf, trace_buf, sizeof(u32) * len);
-        if (!filter_check_discard(call, entry, buffer, event))
+        if (!filter_check_discard(call, entry, buffer, event)) {
                ring_buffer_unlock_commit(buffer, event);
+                ftrace_trace_stack(buffer, flags, 6, pc);
+        }
 out_unlock:
-        __raw_spin_unlock(&trace_buf_lock);
+        arch_spin_unlock(&trace_buf_lock);
        local_irq_restore(flags);
 out:
@@ -1334,7 +1465,7 @@ int trace_array_printk(struct trace_array *tr,
 int trace_array_vprintk(struct trace_array *tr,
                        unsigned long ip, const char *fmt, va_list args)
 {
-        static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
+        static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
        static char trace_buf[TRACE_BUF_SIZE];
        struct ftrace_event_call *call = &event_print;
@@ -1360,12 +1491,9 @@ int trace_array_vprintk(struct trace_array *tr,
        pause_graph_tracing();
        raw_local_irq_save(irq_flags);
-        __raw_spin_lock(&trace_buf_lock);
+        arch_spin_lock(&trace_buf_lock);
        len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
-        len = min(len, TRACE_BUF_SIZE-1);
-        trace_buf[len] = 0;
        size = sizeof(*entry) + len + 1;
        buffer = tr->buffer;
        event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
@@ -1373,15 +1501,17 @@ int trace_array_vprintk(struct trace_array *tr,
        if (!event)
                goto out_unlock;
        entry = ring_buffer_event_data(event);
-        entry->ip                       = ip;
+        entry->ip = ip;
        memcpy(&entry->buf, trace_buf, len);
-        entry->buf[len] = 0;
+        entry->buf[len] = '\0';
-        if (!filter_check_discard(call, entry, buffer, event))
+        if (!filter_check_discard(call, entry, buffer, event)) {
                ring_buffer_unlock_commit(buffer, event);
+                ftrace_trace_stack(buffer, irq_flags, 6, pc);
+        }
 out_unlock:
-        __raw_spin_unlock(&trace_buf_lock);
+        arch_spin_unlock(&trace_buf_lock);
        raw_local_irq_restore(irq_flags);
        unpause_graph_tracing();
 out:
@@ -1515,6 +1645,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
        int i = (int)*pos;
        void *ent;
+        WARN_ON_ONCE(iter->leftover);
        (*pos)++;
        /* can't go backwards */
@@ -1566,12 +1698,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
 }
 /*
- * No necessary locking here. The worst thing which can
- * happen is loosing events consumed at the same time
- * by a trace_pipe reader.
- * Other than that, we don't risk to crash the ring buffer
- * because it serializes the readers.
- *
 * The current tracer is copied to avoid a global locking
 * all around.
 */
@@ -1609,21 +1735,34 @@ static void *s_start(struct seq_file *m, loff_t *pos)
                ftrace_enable_cpu();
+                iter->leftover = 0;
                for (p = iter; p && l < *pos; p = s_next(m, p, &l))
                        ;
        } else {
-                l = *pos - 1;
+                /*
-                p = s_next(m, p, &l);
+                 * If we overflowed the seq_file before, then we want
+                 * to just reuse the trace_seq buffer again.
+                 */
+                if (iter->leftover)
+                        p = iter;
+                else {
+                        l = *pos - 1;
+                        p = s_next(m, p, &l);
+                }
        }
        trace_event_read_lock();
+        trace_access_lock(cpu_file);
        return p;
 }
 static void s_stop(struct seq_file *m, void *p)
 {
+        struct trace_iterator *iter = m->private;
        atomic_dec(&trace_record_cmdline_disabled);
+        trace_access_unlock(iter->cpu_file);
        trace_event_read_unlock();
 }
@@ -1922,6 +2061,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
 static int s_show(struct seq_file *m, void *v)
 {
        struct trace_iterator *iter = v;
+        int ret;
        if (iter->ent == NULL) {
                if (iter->tr) {
@@ -1941,9 +2081,27 @@ static int s_show(struct seq_file *m, void *v)
                        if (!(trace_flags & TRACE_ITER_VERBOSE))
                                print_func_help_header(m);
                }
+        } else if (iter->leftover) {
+                /*
+                 * If we filled the seq_file buffer earlier, we
+                 * want to just show it now.
+                 */
+                ret = trace_print_seq(m, &iter->seq);
+                /* ret should this time be zero, but you never know */
+                iter->leftover = ret;
        } else {
                print_trace_line(iter);
-                trace_print_seq(m, &iter->seq);
+                ret = trace_print_seq(m, &iter->seq);
+                /*
+                 * If we overflow the seq_file buffer, then it will
+                 * ask us for this data again at start up.
+                 * Use that instead.
+                 *  ret is 0 if seq_file write succeeded.
+                 *        -1 otherwise.
+                 */
+                iter->leftover = ret;
        }
        return 0;
@@ -2253,7 +2411,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
        mutex_lock(&tracing_cpumask_update_lock);
        local_irq_disable();
-        __raw_spin_lock(&ftrace_max_lock);
+        arch_spin_lock(&ftrace_max_lock);
        for_each_tracing_cpu(cpu) {
                /*
                 * Increase/decrease the disabled counter if we are
@@ -2268,7 +2426,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
                        atomic_dec(&global_trace.data[cpu]->disabled);
                }
        }
-        __raw_spin_unlock(&ftrace_max_lock);
+        arch_spin_unlock(&ftrace_max_lock);
        local_irq_enable();
        cpumask_copy(tracing_cpumask, tracing_cpumask_new);
@@ -2290,67 +2448,49 @@ static const struct file_operations tracing_cpumask_fops = {
        .write          = tracing_cpumask_write,
 };
-static ssize_t
+static int tracing_trace_options_show(struct seq_file *m, void *v)
-tracing_trace_options_read(struct file *filp, char __user *ubuf,
-                       size_t cnt, loff_t *ppos)
 {
        struct tracer_opt *trace_opts;
        u32 tracer_flags;
-        int len = 0;
-        char *buf;
-        int r = 0;
        int i;
-        /* calculate max size */
-        for (i = 0; trace_options[i]; i++) {
-                len += strlen(trace_options[i]);
-                len += 3; /* "no" and newline */
-        }
        mutex_lock(&trace_types_lock);
        tracer_flags = current_trace->flags->val;
        trace_opts = current_trace->flags->opts;
-        /*
-         * Increase the size with names of options specific
-         * of the current tracer.
-         */
-        for (i = 0; trace_opts[i].name; i++) {
-                len += strlen(trace_opts[i].name);
-                len += 3; /* "no" and newline */
-        }
-        /* +1 for \0 */
-        buf = kmalloc(len + 1, GFP_KERNEL);
-        if (!buf) {
-                mutex_unlock(&trace_types_lock);
-                return -ENOMEM;
-        }
        for (i = 0; trace_options[i]; i++) {
                if (trace_flags & (1 << i))
-                        r += sprintf(buf + r, "%s\n", trace_options[i]);
+                        seq_printf(m, "%s\n", trace_options[i]);
                else
-                        r += sprintf(buf + r, "no%s\n", trace_options[i]);
+                        seq_printf(m, "no%s\n", trace_options[i]);
        }
        for (i = 0; trace_opts[i].name; i++) {
                if (tracer_flags & trace_opts[i].bit)
-                        r += sprintf(buf + r, "%s\n",
+                        seq_printf(m, "%s\n", trace_opts[i].name);
-                                trace_opts[i].name);
                else
-                        r += sprintf(buf + r, "no%s\n",
+                        seq_printf(m, "no%s\n", trace_opts[i].name);
-                                trace_opts[i].name);
        }
        mutex_unlock(&trace_types_lock);
-        WARN_ON(r >= len + 1);
+        return 0;
+}
-        r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+static int __set_tracer_option(struct tracer *trace,
+                               struct tracer_flags *tracer_flags,
+                               struct tracer_opt *opts, int neg)
+{
+        int ret;
-        kfree(buf);
+        ret = trace->set_flag(tracer_flags->val, opts->bit, !neg);
-        return r;
+        if (ret)
+                return ret;
+        if (neg)
+                tracer_flags->val &= ~opts->bit;
+        else
+                tracer_flags->val |= opts->bit;
+        return 0;
 }
 /* Try to assign a tracer specific option */
@@ -2358,33 +2498,17 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
 {
        struct tracer_flags *tracer_flags = trace->flags;
        struct tracer_opt *opts = NULL;
-        int ret = 0, i = 0;
+        int i;
-        int len;
        for (i = 0; tracer_flags->opts[i].name; i++) {
                opts = &tracer_flags->opts[i];
-                len = strlen(opts->name);
-                if (strncmp(cmp, opts->name, len) == 0) {
+                if (strcmp(cmp, opts->name) == 0)
-                        ret = trace->set_flag(tracer_flags->val,
+                        return __set_tracer_option(trace, trace->flags,
-                                opts->bit, !neg);
+                                                   opts, neg);
-                        break;
-                }
        }
-        /* Not found */
-        if (!tracer_flags->opts[i].name)
-                return -EINVAL;
-        /* Refused to handle */
-        if (ret)
-                return ret;
-        if (neg)
-                tracer_flags->val &= ~opts->bit;
-        else
-                tracer_flags->val |= opts->bit;
-        return 0;
+        return -EINVAL;
 }
 static void set_tracer_flags(unsigned int mask, int enabled)
@@ -2404,7 +2528,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
                        size_t cnt, loff_t *ppos)
 {
        char buf[64];
-        char *cmp = buf;
+        char *cmp;
        int neg = 0;
        int ret;
        int i;
@@ -2416,16 +2540,15 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
                return -EFAULT;
        buf[cnt] = 0;
+        cmp = strstrip(buf);
-        if (strncmp(buf, "no", 2) == 0) {
+        if (strncmp(cmp, "no", 2) == 0) {
                neg = 1;
                cmp += 2;
        }
        for (i = 0; trace_options[i]; i++) {
-                int len = strlen(trace_options[i]);
+                if (strcmp(cmp, trace_options[i]) == 0) {
-                if (strncmp(cmp, trace_options[i], len) == 0) {
                        set_tracer_flags(1 << i, !neg);
                        break;
                }
@@ -2445,9 +2568,18 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
        return cnt;
 }
+static int tracing_trace_options_open(struct inode *inode, struct file *file)
+{
+        if (tracing_disabled)
+                return -ENODEV;
+        return single_open(file, tracing_trace_options_show, NULL);
+}
 static const struct file_operations tracing_iter_fops = {
-        .open           = tracing_open_generic,
+        .open           = tracing_trace_options_open,
-        .read           = tracing_trace_options_read,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
        .write          = tracing_trace_options_write,
 };
@@ -2821,22 +2953,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
        mutex_lock(&trace_types_lock);
-        /* We only allow one reader per cpu */
-        if (cpu_file == TRACE_PIPE_ALL_CPU) {
-                if (!cpumask_empty(tracing_reader_cpumask)) {
-                        ret = -EBUSY;
-                        goto out;
-                }
-                cpumask_setall(tracing_reader_cpumask);
-        } else {
-                if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
-                        cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
-                else {
-                        ret = -EBUSY;
-                        goto out;
-                }
-        }
        /* create a buffer to store the information to pass to userspace */
        iter = kzalloc(sizeof(*iter), GFP_KERNEL);
        if (!iter) {
@@ -2892,10 +3008,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
        mutex_lock(&trace_types_lock);
-        if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
+        if (iter->trace->pipe_close)
-                cpumask_clear(tracing_reader_cpumask);
+                iter->trace->pipe_close(iter);
-        else
-                cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
        mutex_unlock(&trace_types_lock);
@@ -3055,6 +3169,7 @@ waitagain:
        iter->pos = -1;
        trace_event_read_lock();
+        trace_access_lock(iter->cpu_file);
        while (find_next_entry_inc(iter) != NULL) {
                enum print_line_t ret;
                int len = iter->seq.len;
@@ -3071,6 +3186,7 @@ waitagain:
                if (iter->seq.len >= cnt)
                        break;
        }
+        trace_access_unlock(iter->cpu_file);
        trace_event_read_unlock();
        /* Now copy what we have to the user */
@@ -3103,7 +3219,7 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
        __free_page(spd->pages[idx]);
 }
-static struct pipe_buf_operations tracing_pipe_buf_ops = {
+static const struct pipe_buf_operations tracing_pipe_buf_ops = {
        .can_merge              = 0,
        .map                    = generic_pipe_buf_map,
        .unmap                  = generic_pipe_buf_unmap,
@@ -3196,6 +3312,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
        }
        trace_event_read_lock();
+        trace_access_lock(iter->cpu_file);
        /* Fill as many pages as possible. */
        for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
@@ -3219,6 +3336,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
                trace_seq_init(&iter->seq);
        }
+        trace_access_unlock(iter->cpu_file);
        trace_event_read_unlock();
        mutex_unlock(&iter->mutex);
@@ -3334,7 +3452,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
                                        size_t cnt, loff_t *fpos)
 {
        char *buf;
-        char *end;
        if (tracing_disabled)
                return -EINVAL;
@@ -3342,7 +3459,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
        if (cnt > TRACE_BUF_SIZE)
                cnt = TRACE_BUF_SIZE;
-        buf = kmalloc(cnt + 1, GFP_KERNEL);
+        buf = kmalloc(cnt + 2, GFP_KERNEL);
        if (buf == NULL)
                return -ENOMEM;
@@ -3350,35 +3467,31 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
                kfree(buf);
                return -EFAULT;
        }
+        if (buf[cnt-1] != '\n') {
+                buf[cnt] = '\n';
+                buf[cnt+1] = '\0';
+        } else
+                buf[cnt] = '\0';
-        /* Cut from the first nil or newline. */
+        cnt = mark_printk("%s", buf);
-        buf[cnt] = '\0';
-        end = strchr(buf, '\n');
-        if (end)
-                *end = '\0';
-        cnt = mark_printk("%s\n", buf);
        kfree(buf);
        *fpos += cnt;
        return cnt;
 }
-static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf,
+static int tracing_clock_show(struct seq_file *m, void *v)
-                                  size_t cnt, loff_t *ppos)
 {
-        char buf[64];
-        int bufiter = 0;
        int i;
        for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
-                bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter,
+                seq_printf(m,
                        "%s%s%s%s", i ? " " : "",
                        i == trace_clock_id ? "[" : "", trace_clocks[i].name,
                        i == trace_clock_id ? "]" : "");
-        bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n");
+        seq_putc(m, '\n');
-        return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter);
+        return 0;
 }
 static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
@@ -3420,6 +3533,13 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
        return cnt;
 }
+static int tracing_clock_open(struct inode *inode, struct file *file)
+{
+        if (tracing_disabled)
+                return -ENODEV;
+        return single_open(file, tracing_clock_show, NULL);
+}
 static const struct file_operations tracing_max_lat_fops = {
        .open           = tracing_open_generic,
        .read           = tracing_max_lat_read,
@@ -3458,8 +3578,10 @@ static const struct file_operations tracing_mark_fops = {
 };
 static const struct file_operations trace_clock_fops = {
-        .open           = tracing_open_generic,
+        .open           = tracing_clock_open,
-        .read           = tracing_clock_read,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
        .write          = tracing_clock_write,
 };
@@ -3516,10 +3638,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
        info->read = 0;
+        trace_access_lock(info->cpu);
        ret = ring_buffer_read_page(info->tr->buffer,
                                    &info->spare,
                                    count,
                                    info->cpu, 0);
+        trace_access_unlock(info->cpu);
        if (ret < 0)
                return 0;
@@ -3589,7 +3713,7 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
 }
 /* Pipe buffer operations for a buffer. */
-static struct pipe_buf_operations buffer_pipe_buf_ops = {
+static const struct pipe_buf_operations buffer_pipe_buf_ops = {
        .can_merge              = 0,
        .map                    = generic_pipe_buf_map,
        .unmap                  = generic_pipe_buf_unmap,
@@ -3647,6 +3771,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                len &= PAGE_MASK;
        }
+        trace_access_lock(info->cpu);
        entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
        for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
@@ -3694,6 +3819,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
        }
+        trace_access_unlock(info->cpu);
        spd.nr_pages = i;
        /* did we read anything? */
@@ -3730,7 +3856,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
        s = kmalloc(sizeof(*s), GFP_KERNEL);
        if (!s)
-                return ENOMEM;
+                return -ENOMEM;
        trace_seq_init(s);
@@ -3920,39 +4046,16 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
        if (ret < 0)
                return ret;
-        ret = 0;
+        if (val != 0 && val != 1)
-        switch (val) {
+                return -EINVAL;
-        case 0:
-                /* do nothing if already cleared */
-                if (!(topt->flags->val & topt->opt->bit))
-                        break;
-                mutex_lock(&trace_types_lock);
-                if (current_trace->set_flag)
-                        ret = current_trace->set_flag(topt->flags->val,
-                                                      topt->opt->bit, 0);
-                mutex_unlock(&trace_types_lock);
-                if (ret)
-                        return ret;
-                topt->flags->val &= ~topt->opt->bit;
-                break;
-        case 1:
-                /* do nothing if already set */
-                if (topt->flags->val & topt->opt->bit)
-                        break;
+        if (!!(topt->flags->val & topt->opt->bit) != val) {
                mutex_lock(&trace_types_lock);
-                if (current_trace->set_flag)
+                ret = __set_tracer_option(current_trace, topt->flags,
-                        ret = current_trace->set_flag(topt->flags->val,
+                                          topt->opt, !val);
-                                                      topt->opt->bit, 1);
                mutex_unlock(&trace_types_lock);
                if (ret)
                        return ret;
-                topt->flags->val |= topt->opt->bit;
-                break;
-        default:
-                return -EINVAL;
        }
        *ppos += cnt;
@@ -4153,6 +4256,8 @@ static __init int tracer_init_debugfs(void)
        struct dentry *d_tracer;
        int cpu;
+        trace_access_lock_init();
        d_tracer = tracing_init_dentry();
        trace_create_file("tracing_enabled", 0644, d_tracer,
@@ -4176,10 +4281,10 @@ static __init int tracer_init_debugfs(void)
 #ifdef CONFIG_TRACER_MAX_TRACE
        trace_create_file("tracing_max_latency", 0644, d_tracer,
                        &tracing_max_latency, &tracing_max_lat_fops);
+#endif
        trace_create_file("tracing_thresh", 0644, d_tracer,
                        &tracing_thresh, &tracing_max_lat_fops);
-#endif
        trace_create_file("README", 0444, d_tracer,
                        NULL, &tracing_readme_fops);
@@ -4279,8 +4384,8 @@ trace_printk_seq(struct trace_seq *s)
 static void __ftrace_dump(bool disable_tracing)
 {
-        static raw_spinlock_t ftrace_dump_lock =
+        static arch_spinlock_t ftrace_dump_lock =
-                (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+                (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
        /* use static because iter can be a bit big for the stack */
        static struct trace_iterator iter;
        unsigned int old_userobj;
@@ -4290,7 +4395,7 @@ static void __ftrace_dump(bool disable_tracing)
        /* only one dump */
        local_irq_save(flags);
-        __raw_spin_lock(&ftrace_dump_lock);
+        arch_spin_lock(&ftrace_dump_lock);
        if (dump_ran)
                goto out;
@@ -4365,7 +4470,7 @@ static void __ftrace_dump(bool disable_tracing)
        }
 out:
-        __raw_spin_unlock(&ftrace_dump_lock);
+        arch_spin_unlock(&ftrace_dump_lock);
        local_irq_restore(flags);
 }
@@ -4387,9 +4492,6 @@ __init static int tracer_alloc_buffers(void)
        if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
                goto out_free_buffer_mask;
-        if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
-                goto out_free_tracing_cpumask;
        /* To save memory, keep the ring buffer size to its minimum */
        if (ring_buffer_expanded)
                ring_buf_size = trace_buf_size;
@@ -4426,7 +4528,7 @@ __init static int tracer_alloc_buffers(void)
        /* Allocate the first page for all buffers */
        for_each_tracing_cpu(i) {
                global_trace.data[i] = &per_cpu(global_trace_cpu, i);
-                max_tr.data[i] = &per_cpu(max_data, i);
+                max_tr.data[i] = &per_cpu(max_tr_data, i);
        }
        trace_init_cmdlines();
@@ -4447,8 +4549,6 @@ __init static int tracer_alloc_buffers(void)
        return 0;
 out_free_cpumask:
-        free_cpumask_var(tracing_reader_cpumask);
-out_free_tracing_cpumask:
        free_cpumask_var(tracing_cpumask);
 out_free_buffer_mask:
        free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 405cb850b75d..2825ef2c0b15 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,6 +11,7 @@
 #include <linux/ftrace.h>
 #include <trace/boot.h>
 #include <linux/kmemtrace.h>
+#include <linux/hw_breakpoint.h>
 #include <linux/trace_seq.h>
 #include <linux/ftrace_event.h>
@@ -37,6 +38,7 @@ enum trace_type {
        TRACE_KMEM_ALLOC,
        TRACE_KMEM_FREE,
        TRACE_BLK,
+        TRACE_KSYM,
        __TRACE_LAST_TYPE,
 };
@@ -98,9 +100,32 @@ struct syscall_trace_enter {
 struct syscall_trace_exit {
        struct trace_entry      ent;
        int                     nr;
-        unsigned long           ret;
+        long                    ret;
 };
+/*
+ * Trace record that starts with the common trace_entry header and ends
+ * with a flexible array of argument words.
+ * NOTE(review): nargs presumably holds the number of valid args[] slots;
+ * confirm against the code that fills the record.
+ */
+struct kprobe_trace_entry {
+        struct trace_entry      ent;
+        unsigned long           ip;
+        int                     nargs;
+        unsigned long           args[];
+};
+/* Size in bytes of a struct kprobe_trace_entry with room for n args[] slots */
+#define SIZEOF_KPROBE_TRACE_ENTRY(n)                    \
+        (offsetof(struct kprobe_trace_entry, args) +    \
+        (sizeof(unsigned long) * (n)))
+/*
+ * Trace record that starts with the common trace_entry header, carries
+ * two address words (func and ret_ip) and ends with a flexible array of
+ * argument words.
+ * NOTE(review): nargs presumably holds the number of valid args[] slots;
+ * confirm against the code that fills the record.
+ */
+struct kretprobe_trace_entry {
+        struct trace_entry      ent;
+        unsigned long           func;
+        unsigned long           ret_ip;
+        int                     nargs;
+        unsigned long           args[];
+};
+/* Size in bytes of a struct kretprobe_trace_entry with room for n args[] slots */
+#define SIZEOF_KRETPROBE_TRACE_ENTRY(n)                 \
+        (offsetof(struct kretprobe_trace_entry, args) + \
+        (sizeof(unsigned long) * (n)))
 /*
 * trace_flag_type is an enumeration that holds different
 * states when a trace occurs. These are:
@@ -209,6 +234,7 @@ extern void __ftrace_bad_type(void);
                          TRACE_KMEM_ALLOC);    \
                IF_ASSIGN(var, ent, struct kmemtrace_free_entry,        \
                          TRACE_KMEM_FREE);     \
+                IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
                __ftrace_bad_type();                                    \
        } while (0)
@@ -246,6 +272,7 @@ struct tracer_flags {
 * @pipe_open: called when the trace_pipe file is opened
 * @wait_pipe: override how the user waits for traces on trace_pipe
 * @close: called when the trace file is released
+ * @pipe_close: called when the trace_pipe file is released
 * @read: override the default read callback on trace_pipe
 * @splice_read: override the default splice_read callback on trace_pipe
 * @selftest: selftest to run on boot (see trace_selftest.c)
@@ -264,6 +291,7 @@ struct tracer {
        void                    (*pipe_open)(struct trace_iterator *iter);
        void                    (*wait_pipe)(struct trace_iterator *iter);
        void                    (*close)(struct trace_iterator *iter);
+        void                    (*pipe_close)(struct trace_iterator *iter);
        ssize_t                 (*read)(struct trace_iterator *iter,
                                        struct file *filp, char __user *ubuf,
                                        size_t cnt, loff_t *ppos);
@@ -364,11 +392,14 @@ int register_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
 int is_tracing_stopped(void);
+extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
 extern unsigned long nsecs_to_usecs(unsigned long nsecs);
+extern unsigned long tracing_thresh;
 #ifdef CONFIG_TRACER_MAX_TRACE
 extern unsigned long tracing_max_latency;
-extern unsigned long tracing_thresh;
 void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
 void update_max_tr_single(struct trace_array *tr,
@@ -413,7 +444,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
 extern int ring_buffer_expanded;
 extern bool tracing_selftest_disabled;
-DECLARE_PER_CPU(local_t, ftrace_cpu_disabled);
+DECLARE_PER_CPU(int, ftrace_cpu_disabled);
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 extern int trace_selftest_startup_function(struct tracer *trace,
@@ -438,6 +469,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
                                         struct trace_array *tr);
 extern int trace_selftest_startup_hw_branches(struct tracer *trace,
                                              struct trace_array *tr);
+extern int trace_selftest_startup_ksym(struct tracer *trace,
+                                         struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 extern void *head_page(struct trace_array_cpu *data);
@@ -465,6 +498,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
 #ifdef CONFIG_DYNAMIC_FTRACE
 /* TODO: make this variable */
 #define FTRACE_GRAPH_MAX_FUNCS          32
+extern int ftrace_graph_filter_enabled;
 extern int ftrace_graph_count;
 extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
@@ -472,7 +506,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
 {
        int i;
-        if (!ftrace_graph_count || test_tsk_trace_graph(current))
+        if (!ftrace_graph_filter_enabled)
                return 1;
        for (i = 0; i < ftrace_graph_count; i++) {
@@ -483,10 +517,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
        return 0;
 }
 #else
-static inline int ftrace_trace_addr(unsigned long addr)
-{
-        return 1;
-}
 static inline int ftrace_graph_addr(unsigned long addr)
 {
        return 1;
@@ -500,12 +530,12 @@ print_graph_function(struct trace_iterator *iter)
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-extern struct pid *ftrace_pid_trace;
+extern struct list_head ftrace_pids;
 #ifdef CONFIG_FUNCTION_TRACER
 static inline int ftrace_trace_task(struct task_struct *task)
 {
-        if (!ftrace_pid_trace)
+        if (list_empty(&ftrace_pids))
                return 1;
        return test_tsk_trace_trace(task);
@@ -521,7 +551,7 @@ static inline int ftrace_trace_task(struct task_struct *task)
 * struct trace_parser - servers for reading the user input separated by spaces
 * @cont: set if the input is not complete - no final space char was found
 * @buffer: holds the parsed user input
- * @idx: user input lenght
+ * @idx: user input length
 * @size: buffer size
 */
 struct trace_parser {
@@ -569,18 +599,17 @@ enum trace_iterator_flags {
        TRACE_ITER_BIN                  = 0x40,
        TRACE_ITER_BLOCK                = 0x80,
        TRACE_ITER_STACKTRACE           = 0x100,
-        TRACE_ITER_SCHED_TREE           = 0x200,
+        TRACE_ITER_PRINTK               = 0x200,
-        TRACE_ITER_PRINTK               = 0x400,
+        TRACE_ITER_PREEMPTONLY          = 0x400,
-        TRACE_ITER_PREEMPTONLY          = 0x800,
+        TRACE_ITER_BRANCH               = 0x800,
-        TRACE_ITER_BRANCH               = 0x1000,
+        TRACE_ITER_ANNOTATE             = 0x1000,
-        TRACE_ITER_ANNOTATE             = 0x2000,
+        TRACE_ITER_USERSTACKTRACE       = 0x2000,
-        TRACE_ITER_USERSTACKTRACE       = 0x4000,
+        TRACE_ITER_SYM_USEROBJ          = 0x4000,
-        TRACE_ITER_SYM_USEROBJ          = 0x8000,
+        TRACE_ITER_PRINTK_MSGONLY       = 0x8000,
-        TRACE_ITER_PRINTK_MSGONLY       = 0x10000,
+        TRACE_ITER_CONTEXT_INFO         = 0x10000, /* Print pid/cpu/time */
-        TRACE_ITER_CONTEXT_INFO         = 0x20000, /* Print pid/cpu/time */
+        TRACE_ITER_LATENCY_FMT          = 0x20000,
-        TRACE_ITER_LATENCY_FMT          = 0x40000,
+        TRACE_ITER_SLEEP_TIME           = 0x40000,
-        TRACE_ITER_SLEEP_TIME           = 0x80000,
+        TRACE_ITER_GRAPH_TIME           = 0x80000,
-        TRACE_ITER_GRAPH_TIME           = 0x100000,
 };
 /*
@@ -687,7 +716,6 @@ struct event_filter {
        int                     n_preds;
        struct filter_pred      **preds;
        char                    *filter_string;
-        bool                    no_reset;
 };
 struct event_subsystem {
@@ -699,22 +727,40 @@ struct event_subsystem {
 };
 struct filter_pred;
+struct regex;
 typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
                                 int val1, int val2);
+typedef int (*regex_match_func)(char *str, struct regex *r, int len);
+enum regex_type {
+        MATCH_FULL = 0,
+        MATCH_FRONT_ONLY,
+        MATCH_MIDDLE_ONLY,
+        MATCH_END_ONLY,
+};
+struct regex {
+        char                    pattern[MAX_FILTER_STR_VAL];
+        int                     len;
+        int                     field_len;
+        regex_match_func        match;
+};
 struct filter_pred {
-        filter_pred_fn_t fn;
+        filter_pred_fn_t        fn;
-        u64 val;
+        u64                     val;
-        char str_val[MAX_FILTER_STR_VAL];
+        struct regex            regex;
-        int str_len;
+        char                    *field_name;
-        char *field_name;
+        int                     offset;
-        int offset;
+        int                     not;
-        int not;
+        int                     op;
-        int op;
+        int                     pop_n;
-        int pop_n;
 };
+extern enum regex_type
+filter_parse_regex(char *buff, int len, char **search, int *not);
 extern void print_event_filter(struct ftrace_event_call *call,
                               struct trace_seq *s);
 extern int apply_event_filter(struct ftrace_event_call *call,
@@ -730,7 +776,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
                     struct ring_buffer *buffer,
                     struct ring_buffer_event *event)
 {
-        if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
+        if (unlikely(call->filter_active) &&
+            !filter_match_preds(call->filter, rec)) {
                ring_buffer_discard_commit(buffer, event);
                return 1;
        }
@@ -746,7 +793,8 @@ extern const char *__stop___trace_bprintk_fmt[];
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(call, struct_name, id, tstruct, print)             \
-        extern struct ftrace_event_call event_##call;
+        extern struct ftrace_event_call                                 \
+        __attribute__((__aligned__(4))) event_##call;
 #undef FTRACE_ENTRY_DUP
 #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print)         \
        FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4a194f08f88c..b9bc4d470177 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
                return -1;
        if (percent_a > percent_b)
                return 1;
-        else
-                return 0;
+        if (a->incorrect < b->incorrect)
+                return -1;
+        if (a->incorrect > b->incorrect)
+                return 1;
+        /*
+         * Since the above shows worse (incorrect) cases
+         * first, we continue that by showing best (correct)
+         * cases last.
+         */
+        if (a->correct > b->correct)
+                return -1;
+        if (a->correct < b->correct)
+                return 1;
+        return 0;
 }
 static struct tracer_stat annotated_branch_stats = {
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 20c5f92e28a8..9d589d8dcd1a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -13,6 +13,7 @@
 * Tracer plugins will chose a default from these clocks.
 */
 #include <linux/spinlock.h>
+#include <linux/irqflags.h>
 #include <linux/hardirq.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
@@ -20,6 +21,8 @@
 #include <linux/ktime.h>
 #include <linux/trace_clock.h>
+#include "trace.h"
 /*
 * trace_clock_local(): the simplest and least coherent tracing clock.
 *
@@ -28,17 +31,17 @@
 */
 u64 notrace trace_clock_local(void)
 {
-        unsigned long flags;
        u64 clock;
+        int resched;
        /*
         * sched_clock() is an architecture implemented, fast, scalable,
         * lockless clock. It is not guaranteed to be coherent across
         * CPUs, nor across CPU idle events.
         */
-        raw_local_irq_save(flags);
+        resched = ftrace_preempt_disable();
        clock = sched_clock();
-        raw_local_irq_restore(flags);
+        ftrace_preempt_enable(resched);
        return clock;
 }
@@ -69,10 +72,10 @@ u64 notrace trace_clock(void)
 /* keep prev_time and lock in the same cacheline. */
 static struct {
        u64 prev_time;
-        raw_spinlock_t lock;
+        arch_spinlock_t lock;
 } trace_clock_struct ____cacheline_aligned_in_smp =
        {
-                .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED,
+                .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED,
        };
 u64 notrace trace_clock_global(void)
@@ -81,7 +84,7 @@ u64 notrace trace_clock_global(void)
        int this_cpu;
        u64 now;
-        raw_local_irq_save(flags);
+        local_irq_save(flags);
        this_cpu = raw_smp_processor_id();
        now = cpu_clock(this_cpu);
@@ -92,7 +95,7 @@ u64 notrace trace_clock_global(void)
        if (unlikely(in_nmi()))
                goto out;
-        __raw_spin_lock(&trace_clock_struct.lock);
+        arch_spin_lock(&trace_clock_struct.lock);
        /*
         * TODO: if this happens often then maybe we should reset
@@ -104,10 +107,10 @@ u64 notrace trace_clock_global(void)
        trace_clock_struct.prev_time = now;
-        __raw_spin_unlock(&trace_clock_struct.lock);
+        arch_spin_unlock(&trace_clock_struct.lock);
 out:
-        raw_local_irq_restore(flags);
+        local_irq_restore(flags);
        return now;
 }
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ead3d724599d..c16a08f399df 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
        F_printk("type:%u call_site:%lx ptr:%p",
                 __entry->type_id, __entry->call_site, __entry->ptr)
 );
+FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
+        TRACE_KSYM,
+        F_STRUCT(
+                __field(        unsigned long,  ip                        )
+                __field(        unsigned char,  type                      )
+                __array(        char         ,  cmd,       TASK_COMM_LEN  )
+                __field(        unsigned long,  addr                      )
+        ),
+        F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
+                (void *)__entry->ip, (unsigned int)__entry->type,
+                (void *)__entry->addr,  __entry->cmd)
+);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
new file mode 100644
index 000000000000..0565bb42566f
--- /dev/null
+++ b/kernel/trace/trace_event_perf.c
@@ -0,0 +1,175 @@
+/*
+ * trace event based perf event profiling/tracing
+ *
+ * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
+ */
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include "trace.h"
+DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
+EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
+EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
+static char *perf_trace_buf;
+static char *perf_trace_buf_nmi;
+/*
+ * Force it to be aligned to unsigned long to avoid misaligned access
+ * surprises
+ */
+typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
+        perf_trace_t;
+/* Count the events in use (per event id, not per instance) */
+static int      total_ref_count;
+/*
+ * Take a perf reference on @event and enable it on the first reference.
+ *
+ * If no event is in use yet (total_ref_count == 0), the regular and NMI
+ * per-cpu trace buffers are allocated and published before the event's
+ * perf_event_enable() callback runs.
+ *
+ * Returns 0 on success, -ENOMEM if a buffer allocation fails, or the
+ * error returned by event->perf_event_enable().
+ */
+static int perf_trace_event_enable(struct ftrace_event_call *event)
+{
+        char *buf;
+        int ret = -ENOMEM;
+        /* Already enabled for perf: taking the extra reference is enough */
+        if (event->perf_refcount++ > 0)
+                return 0;
+        /* First event in use overall: set up the shared per-cpu buffers */
+        if (!total_ref_count) {
+                buf = (char *)alloc_percpu(perf_trace_t);
+                if (!buf)
+                        goto fail_buf;
+                rcu_assign_pointer(perf_trace_buf, buf);
+                buf = (char *)alloc_percpu(perf_trace_t);
+                if (!buf)
+                        goto fail_buf_nmi;
+                rcu_assign_pointer(perf_trace_buf_nmi, buf);
+        }
+        ret = event->perf_event_enable(event);
+        if (!ret) {
+                total_ref_count++;
+                return 0;
+        }
+fail_buf_nmi:
+        /* Only release the buffers if no other event is using them */
+        if (!total_ref_count) {
+                free_percpu(perf_trace_buf_nmi);
+                free_percpu(perf_trace_buf);
+                perf_trace_buf_nmi = NULL;
+                perf_trace_buf = NULL;
+        }
+fail_buf:
+        /* Undo the reference taken at the top */
+        event->perf_refcount--;
+        return ret;
+}
+/*
+ * Enable perf tracing for the trace event whose id is @event_id.
+ *
+ * The event is looked up in ftrace_events under event_mutex. It must
+ * provide a perf_event_enable() callback, and a reference on its module
+ * is taken before it is enabled.
+ *
+ * Returns 0 on success, -EINVAL if no suitable event was found, or the
+ * error returned by perf_trace_event_enable().
+ */
+int perf_trace_enable(int event_id)
+{
+        struct ftrace_event_call *event;
+        int ret = -EINVAL;
+        mutex_lock(&event_mutex);
+        list_for_each_entry(event, &ftrace_events, list) {
+                if (event->id == event_id && event->perf_event_enable &&
+                    try_module_get(event->mod)) {
+                        ret = perf_trace_event_enable(event);
+                        /*
+                         * The module reference is otherwise only dropped
+                         * by perf_trace_disable(), so release it here when
+                         * enabling failed to avoid leaking it.
+                         */
+                        if (ret)
+                                module_put(event->mod);
+                        break;
+                }
+        }
+        mutex_unlock(&event_mutex);
+        return ret;
+}
+/*
+ * Drop a perf reference on @event and disable it when the last reference
+ * goes away.
+ *
+ * When the last in-use event is disabled (total_ref_count drops to 0),
+ * both per-cpu trace buffers are unpublished and then freed after
+ * synchronize_sched() returns.
+ */
+static void perf_trace_event_disable(struct ftrace_event_call *event)
+{
+        char *buf, *nmi_buf;
+        /* Other perf users of this event remain: nothing more to do */
+        if (--event->perf_refcount > 0)
+                return;
+        event->perf_event_disable(event);
+        if (!--total_ref_count) {
+                /* Keep local copies so the buffers can be freed after unpublishing */
+                buf = perf_trace_buf;
+                rcu_assign_pointer(perf_trace_buf, NULL);
+                nmi_buf = perf_trace_buf_nmi;
+                rcu_assign_pointer(perf_trace_buf_nmi, NULL);
+                /*
+                 * Ensure all events in profiling have finished before
+                 * releasing the buffers
+                 */
+                synchronize_sched();
+                free_percpu(buf);
+                free_percpu(nmi_buf);
+        }
+}
+/*
+ * Disable perf tracing for the trace event whose id is @event_id.
+ *
+ * The event is looked up in ftrace_events under event_mutex; its perf
+ * reference is dropped via perf_trace_event_disable() and a module
+ * reference is released with module_put(). Nothing happens if no event
+ * matches.
+ */
+void perf_trace_disable(int event_id)
+{
+        struct ftrace_event_call *event;
+        mutex_lock(&event_mutex);
+        list_for_each_entry(event, &ftrace_events, list) {
+                if (event->id == event_id) {
+                        perf_trace_event_disable(event);
+                        module_put(event->mod);
+                        break;
+                }
+        }
+        mutex_unlock(&event_mutex);
+}
+/*
+ * Prepare this CPU's perf trace buffer for a new entry.
+ *
+ * @size:      size in bytes of the entry; its last sizeof(u64) bytes
+ *             are zeroed
+ * @type:      value stored in the entry header's type field
+ * @rctxp:     receives the perf software-event recursion context
+ * @irq_flags: receives the saved interrupt flags
+ *
+ * Returns a pointer to the per-cpu buffer with the common trace_entry
+ * header filled in, or NULL if no recursion context could be obtained
+ * or the buffers are not allocated.
+ *
+ * On success interrupts are left disabled and the recursion context is
+ * still held. NOTE(review): presumably the caller releases both once the
+ * entry has been submitted; confirm against the callers.
+ */
+__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
+                                       int *rctxp, unsigned long *irq_flags)
+{
+        struct trace_entry *entry;
+        char *trace_buf, *raw_data;
+        int pc, cpu;
+        BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
+        /* Sample the preempt count before interrupts are disabled below */
+        pc = preempt_count();
+        /* Protect the per cpu buffer, begin the rcu read side */
+        local_irq_save(*irq_flags);
+        *rctxp = perf_swevent_get_recursion_context();
+        if (*rctxp < 0)
+                goto err_recursion;
+        cpu = smp_processor_id();
+        /* NMI context uses its own buffer, separate from the regular one */
+        if (in_nmi())
+                trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
+        else
+                trace_buf = rcu_dereference_sched(perf_trace_buf);
+        if (!trace_buf)
+                goto err;
+        raw_data = per_cpu_ptr(trace_buf, cpu);
+        /* zero the dead bytes from align to not leak stack to user */
+        memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
+        entry = (struct trace_entry *)raw_data;
+        tracing_generic_entry_update(entry, *irq_flags, pc);
+        entry->type = type;
+        return raw_data;
+err:
+        perf_swevent_put_recursion_context(*rctxp);
+err_recursion:
+        local_irq_restore(*irq_flags);
+        return NULL;
+}
+EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
deleted file mode 100644
index 8d5c171cc998..000000000000
--- a/kernel/trace/trace_event_profile.c
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * trace event based perf counter profiling
- *
- * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
- *
- */
-#include <linux/module.h>
-#include "trace.h"
-/*
- * We can't use a size but a type in alloc_percpu()
- * So let's create a dummy type that matches the desired size
- */
-typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
-char            *trace_profile_buf;
-EXPORT_SYMBOL_GPL(trace_profile_buf);
-char            *trace_profile_buf_nmi;
-EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
-/* Count the events in use (per event id, not per instance) */
-static int      total_profile_count;
-static int ftrace_profile_enable_event(struct ftrace_event_call *event)
-{
-        char *buf;
-        int ret = -ENOMEM;
-        if (atomic_inc_return(&event->profile_count))
-                return 0;
-        if (!total_profile_count) {
-                buf = (char *)alloc_percpu(profile_buf_t);
-                if (!buf)
-                        goto fail_buf;
-                rcu_assign_pointer(trace_profile_buf, buf);
-                buf = (char *)alloc_percpu(profile_buf_t);
-                if (!buf)
-                        goto fail_buf_nmi;
-                rcu_assign_pointer(trace_profile_buf_nmi, buf);
-        }
-        ret = event->profile_enable();
-        if (!ret) {
-                total_profile_count++;
-                return 0;
-        }
-fail_buf_nmi:
-        if (!total_profile_count) {
-                free_percpu(trace_profile_buf_nmi);
-                free_percpu(trace_profile_buf);
-                trace_profile_buf_nmi = NULL;
-                trace_profile_buf = NULL;
-        }
-fail_buf:
-        atomic_dec(&event->profile_count);
-        return ret;
-}
-int ftrace_profile_enable(int event_id)
-{
-        struct ftrace_event_call *event;
-        int ret = -EINVAL;
-        mutex_lock(&event_mutex);
-        list_for_each_entry(event, &ftrace_events, list) {
-                if (event->id == event_id && event->profile_enable &&
-                    try_module_get(event->mod)) {
-                        ret = ftrace_profile_enable_event(event);
-                        break;
-                }
-        }
-        mutex_unlock(&event_mutex);
-        return ret;
-}
-static void ftrace_profile_disable_event(struct ftrace_event_call *event)
-{
-        char *buf, *nmi_buf;
-        if (!atomic_add_negative(-1, &event->profile_count))
-                return;
-        event->profile_disable();
-        if (!--total_profile_count) {
-                buf = trace_profile_buf;
-                rcu_assign_pointer(trace_profile_buf, NULL);
-                nmi_buf = trace_profile_buf_nmi;
-                rcu_assign_pointer(trace_profile_buf_nmi, NULL);
-                /*
-                 * Ensure every events in profiling have finished before
-                 * releasing the buffers
-                 */
-                synchronize_sched();
-                free_percpu(buf);
-                free_percpu(nmi_buf);
-        }
-}
-void ftrace_profile_disable(int event_id)
-{
-        struct ftrace_event_call *event;
-        mutex_lock(&event_mutex);
-        list_for_each_entry(event, &ftrace_events, list) {
-                if (event->id == event_id) {
-                        ftrace_profile_disable_event(event);
-                        module_put(event->mod);
-                        break;
-                }
-        }
-        mutex_unlock(&event_mutex);
-}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d128f65778e6..c697c7043349 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,6 +15,7 @@
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
+#include <linux/slab.h>
 #include <linux/delay.h>
 #include <asm/setup.h>
@@ -60,10 +61,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
        return 0;
 err:
-        if (field) {
+        if (field)
                kfree(field->name);
-                kfree(field->type);
-        }
        kfree(field);
        return -ENOMEM;
@@ -78,7 +77,7 @@ EXPORT_SYMBOL_GPL(trace_define_field);
        if (ret)                                                        \
                return ret;
-int trace_define_common_fields(struct ftrace_event_call *call)
+static int trace_define_common_fields(struct ftrace_event_call *call)
 {
        int ret;
        struct trace_entry ent;
@@ -91,11 +90,8 @@ int trace_define_common_fields(struct ftrace_event_call *call)
        return ret;
 }
-EXPORT_SYMBOL_GPL(trace_define_common_fields);
-#ifdef CONFIG_MODULES
-static void trace_destroy_fields(struct ftrace_event_call *call)
+void trace_destroy_fields(struct ftrace_event_call *call)
 {
        struct ftrace_event_field *field, *next;
@@ -107,27 +103,49 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
        }
 }
-#endif /* CONFIG_MODULES */
+int trace_event_raw_init(struct ftrace_event_call *call)
+{
+        int id;
-static void ftrace_event_enable_disable(struct ftrace_event_call *call,
+        id = register_ftrace_event(call->event);
+        if (!id)
+                return -ENODEV;
+        call->id = id;
+        INIT_LIST_HEAD(&call->fields);
+        return 0;
+}
+EXPORT_SYMBOL_GPL(trace_event_raw_init);
+static int ftrace_event_enable_disable(struct ftrace_event_call *call,
                                        int enable)
 {
+        int ret = 0;
        switch (enable) {
        case 0:
                if (call->enabled) {
                        call->enabled = 0;
                        tracing_stop_cmdline_record();
-                        call->unregfunc(call->data);
+                        call->unregfunc(call);
                }
                break;
        case 1:
                if (!call->enabled) {
-                        call->enabled = 1;
                        tracing_start_cmdline_record();
-                        call->regfunc(call->data);
+                        ret = call->regfunc(call);
+                        if (ret) {
+                                tracing_stop_cmdline_record();
+                                pr_info("event trace: Could not enable event "
+                                        "%s\n", call->name);
+                                break;
+                        }
+                        call->enabled = 1;
                }
                break;
        }
+        return ret;
 }
 static void ftrace_clear_events(void)
@@ -406,7 +424,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
        case 0:
        case 1:
                mutex_lock(&event_mutex);
-                ftrace_event_enable_disable(call, val);
+                ret = ftrace_event_enable_disable(call, val);
                mutex_unlock(&event_mutex);
                break;
@@ -416,7 +434,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
        *ppos += cnt;
-        return cnt;
+        return ret ? ret : cnt;
 }
 static ssize_t
@@ -501,41 +519,16 @@ out:
        return ret;
 }
-extern char *__bad_type_size(void);
-#undef FIELD
-#define FIELD(type, name)                                               \
-        sizeof(type) != sizeof(field.name) ? __bad_type_size() :        \
-        #type, "common_" #name, offsetof(typeof(field), name),          \
-                sizeof(field.name)
-static int trace_write_header(struct trace_seq *s)
-{
-        struct trace_entry field;
-        /* struct trace_entry */
-        return trace_seq_printf(s,
-                                "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-                                "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-                                "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-                                "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-                                "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-                                "\n",
-                                FIELD(unsigned short, type),
-                                FIELD(unsigned char, flags),
-                                FIELD(unsigned char, preempt_count),
-                                FIELD(int, pid),
-                                FIELD(int, lock_depth));
-}
 static ssize_t
 event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
                  loff_t *ppos)
 {
        struct ftrace_event_call *call = filp->private_data;
+        struct ftrace_event_field *field;
        struct trace_seq *s;
+        int common_field_count = 5;
        char *buf;
-        int r;
+        int r = 0;
        if (*ppos)
                return 0;
@@ -546,14 +539,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
        trace_seq_init(s);
-        /* If any of the first writes fail, so will the show_format. */
        trace_seq_printf(s, "name: %s\n", call->name);
        trace_seq_printf(s, "ID: %d\n", call->id);
        trace_seq_printf(s, "format:\n");
-        trace_write_header(s);
-        r = call->show_format(call, s);
+        list_for_each_entry_reverse(field, &call->fields, link) {
+                /*
+                 * Smartly show the array type (except for dynamic arrays).
+                 * Normal:
+                 *      field:TYPE VAR
+                 * If TYPE := TYPE[LEN], it is shown:
+                 *      field:TYPE VAR[LEN]
+                 */
+                const char *array_descriptor = strchr(field->type, '[');
+                if (!strncmp(field->type, "__data_loc", 10))
+                        array_descriptor = NULL;
+                if (!array_descriptor) {
+                        r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
+                                        "\tsize:%u;\tsigned:%d;\n",
+                                        field->type, field->name, field->offset,
+                                        field->size, !!field->is_signed);
+                } else {
+                        r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
+                                        "\tsize:%u;\tsigned:%d;\n",
+                                        (int)(array_descriptor - field->type),
+                                        field->type, field->name,
+                                        array_descriptor, field->offset,
+                                        field->size, !!field->is_signed);
+                }
+                if (--common_field_count == 0)
+                        r = trace_seq_printf(s, "\n");
+                if (!r)
+                        break;
+        }
+        if (r)
+                r = trace_seq_printf(s, "\nprint fmt: %s\n",
+                                call->print_fmt);
        if (!r) {
                /*
                 * ug!  The format output is bigger than a PAGE!!
@@ -878,9 +905,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
                           "'%s/filter' entry\n", name);
        }
-        entry = trace_create_file("enable", 0644, system->entry,
+        trace_create_file("enable", 0644, system->entry,
-                                  (void *)system->name,
+                          (void *)system->name,
-                                  &ftrace_system_enable_fops);
+                          &ftrace_system_enable_fops);
        return system->entry;
 }
@@ -892,7 +919,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
                 const struct file_operations *filter,
                 const struct file_operations *format)
 {
-        struct dentry *entry;
        int ret;
        /*
@@ -910,55 +936,72 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
        }
        if (call->regfunc)
-                entry = trace_create_file("enable", 0644, call->dir, call,
+                trace_create_file("enable", 0644, call->dir, call,
-                                          enable);
+                                  enable);
-        if (call->id && call->profile_enable)
+        if (call->id && call->perf_event_enable)
-                entry = trace_create_file("id", 0444, call->dir, call,
+                trace_create_file("id", 0444, call->dir, call,
-                                          id);
+                                  id);
        if (call->define_fields) {
-                ret = call->define_fields(call);
+                ret = trace_define_common_fields(call);
+                if (!ret)
+                        ret = call->define_fields(call);
                if (ret < 0) {
                        pr_warning("Could not initialize trace point"
                                   " events/%s\n", call->name);
                        return ret;
                }
-                entry = trace_create_file("filter", 0644, call->dir, call,
+                trace_create_file("filter", 0644, call->dir, call,
-                                          filter);
+                                  filter);
        }
-        /* A trace may not want to export its format */
+        trace_create_file("format", 0444, call->dir, call,
-        if (!call->show_format)
+                          format);
-                return 0;
-        entry = trace_create_file("format", 0444, call->dir, call,
-                                  format);
        return 0;
 }
-#define for_each_event(event, start, end)                       \
+/*
+ * Register one event call: run its raw_init hook (if it has one), create
+ * its directory and control files below the events directory, and link it
+ * onto ftrace_events only when that creation succeeded.
+ *
+ * Returns 0 on success, -EINVAL for a call without a name, -ENOENT when
+ * the events directory is not available, or the error reported by
+ * raw_init() / event_create_dir().
+ */
+static int __trace_add_event_call(struct ftrace_event_call *call)
-        for (event = start;                                     \
+{
-             (unsigned long)event < (unsigned long)end;         \
+        struct dentry *d_events;
-             event++)
+        int ret;
-#ifdef CONFIG_MODULES
+        if (!call->name)
+                return -EINVAL;
-static LIST_HEAD(ftrace_module_file_list);
+        if (call->raw_init) {
+                ret = call->raw_init(call);
+                if (ret < 0) {
+                        /* -ENOSYS is propagated silently, anything else is logged */
+                        if (ret != -ENOSYS)
+                                pr_warning("Could not initialize trace "
+                                "events/%s\n", call->name);
+                        return ret;
+                }
+        }
-/*
+        d_events = event_trace_events_dir();
- * Modules must own their file_operations to keep up with
+        if (!d_events)
- * reference counting.
+                return -ENOENT;
- */
-struct ftrace_module_file_ops {
+        ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
-        struct list_head                list;
+                                &ftrace_enable_fops, &ftrace_event_filter_fops,
-        struct module                   *mod;
+                                &ftrace_event_format_fops);
-        struct file_operations          id;
+        if (!ret)
-        struct file_operations          enable;
+                list_add(&call->list, &ftrace_events);
-        struct file_operations          format;
-        struct file_operations          filter;
+        return ret;
-};
+}
+/*
+ * Add an additional event_call dynamically.
+ *
+ * Holds event_mutex around __trace_add_event_call() and returns its
+ * result unchanged (0 on success, negative errno otherwise).
+ */
+int trace_add_event_call(struct ftrace_event_call *call)
+{
+        int ret;
+        mutex_lock(&event_mutex);
+        ret = __trace_add_event_call(call);
+        mutex_unlock(&event_mutex);
+        return ret;
+}
 static void remove_subsystem_dir(const char *name)
 {
@@ -986,6 +1029,53 @@ static void remove_subsystem_dir(const char *name)
        }
 }
+/*
+ * Tear down one event call: switch it off, unregister its output event
+ * when it has one, remove its debugfs directory, unlink it from the
+ * event list, then release its fields and filter predicates and finally
+ * drop the subsystem directory reference for call->system.
+ *
+ * Must be called with both event_mutex and trace_event_mutex held.
+ */
+static void __trace_remove_event_call(struct ftrace_event_call *call)
+{
+        ftrace_event_enable_disable(call, 0);
+        if (call->event)
+                __unregister_ftrace_event(call->event);
+        debugfs_remove_recursive(call->dir);
+        list_del(&call->list);
+        trace_destroy_fields(call);
+        destroy_preds(call);
+        remove_subsystem_dir(call->system);
+}
+/*
+ * Remove an event_call.
+ *
+ * Takes event_mutex first and then trace_event_mutex for writing, which
+ * are the two locks __trace_remove_event_call() requires, and releases
+ * them in the reverse order.
+ */
+void trace_remove_event_call(struct ftrace_event_call *call)
+{
+        mutex_lock(&event_mutex);
+        down_write(&trace_event_mutex);
+        __trace_remove_event_call(call);
+        up_write(&trace_event_mutex);
+        mutex_unlock(&event_mutex);
+}
+/*
+ * Walk @event over the half-open range [@start, @end), advancing one
+ * element at a time; the bounds are compared as unsigned long addresses.
+ */
+#define for_each_event(event, start, end)                       \
+        for (event = start;                                     \
+             (unsigned long)event < (unsigned long)end;         \
+             event++)
+#ifdef CONFIG_MODULES
+static LIST_HEAD(ftrace_module_file_list);
+/*
+ * Modules must own their file_operations to keep up with
+ * reference counting.
+ *
+ * The id/enable/format/filter members are the file_operations handed to
+ * event_create_dir() for the events of module @mod.
+ */
+struct ftrace_module_file_ops {
+        struct list_head                list;
+        struct module                   *mod;
+        struct file_operations          id;
+        struct file_operations          enable;
+        struct file_operations          format;
+        struct file_operations          filter;
+};
 static struct ftrace_module_file_ops *
 trace_create_file_ops(struct module *mod)
 {
@@ -1043,7 +1133,7 @@ static void trace_module_add_events(struct module *mod)
                if (!call->name)
                        continue;
                if (call->raw_init) {
-                        ret = call->raw_init();
+                        ret = call->raw_init(call);
                        if (ret < 0) {
                                if (ret != -ENOSYS)
                                        pr_warning("Could not initialize trace "
@@ -1061,10 +1151,11 @@ static void trace_module_add_events(struct module *mod)
                                return;
                }
                call->mod = mod;
-                list_add(&call->list, &ftrace_events);
+                ret = event_create_dir(call, d_events,
-                event_create_dir(call, d_events,
+                                       &file_ops->id, &file_ops->enable,
-                                 &file_ops->id, &file_ops->enable,
+                                       &file_ops->filter, &file_ops->format);
-                                 &file_ops->filter, &file_ops->format);
+                if (!ret)
+                        list_add(&call->list, &ftrace_events);
        }
 }
@@ -1078,14 +1169,7 @@ static void trace_module_remove_events(struct module *mod)
        list_for_each_entry_safe(call, p, &ftrace_events, list) {
                if (call->mod == mod) {
                        found = true;
-                        ftrace_event_enable_disable(call, 0);
+                        __trace_remove_event_call(call);
-                        if (call->event)
-                                __unregister_ftrace_event(call->event);
-                        debugfs_remove_recursive(call->dir);
-                        list_del(&call->list);
-                        trace_destroy_fields(call);
-                        destroy_preds(call);
-                        remove_subsystem_dir(call->system);
                }
        }
@@ -1203,7 +1287,7 @@ static __init int event_trace_init(void)
                if (!call->name)
                        continue;
                if (call->raw_init) {
-                        ret = call->raw_init();
+                        ret = call->raw_init(call);
                        if (ret < 0) {
                                if (ret != -ENOSYS)
                                        pr_warning("Could not initialize trace "
@@ -1211,10 +1295,12 @@ static __init int event_trace_init(void)
                                continue;
                        }
                }
-                list_add(&call->list, &ftrace_events);
+                ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
-                event_create_dir(call, d_events, &ftrace_event_id_fops,
+                                       &ftrace_enable_fops,
-                                 &ftrace_enable_fops, &ftrace_event_filter_fops,
+                                       &ftrace_event_filter_fops,
-                                 &ftrace_event_format_fops);
+                                       &ftrace_event_format_fops);
+                if (!ret)
+                        list_add(&call->list, &ftrace_events);
        }
        while (true) {
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 98a6cc5c64ed..88c0b6dbd7fe 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -18,11 +18,11 @@
 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
 */
-#include <linux/debugfs.h>
-#include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/mutex.h>
+#include <linux/perf_event.h>
+#include <linux/slab.h>
 #include "trace.h"
 #include "trace_output.h"
@@ -31,6 +31,7 @@ enum filter_op_ids
 {
        OP_OR,
        OP_AND,
+        OP_GLOB,
        OP_NE,
        OP_EQ,
        OP_LT,
@@ -48,16 +49,17 @@ struct filter_op {
 };
 static struct filter_op filter_ops[] = {
-        { OP_OR, "||", 1 },
+        { OP_OR,        "||",           1 },
-        { OP_AND, "&&", 2 },
+        { OP_AND,       "&&",           2 },
-        { OP_NE, "!=", 4 },
+        { OP_GLOB,      "~",            4 },
-        { OP_EQ, "==", 4 },
+        { OP_NE,        "!=",           4 },
-        { OP_LT, "<", 5 },
+        { OP_EQ,        "==",           4 },
-        { OP_LE, "<=", 5 },
+        { OP_LT,        "<",            5 },
-        { OP_GT, ">", 5 },
+        { OP_LE,        "<=",           5 },
-        { OP_GE, ">=", 5 },
+        { OP_GT,        ">",            5 },
-        { OP_NONE, "OP_NONE", 0 },
+        { OP_GE,        ">=",           5 },
-        { OP_OPEN_PAREN, "(", 0 },
+        { OP_NONE,      "OP_NONE",      0 },
+        { OP_OPEN_PAREN, "(",           0 },
 };
 enum {
@@ -197,9 +199,9 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
        char *addr = (char *)(event + pred->offset);
        int cmp, match;
-        cmp = strncmp(addr, pred->str_val, pred->str_len);
+        cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len);
-        match = (!cmp) ^ pred->not;
+        match = cmp ^ pred->not;
        return match;
 }
@@ -210,10 +212,11 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
 {
        char **addr = (char **)(event + pred->offset);
        int cmp, match;
+        int len = strlen(*addr) + 1;    /* including tailing '\0' */
-        cmp = strncmp(*addr, pred->str_val, pred->str_len);
+        cmp = pred->regex.match(*addr, &pred->regex, len);
-        match = (!cmp) ^ pred->not;
+        match = cmp ^ pred->not;
        return match;
 }
@@ -237,9 +240,9 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
        char *addr = (char *)(event + str_loc);
        int cmp, match;
-        cmp = strncmp(addr, pred->str_val, str_len);
+        cmp = pred->regex.match(addr, &pred->regex, str_len);
-        match = (!cmp) ^ pred->not;
+        match = cmp ^ pred->not;
        return match;
 }
@@ -250,10 +253,133 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
        return 0;
 }
+/*
+ * regex_match_foo - basic pattern-match callbacks
+ *
+ * @str: the string to be searched
+ * @r:   the regex structure holding the pattern string
+ * @len: the length of the string to be searched (including '\0')
+ *
+ * Each callback returns 1 on a match and 0 otherwise.
+ *
+ * Note:
+ * - @str might not be NUL-terminated if it's of type DYN_STRING
+ *   or STATIC_STRING
+ */
+static int regex_match_full(char *str, struct regex *r, int len)
+{
+        /* whole-string comparison, bounded by the caller's length */
+        return strncmp(str, r->pattern, len) == 0;
+}
+/*
+ * Match when @str begins with the pattern held in @r.
+ * Returns 1 on a match, 0 otherwise.
+ */
+static int regex_match_front(char *str, struct regex *r, int len)
+{
+        /*
+         * @str may lack a terminating '\0', so a pattern longer than the
+         * @len bytes available can never match and must not be compared:
+         * doing so would read past the end of the field.
+         */
+        if (len < r->len)
+                return 0;
+        if (strncmp(str, r->pattern, r->len) == 0)
+                return 1;
+        return 0;
+}
+/* Match when the pattern occurs anywhere in the first @len bytes of @str. */
+static int regex_match_middle(char *str, struct regex *r, int len)
+{
+        return strnstr(str, r->pattern, len) != NULL;
+}
+/* Match when @str, taken as @len - 1 characters, ends with the pattern. */
+static int regex_match_end(char *str, struct regex *r, int len)
+{
+        int body_len = len - 1;         /* @len counts the trailing '\0' */
+        if (body_len < r->len)
+                return 0;
+        return memcmp(str + body_len - r->len, r->pattern, r->len) == 0;
+}
+/**
+ * filter_parse_regex - parse a basic glob-style pattern
+ * @buff:   the raw pattern; modified in place
+ * @len:    length of the pattern
+ * @search: set to the start of the literal text to compare against
+ * @not:    set to 1 if @buff began with '!', 0 otherwise
+ *
+ * A leading '!' is skipped and reported through @not. A '*' at the very
+ * start selects an end-only match and moves @search past it. The first
+ * '*' found anywhere later is overwritten with '\0', ends the scan and
+ * selects a front-only match, or a middle-only match when a leading '*'
+ * was seen as well. Without any '*' the type stays MATCH_FULL.
+ *
+ * Returns the enum regex_type describing the kind of match.
+ */
+enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
+{
+        int type = MATCH_FULL;
+        int pos;
+        *not = (buff[0] == '!');
+        if (*not) {
+                buff++;
+                len--;
+        }
+        *search = buff;
+        for (pos = 0; pos < len; pos++) {
+                if (buff[pos] != '*')
+                        continue;
+                if (pos == 0) {
+                        /* leading wildcard: compare text starts after it */
+                        *search = buff + 1;
+                        type = MATCH_END_ONLY;
+                        continue;
+                }
+                /* later wildcard: cut the pattern here and stop scanning */
+                type = (type == MATCH_END_ONLY) ? MATCH_MIDDLE_ONLY
+                                                : MATCH_FRONT_ONLY;
+                buff[pos] = 0;
+                break;
+        }
+        return type;
+}
+/*
+ * Pick the match callback for the string predicate @pred.
+ *
+ * Only OP_GLOB patterns are parsed for wildcards and a leading '!';
+ * every other operator keeps the pattern as is and uses a full match.
+ * For OP_GLOB the literal part is moved to the front of the pattern
+ * buffer and a leading '!' toggles pred->not.
+ */
+static void filter_build_regex(struct filter_pred *pred)
+{
+        struct regex *re = &pred->regex;
+        enum regex_type kind = MATCH_FULL;
+        char *start;
+        int invert = 0;
+        if (pred->op == OP_GLOB) {
+                kind = filter_parse_regex(re->pattern, re->len,
+                                          &start, &invert);
+                re->len = strlen(start);
+                /* start points into re->pattern, so the regions overlap */
+                memmove(re->pattern, start, re->len + 1);
+        }
+        pred->not ^= invert;
+        switch (kind) {
+        case MATCH_END_ONLY:
+                re->match = regex_match_end;
+                break;
+        case MATCH_MIDDLE_ONLY:
+                re->match = regex_match_middle;
+                break;
+        case MATCH_FRONT_ONLY:
+                re->match = regex_match_front;
+                break;
+        case MATCH_FULL:
+                re->match = regex_match_full;
+                break;
+        }
+}
 /* return 1 if event matches, 0 otherwise (discard) */
-int filter_match_preds(struct ftrace_event_call *call, void *rec)
+int filter_match_preds(struct event_filter *filter, void *rec)
 {
-        struct event_filter *filter = call->filter;
        int match, top = 0, val1 = 0, val2 = 0;
        int stack[MAX_FILTER_PRED];
        struct filter_pred *pred;
@@ -396,7 +522,7 @@ static void filter_clear_pred(struct filter_pred *pred)
 {
        kfree(pred->field_name);
        pred->field_name = NULL;
-        pred->str_len = 0;
+        pred->regex.len = 0;
 }
 static int filter_set_pred(struct filter_pred *dest,
@@ -426,9 +552,8 @@ static void filter_disable_preds(struct ftrace_event_call *call)
                filter->preds[i]->fn = filter_pred_none;
 }
-void destroy_preds(struct ftrace_event_call *call)
+static void __free_preds(struct event_filter *filter)
 {
-        struct event_filter *filter = call->filter;
        int i;
        if (!filter)
@@ -441,21 +566,24 @@ void destroy_preds(struct ftrace_event_call *call)
        kfree(filter->preds);
        kfree(filter->filter_string);
        kfree(filter);
+}
+/*
+ * Free the filter attached to @call, then clear both the pointer and the
+ * filter_active flag so the event no longer refers to the freed filter.
+ */
+void destroy_preds(struct ftrace_event_call *call)
+{
+        __free_preds(call->filter);
         call->filter = NULL;
+        call->filter_active = 0;
 }
-static int init_preds(struct ftrace_event_call *call)
+static struct event_filter *__alloc_preds(void)
 {
        struct event_filter *filter;
        struct filter_pred *pred;
        int i;
-        if (call->filter)
+        filter = kzalloc(sizeof(*filter), GFP_KERNEL);
-                return 0;
+        if (!filter)
+                return ERR_PTR(-ENOMEM);
-        filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
-        if (!call->filter)
-                return -ENOMEM;
        filter->n_preds = 0;
@@ -471,12 +599,24 @@ static int init_preds(struct ftrace_event_call *call)
                filter->preds[i] = pred;
        }
-        return 0;
+        return filter;
 oom:
-        destroy_preds(call);
+        __free_preds(filter);
+        return ERR_PTR(-ENOMEM);
+}
+/*
+ * Make sure @call has a filter allocated.
+ *
+ * Returns 0 if a filter already exists or was allocated, otherwise the
+ * negative errno from __alloc_preds(). On failure call->filter is left
+ * NULL.
+ */
+static int init_preds(struct ftrace_event_call *call)
+{
+        struct event_filter *filter;
+        if (call->filter)
+                return 0;
-        return -ENOMEM;
+        call->filter_active = 0;
+        filter = __alloc_preds();
+        /*
+         * Never publish an ERR_PTR in call->filter: the non-NULL check
+         * above and the other users of call->filter would treat it as a
+         * valid filter and dereference it.
+         */
+        if (IS_ERR(filter))
+                return PTR_ERR(filter);
+        call->filter = filter;
+        return 0;
 }
 static int init_subsystem_preds(struct event_subsystem *system)
@@ -499,14 +639,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
        return 0;
 }
-enum {
+static void filter_free_subsystem_preds(struct event_subsystem *system)
-        FILTER_DISABLE_ALL,
-        FILTER_INIT_NO_RESET,
-        FILTER_SKIP_NO_RESET,
-};
-static void filter_free_subsystem_preds(struct event_subsystem *system,
-                                        int flag)
 {
        struct ftrace_event_call *call;
@@ -517,14 +650,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
                if (strcmp(call->system, system->name) != 0)
                        continue;
-                if (flag == FILTER_INIT_NO_RESET) {
-                        call->filter->no_reset = false;
-                        continue;
-                }
-                if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
-                        continue;
                filter_disable_preds(call);
                remove_filter_string(call->filter);
        }
@@ -532,10 +657,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
 static int filter_add_pred_fn(struct filter_parse_state *ps,
                              struct ftrace_event_call *call,
+                              struct event_filter *filter,
                              struct filter_pred *pred,
                              filter_pred_fn_t fn)
 {
-        struct event_filter *filter = call->filter;
        int idx, err;
        if (filter->n_preds == MAX_FILTER_PRED) {
@@ -550,7 +675,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
                return err;
        filter->n_preds++;
-        call->filter_active = 1;
        return 0;
 }
@@ -575,7 +699,10 @@ static bool is_string_field(struct ftrace_event_field *field)
 static int is_legal_op(struct ftrace_event_field *field, int op)
 {
-        if (is_string_field(field) && (op != OP_EQ && op != OP_NE))
+        if (is_string_field(field) &&
+            (op != OP_EQ && op != OP_NE && op != OP_GLOB))
+                return 0;
+        if (!is_string_field(field) && op == OP_GLOB)
                return 0;
        return 1;
@@ -626,6 +753,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
 static int filter_add_pred(struct filter_parse_state *ps,
                           struct ftrace_event_call *call,
+                           struct event_filter *filter,
                           struct filter_pred *pred,
                           bool dry_run)
 {
@@ -660,21 +788,20 @@ static int filter_add_pred(struct filter_parse_state *ps,
        }
        if (is_string_field(field)) {
-                pred->str_len = field->size;
+                filter_build_regex(pred);
-                if (field->filter_type == FILTER_STATIC_STRING)
+                if (field->filter_type == FILTER_STATIC_STRING) {
                        fn = filter_pred_string;
-                else if (field->filter_type == FILTER_DYN_STRING)
+                        pred->regex.field_len = field->size;
+                } else if (field->filter_type == FILTER_DYN_STRING)
                        fn = filter_pred_strloc;
-                else {
+                else
                        fn = filter_pred_pchar;
-                        pred->str_len = strlen(pred->str_val);
-                }
        } else {
                if (field->is_signed)
-                        ret = strict_strtoll(pred->str_val, 0, &val);
+                        ret = strict_strtoll(pred->regex.pattern, 0, &val);
                else
-                        ret = strict_strtoull(pred->str_val, 0, &val);
+                        ret = strict_strtoull(pred->regex.pattern, 0, &val);
                if (ret) {
                        parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
                        return -EINVAL;
@@ -694,45 +821,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
 add_pred_fn:
        if (!dry_run)
-                return filter_add_pred_fn(ps, call, pred, fn);
+                return filter_add_pred_fn(ps, call, filter, pred, fn);
-        return 0;
-}
-static int filter_add_subsystem_pred(struct filter_parse_state *ps,
-                                     struct event_subsystem *system,
-                                     struct filter_pred *pred,
-                                     char *filter_string,
-                                     bool dry_run)
-{
-        struct ftrace_event_call *call;
-        int err = 0;
-        bool fail = true;
-        list_for_each_entry(call, &ftrace_events, list) {
-                if (!call->define_fields)
-                        continue;
-                if (strcmp(call->system, system->name))
-                        continue;
-                if (call->filter->no_reset)
-                        continue;
-                err = filter_add_pred(ps, call, pred, dry_run);
-                if (err)
-                        call->filter->no_reset = true;
-                else
-                        fail = false;
-                if (!dry_run)
-                        replace_filter_string(call->filter, filter_string);
-        }
-        if (fail) {
-                parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
-                return err;
-        }
        return 0;
 }
@@ -1045,8 +1134,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
                return NULL;
        }
-        strcpy(pred->str_val, operand2);
+        strcpy(pred->regex.pattern, operand2);
-        pred->str_len = strlen(operand2);
+        pred->regex.len = strlen(pred->regex.pattern);
        pred->op = op;
@@ -1090,8 +1179,8 @@ static int check_preds(struct filter_parse_state *ps)
        return 0;
 }
-static int replace_preds(struct event_subsystem *system,
+static int replace_preds(struct ftrace_event_call *call,
-                         struct ftrace_event_call *call,
+                         struct event_filter *filter,
                         struct filter_parse_state *ps,
                         char *filter_string,
                         bool dry_run)
@@ -1138,11 +1227,7 @@ static int replace_preds(struct event_subsystem *system,
 add_pred:
                if (!pred)
                        return -ENOMEM;
-                if (call)
+                err = filter_add_pred(ps, call, filter, pred, dry_run);
-                        err = filter_add_pred(ps, call, pred, false);
-                else
-                        err = filter_add_subsystem_pred(ps, system, pred,
-                                                filter_string, dry_run);
                filter_free_pred(pred);
                if (err)
                        return err;
@@ -1153,10 +1238,50 @@ add_pred:
        return 0;
 }
-int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+/*
+ * Apply @filter_string to every event of @system that defines fields.
+ *
+ * Each event is first checked with a dry run; events that cannot take
+ * the filter are skipped. For the others the old predicates are disabled
+ * and the new ones installed. Returns 0 as soon as at least one event
+ * passed the dry run; otherwise records FILT_ERR_BAD_SUBSYS_FILTER in
+ * @ps and returns -EINVAL.
+ */
+static int replace_system_preds(struct event_subsystem *system,
+                                struct filter_parse_state *ps,
+                                char *filter_string)
 {
+        struct ftrace_event_call *call;
+        bool fail = true;
         int err;
+        list_for_each_entry(call, &ftrace_events, list) {
+                struct event_filter *filter = call->filter;
+                if (!call->define_fields)
+                        continue;
+                if (strcmp(call->system, system->name) != 0)
+                        continue;
+                /* try to see if the filter can be applied */
+                err = replace_preds(call, filter, ps, filter_string, true);
+                if (err)
+                        continue;
+                /* really apply the filter */
+                filter_disable_preds(call);
+                err = replace_preds(call, filter, ps, filter_string, false);
+                if (err)
+                        filter_disable_preds(call);
+                else {
+                        call->filter_active = 1;
+                        replace_filter_string(filter, filter_string);
+                }
+                /* counts as applied once the dry run succeeded */
+                fail = false;
+        }
+        if (fail) {
+                parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+                return -EINVAL;
+        }
+        return 0;
+}
+int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+{
+        int err;
        struct filter_parse_state *ps;
        mutex_lock(&event_mutex);
@@ -1168,8 +1293,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
        if (!strcmp(strstrip(filter_string), "0")) {
                filter_disable_preds(call);
                remove_filter_string(call->filter);
-                mutex_unlock(&event_mutex);
+                goto out_unlock;
-                return 0;
        }
        err = -ENOMEM;
@@ -1187,10 +1311,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
                goto out;
        }
-        err = replace_preds(NULL, call, ps, filter_string, false);
+        err = replace_preds(call, call->filter, ps, filter_string, false);
        if (err)
                append_filter_err(ps, call->filter);
+        else
+                call->filter_active = 1;
 out:
        filter_opstack_clear(ps);
        postfix_clear(ps);
@@ -1205,7 +1330,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
                                 char *filter_string)
 {
        int err;
        struct filter_parse_state *ps;
        mutex_lock(&event_mutex);
@@ -1215,10 +1339,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
                goto out_unlock;
        if (!strcmp(strstrip(filter_string), "0")) {
-                filter_free_subsystem_preds(system, FILTER_DISABLE_ALL);
+                filter_free_subsystem_preds(system);
                remove_filter_string(system->filter);
-                mutex_unlock(&event_mutex);
+                goto out_unlock;
-                return 0;
        }
        err = -ENOMEM;
@@ -1235,31 +1358,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
                goto out;
        }
-        filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);
+        err = replace_system_preds(system, ps, filter_string);
+        if (err)
-        /* try to see the filter can be applied to which events */
-        err = replace_preds(system, NULL, ps, filter_string, true);
-        if (err) {
                append_filter_err(ps, system->filter);
-                goto out;
+out:
+        filter_opstack_clear(ps);
+        postfix_clear(ps);
+        kfree(ps);
+out_unlock:
+        mutex_unlock(&event_mutex);
+        return err;
+}
+#ifdef CONFIG_PERF_EVENTS
+/*
+ * Release the filter owned by perf event @event. The pointer is cleared
+ * on the event before the filter memory itself is freed.
+ */
+void ftrace_profile_free_filter(struct perf_event *event)
+{
+        struct event_filter *old = event->filter;
+        event->filter = NULL;
+        __free_preds(old);
+}
+/*
+ * Attach a filter built from @filter_str to perf event @event.
+ *
+ * @event_id selects the ftrace event call whose fields the filter is
+ * checked against. Returns 0 on success, -EINVAL if no event call has
+ * that id, -EEXIST if @event already has a filter, -ENOMEM on allocation
+ * failure, or the error from parsing / installing the predicates. On any
+ * failure the newly allocated filter is freed and event->filter is left
+ * untouched.
+ */
+int ftrace_profile_set_filter(struct perf_event *event, int event_id,
+                              char *filter_str)
+{
+        int err;
+        struct event_filter *filter;
+        struct filter_parse_state *ps;
+        struct ftrace_event_call *call = NULL;
+        mutex_lock(&event_mutex);
+        list_for_each_entry(call, &ftrace_events, list) {
+                if (call->id == event_id)
+                        break;
         }
-        filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
+        err = -EINVAL;
+        /*
+         * list_for_each_entry() never leaves the cursor NULL: when the
+         * loop runs to completion the cursor's list member is the list
+         * head itself, which is how "not found" is detected here.
+         */
+        if (&call->list == &ftrace_events)
+                goto out_unlock;
+        err = -EEXIST;
+        if (event->filter)
+                goto out_unlock;
-        /* really apply the filter to the events */
+        filter = __alloc_preds();
-        err = replace_preds(system, NULL, ps, filter_string, false);
+        if (IS_ERR(filter)) {
-        if (err) {
+                err = PTR_ERR(filter);
-                append_filter_err(ps, system->filter);
+                goto out_unlock;
-                filter_free_subsystem_preds(system, 2);
         }
-out:
+        err = -ENOMEM;
+        ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+        if (!ps)
+                goto free_preds;
+        parse_init(ps, filter_ops, filter_str);
+        err = filter_parse(ps);
+        if (err)
+                goto free_ps;
+        err = replace_preds(call, filter, ps, filter_str, false);
+        if (!err)
+                event->filter = filter;
+free_ps:
         filter_opstack_clear(ps);
         postfix_clear(ps);
         kfree(ps);
+free_preds:
+        if (err)
+                __free_preds(filter);
 out_unlock:
         mutex_unlock(&event_mutex);
         return err;
 }
+#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 9753fcc61bc5..e091f64ba6ce 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -48,11 +48,11 @@
 struct ____ftrace_##name {                                      \
        tstruct                                                 \
 };                                                              \
-static void __used ____ftrace_check_##name(void)                \
+static void __always_unused ____ftrace_check_##name(void)       \
 {                                                               \
        struct ____ftrace_##name *__entry = NULL;               \
                                                                \
-        /* force cmpile-time check on F_printk() */             \
+        /* force compile-time check on F_printk() */            \
        printk(print);                                          \
 }
@@ -62,76 +62,6 @@ static void __used ____ftrace_check_##name(void)		\
 #include "trace_entries.h"
-#undef __field
-#define __field(type, item)                                             \
-        ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"      \
-                               "offset:%zu;\tsize:%zu;\n",              \
-                               offsetof(typeof(field), item),           \
-                               sizeof(field.item));                     \
-        if (!ret)                                                       \
-                return 0;
-#undef __field_desc
-#define __field_desc(type, container, item)                             \
-        ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"      \
-                               "offset:%zu;\tsize:%zu;\n",              \
-                               offsetof(typeof(field), container.item), \
-                               sizeof(field.container.item));           \
-        if (!ret)                                                       \
-                return 0;
-#undef __array
-#define __array(type, item, len)                                        \
-        ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
-                               "offset:%zu;\tsize:%zu;\n",              \
-                               offsetof(typeof(field), item),   \
-                               sizeof(field.item));             \
-        if (!ret)                                                       \
-                return 0;
-#undef __array_desc
-#define __array_desc(type, container, item, len)                        \
-        ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
-                               "offset:%zu;\tsize:%zu;\n",              \
-                               offsetof(typeof(field), container.item), \
-                               sizeof(field.container.item));           \
-        if (!ret)                                                       \
-                return 0;
-#undef __dynamic_array
-#define __dynamic_array(type, item)                                     \
-        ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"      \
-                               "offset:%zu;\tsize:0;\n",                \
-                               offsetof(typeof(field), item));          \
-        if (!ret)                                                       \
-                return 0;
-#undef F_printk
-#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
-#undef __entry
-#define __entry REC
-#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)             \
-static int                                                              \
-ftrace_format_##name(struct ftrace_event_call *unused,                  \
-                     struct trace_seq *s)                               \
-{                                                                       \
-        struct struct_name field __attribute__((unused));               \
-        int ret = 0;                                                    \
-                                                                        \
-        tstruct;                                                        \
-                                                                        \
-        trace_seq_printf(s, "\nprint fmt: " print);                     \
-                                                                        \
-        return ret;                                                     \
-}
-#include "trace_entries.h"
 #undef __field
 #define __field(type, item)                                             \
        ret = trace_define_field(event_call, #type, #item,              \
@@ -156,7 +86,8 @@ ftrace_format_##name(struct ftrace_event_call *unused,			\
        BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);                         \
        ret = trace_define_field(event_call, #type "[" #len "]", #item, \
                                 offsetof(typeof(field), item),         \
-                                 sizeof(field.item), 0, FILTER_OTHER);  \
+                                 sizeof(field.item),                    \
+                                 is_signed_type(type), FILTER_OTHER);   \
        if (ret)                                                        \
                return ret;
@@ -166,13 +97,18 @@ ftrace_format_##name(struct ftrace_event_call *unused,			\
        ret = trace_define_field(event_call, #type "[" #len "]", #item, \
                                 offsetof(typeof(field),                \
                                          container.item),              \
-                                 sizeof(field.container.item), 0,       \
+                                 sizeof(field.container.item),          \
-                                 FILTER_OTHER);                         \
+                                 is_signed_type(type), FILTER_OTHER);   \
        if (ret)                                                        \
                return ret;
 #undef __dynamic_array
-#define __dynamic_array(type, item)
+/* Register a dynamic array as a field with a reported size of 0. */
+#define __dynamic_array(type, item)                                     \
+        ret = trace_define_field(event_call, #type, #item,              \
+                                 offsetof(typeof(field), item),         \
+                                 0, is_signed_type(type), FILTER_OTHER);\
+        if (ret)                                                        \
+                return ret;
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(name, struct_name, id, tstruct, print)             \
@@ -182,10 +118,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\
        struct struct_name field;                                       \
        int ret;                                                        \
                                                                        \
-        ret = trace_define_common_fields(event_call);                   \
-        if (ret)                                                        \
-                return ret;                                             \
-                                                                        \
        tstruct;                                                        \
                                                                        \
        return ret;                                                     \
@@ -193,6 +125,14 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\
 #include "trace_entries.h"
+/*
+ * Common .raw_init callback for every FTRACE_ENTRY event: initialise the
+ * event's field list head. Always returns 0.
+ */
+static int ftrace_raw_init_event(struct ftrace_event_call *call)
+{
+        INIT_LIST_HEAD(&call->fields);
+        return 0;
+}
+#undef __entry
+#define __entry REC
 #undef __field
 #define __field(type, item)
@@ -209,9 +149,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\
 #undef __dynamic_array
 #define __dynamic_array(type, item)
+#undef F_printk
+#define F_printk(fmt, args...) #fmt ", "  __stringify(args)
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(call, struct_name, type, tstruct, print)           \
-static int ftrace_raw_init_event_##call(void);                          \
                                                                        \
 struct ftrace_event_call __used                                         \
 __attribute__((__aligned__(4)))                                         \
@@ -219,14 +161,9 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
        .name                   = #call,                                \
        .id                     = type,                                 \
        .system                 = __stringify(TRACE_SYSTEM),            \
-        .raw_init               = ftrace_raw_init_event_##call,         \
+        .raw_init               = ftrace_raw_init_event,                \
-        .show_format            = ftrace_format_##call,                 \
+        .print_fmt              = print,                                \
        .define_fields          = ftrace_define_fields_##call,          \
 };                                                                      \
-static int ftrace_raw_init_event_##call(void)                           \
-{                                                                       \
-        INIT_LIST_HEAD(&event_##call.fields);                           \
-        return 0;                                                       \
-}                                                                       \
 #include "trace_entries.h"
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 45e6c01b2e4d..9aed1a5cf553 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -9,14 +9,27 @@
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include "trace.h"
 #include "trace_output.h"
-struct fgraph_data {
+/* Per-cpu output state, allocated with alloc_percpu() in graph_trace_open(). */
+struct fgraph_cpu_data {
         pid_t           last_pid;
         int             depth;
+        /*
+         * Set by print_graph_function() after it re-emits a saved entry
+         * for this cpu; the next call for this cpu clears it and prints
+         * nothing.
+         */
+        int             ignore;
+        /*
+         * Function entered at each depth, recorded by
+         * print_graph_entry_nested() and compared against the returning
+         * function in print_graph_return().
+         */
+        unsigned long   enter_funcs[FTRACE_RETFUNC_DEPTH];
+};
+/* Per-iterator state, stored in iter->private by graph_trace_open(). */
+struct fgraph_data {
+        struct fgraph_cpu_data          *cpu_data;
+        /* Place to preserve last processed entry. */
+        struct ftrace_graph_ent_entry   ent;
+        struct ftrace_graph_ret_entry   ret;
+        /* Nonzero when the last print_graph_entry() left the seq buffer full. */
+        int                             failed;
+        /* cpu recorded by print_graph_entry() when it set failed. */
+        int                             cpu;
 };
 #define TRACE_GRAPH_INDENT      2
@@ -176,7 +189,7 @@ static int __trace_graph_entry(struct trace_array *tr,
        struct ring_buffer *buffer = tr->buffer;
        struct ftrace_graph_ent_entry *entry;
-        if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+        if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
                return 0;
        event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -201,13 +214,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
        int cpu;
        int pc;
-        if (unlikely(!tr))
-                return 0;
        if (!ftrace_trace_task(current))
                return 0;
-        if (!ftrace_graph_addr(trace->func))
+        /* trace it when it is nested in, or is itself, an enabled function. */
+        if (!(trace->depth || ftrace_graph_addr(trace->func)))
                return 0;
        local_irq_save(flags);
@@ -220,9 +231,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
        } else {
                ret = 0;
        }
-        /* Only do the atomic if it is not already set */
-        if (!test_tsk_trace_graph(current))
-                set_tsk_trace_graph(current);
        atomic_dec(&data->disabled);
        local_irq_restore(flags);
@@ -230,6 +238,14 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
        return ret;
 }
+/*
+ * Entry hook registered by graph_trace_init() when tracing_thresh is set.
+ * While a threshold is active no entry event is recorded here and 1 is
+ * returned; otherwise this defers to trace_graph_entry().
+ */
+int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
+{
+        if (tracing_thresh)
+                return 1;
+        else
+                return trace_graph_entry(trace);
+}
 static void __trace_graph_return(struct trace_array *tr,
                                struct ftrace_graph_ret *trace,
                                unsigned long flags,
@@ -240,7 +256,7 @@ static void __trace_graph_return(struct trace_array *tr,
        struct ring_buffer *buffer = tr->buffer;
        struct ftrace_graph_ret_entry *entry;
-        if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+        if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
                return;
        event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -270,19 +286,39 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
                pc = preempt_count();
                __trace_graph_return(tr, trace, flags, pc);
        }
-        if (!trace->depth)
-                clear_tsk_trace_graph(current);
        atomic_dec(&data->disabled);
        local_irq_restore(flags);
 }
+/*
+ * Point graph_array at @tr, then issue a full memory barrier so the new
+ * value is visible before tracing starts.
+ */
+void set_graph_array(struct trace_array *tr)
+{
+        graph_array = tr;
+        /* Make graph_array visible before we start tracing */
+        smp_mb();
+}
+/*
+ * Return hook registered by graph_trace_init() when tracing_thresh is set.
+ * A return whose duration (rettime - calltime) is below tracing_thresh is
+ * dropped; anything else is passed on to trace_graph_return().
+ */
+void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
+{
+        if (tracing_thresh &&
+            (trace->rettime - trace->calltime < tracing_thresh))
+                return;
+        else
+                trace_graph_return(trace);
+}
 static int graph_trace_init(struct trace_array *tr)
 {
        int ret;
-        graph_array = tr;
+        set_graph_array(tr);
-        ret = register_ftrace_graph(&trace_graph_return,
+        if (tracing_thresh)
-                                    &trace_graph_entry);
+                ret = register_ftrace_graph(&trace_graph_thresh_return,
+                                            &trace_graph_thresh_entry);
+        else
+                ret = register_ftrace_graph(&trace_graph_return,
+                                            &trace_graph_entry);
        if (ret)
                return ret;
        tracing_start_cmdline_record();
@@ -290,11 +326,6 @@ static int graph_trace_init(struct trace_array *tr)
        return 0;
 }
-void set_graph_array(struct trace_array *tr)
-{
-        graph_array = tr;
-}
 static void graph_trace_reset(struct trace_array *tr)
 {
        tracing_stop_cmdline_record();
@@ -384,7 +415,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
        if (!data)
                return TRACE_TYPE_HANDLED;
-        last_pid = &(per_cpu_ptr(data, cpu)->last_pid);
+        last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
        if (*last_pid == pid)
                return TRACE_TYPE_HANDLED;
@@ -435,26 +466,49 @@ static struct ftrace_graph_ret_entry *
 get_return_for_leaf(struct trace_iterator *iter,
                struct ftrace_graph_ent_entry *curr)
 {
-        struct ring_buffer_iter *ring_iter;
+        struct fgraph_data *data = iter->private;
+        struct ring_buffer_iter *ring_iter = NULL;
        struct ring_buffer_event *event;
        struct ftrace_graph_ret_entry *next;
-        ring_iter = iter->buffer_iter[iter->cpu];
+        /*
+         * If the previous output failed to write to the seq buffer,
+         * then we just reuse the data from before.
+         */
+        if (data && data->failed) {
+                curr = &data->ent;
+                next = &data->ret;
+        } else {
-        /* First peek to compare current entry and the next one */
+                ring_iter = iter->buffer_iter[iter->cpu];
-        if (ring_iter)
-                event = ring_buffer_iter_peek(ring_iter, NULL);
+                /* First peek to compare current entry and the next one */
-        else {
+                if (ring_iter)
-        /* We need to consume the current entry to see the next one */
+                        event = ring_buffer_iter_peek(ring_iter, NULL);
-                ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+                else {
-                event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
+                        /*
-                                        NULL);
+                         * We need to consume the current entry to see
-        }
+                         * the next one.
+                         */
+                        ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+                        event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
+                                                 NULL);
+                }
-        if (!event)
+                if (!event)
-                return NULL;
+                        return NULL;
-        next = ring_buffer_event_data(event);
+                next = ring_buffer_event_data(event);
+                if (data) {
+                        /*
+                         * Save current and next entries for later reference
+                         * if the output fails.
+                         */
+                        data->ent = *curr;
+                        data->ret = *next;
+                }
+        }
        if (next->ent.type != TRACE_GRAPH_RET)
                return NULL;
@@ -639,15 +693,21 @@ print_graph_entry_leaf(struct trace_iterator *iter,
        duration = graph_ret->rettime - graph_ret->calltime;
        if (data) {
+                struct fgraph_cpu_data *cpu_data;
                int cpu = iter->cpu;
-                int *depth = &(per_cpu_ptr(data, cpu)->depth);
+                cpu_data = per_cpu_ptr(data->cpu_data, cpu);
                /*
                 * Comments display at + 1 to depth. Since
                 * this is a leaf function, keep the comments
                 * equal to this depth.
                 */
-                *depth = call->depth - 1;
+                cpu_data->depth = call->depth - 1;
+                /* No need to keep this function around for this depth */
+                if (call->depth < FTRACE_RETFUNC_DEPTH)
+                        cpu_data->enter_funcs[call->depth] = 0;
        }
        /* Overhead */
@@ -687,10 +747,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
        int i;
        if (data) {
+                struct fgraph_cpu_data *cpu_data;
                int cpu = iter->cpu;
-                int *depth = &(per_cpu_ptr(data, cpu)->depth);
-                *depth = call->depth;
+                cpu_data = per_cpu_ptr(data->cpu_data, cpu);
+                cpu_data->depth = call->depth;
+                /* Save this function pointer to see if the exit matches */
+                if (call->depth < FTRACE_RETFUNC_DEPTH)
+                        cpu_data->enter_funcs[call->depth] = call->func;
        }
        /* No overhead */
@@ -782,19 +847,34 @@ static enum print_line_t
 print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
                         struct trace_iterator *iter)
 {
-        int cpu = iter->cpu;
+        struct fgraph_data *data = iter->private;
         struct ftrace_graph_ent *call = &field->graph_ent;
         struct ftrace_graph_ret_entry *leaf_ret;
+        /*
+         * Plain automatic variable: it is assigned on every path before
+         * it is read, so nothing needs to survive between calls, and a
+         * static would be one slot shared by every concurrent reader.
+         */
+        enum print_line_t ret;
+        int cpu = iter->cpu;
         if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
                 return TRACE_TYPE_PARTIAL_LINE;
+        /*
+         * Pick the leaf or the nested printer depending on whether a
+         * return entry was found for this call.
+         */
         leaf_ret = get_return_for_leaf(iter, field);
         if (leaf_ret)
-                return print_graph_entry_leaf(iter, field, leaf_ret, s);
+                ret = print_graph_entry_leaf(iter, field, leaf_ret, s);
         else
-                return print_graph_entry_nested(iter, field, s, cpu);
+                ret = print_graph_entry_nested(iter, field, s, cpu);
+        if (data) {
+                /*
+                 * If we failed to write our output, then we need to make
+                 * note of it, because we already consumed our entry.
+                 * The cpu is recorded so that print_graph_function() can
+                 * replay the saved entry for the right cpu.
+                 */
+                if (s->full) {
+                        data->failed = 1;
+                        data->cpu = cpu;
+                } else
+                        data->failed = 0;
+        }
+        return ret;
 }
 static enum print_line_t
@@ -805,19 +885,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
        struct fgraph_data *data = iter->private;
        pid_t pid = ent->pid;
        int cpu = iter->cpu;
+        int func_match = 1;
        int ret;
        int i;
        if (data) {
+                struct fgraph_cpu_data *cpu_data;
                int cpu = iter->cpu;
-                int *depth = &(per_cpu_ptr(data, cpu)->depth);
+                cpu_data = per_cpu_ptr(data->cpu_data, cpu);
                /*
                 * Comments display at + 1 to depth. This is the
                 * return from a function, we now want the comments
                 * to display at the same level of the bracket.
                 */
-                *depth = trace->depth - 1;
+                cpu_data->depth = trace->depth - 1;
+                if (trace->depth < FTRACE_RETFUNC_DEPTH) {
+                        if (cpu_data->enter_funcs[trace->depth] != trace->func)
+                                func_match = 0;
+                        cpu_data->enter_funcs[trace->depth] = 0;
+                }
        }
        if (print_graph_prologue(iter, s, 0, 0))
@@ -842,9 +931,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
                        return TRACE_TYPE_PARTIAL_LINE;
        }
-        ret = trace_seq_printf(s, "}\n");
+        /*
-        if (!ret)
+         * If the return function does not have a matching entry,
-                return TRACE_TYPE_PARTIAL_LINE;
+         * then the entry was lost. Instead of just printing
+         * the '}' and letting the user guess what function this
+         * belongs to, write out the function name.
+         */
+        if (func_match) {
+                ret = trace_seq_printf(s, "}\n");
+                if (!ret)
+                        return TRACE_TYPE_PARTIAL_LINE;
+        } else {
+                ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
+                if (!ret)
+                        return TRACE_TYPE_PARTIAL_LINE;
+        }
        /* Overrun */
        if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
@@ -873,7 +974,7 @@ print_graph_comment(struct trace_seq *s,  struct trace_entry *ent,
        int i;
        if (data)
-                depth = per_cpu_ptr(data, iter->cpu)->depth;
+                depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
        if (print_graph_prologue(iter, s, 0, 0))
                return TRACE_TYPE_PARTIAL_LINE;
@@ -941,8 +1042,33 @@ print_graph_comment(struct trace_seq *s,  struct trace_entry *ent,
 enum print_line_t
 print_graph_function(struct trace_iterator *iter)
 {
+        struct ftrace_graph_ent_entry *field;
+        struct fgraph_data *data = iter->private;
        struct trace_entry *entry = iter->ent;
        struct trace_seq *s = &iter->seq;
+        int cpu = iter->cpu;
+        int ret;
+        if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
+                per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
+                return TRACE_TYPE_HANDLED;
+        }
+        /*
+         * If the last output failed, there's a possibility we need
+         * to print out the missing entry which would never go out.
+         */
+        if (data && data->failed) {
+                field = &data->ent;
+                iter->cpu = data->cpu;
+                ret = print_graph_entry(field, s, iter);
+                if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
+                        per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
+                        ret = TRACE_TYPE_NO_CONSUME;
+                }
+                iter->cpu = cpu;
+                return ret;
+        }
        switch (entry->type) {
        case TRACE_GRAPH_ENT: {
@@ -952,7 +1078,7 @@ print_graph_function(struct trace_iterator *iter)
                 * sizeof(struct ftrace_graph_ent_entry) is very small,
                 * it can be safely saved at the stack.
                 */
-                struct ftrace_graph_ent_entry *field, saved;
+                struct ftrace_graph_ent_entry saved;
                trace_assign_type(field, entry);
                saved = *field;
                return print_graph_entry(&saved, s, iter);
@@ -1030,31 +1156,54 @@ static void print_graph_headers(struct seq_file *s)
+/*
+ * Allocate the per-iterator fgraph_data and its per-cpu area, reset the
+ * per-cpu state for every possible cpu and attach it to iter->private.
+ * On allocation failure a warning is printed and iter->private stays NULL.
+ */
 static void graph_trace_open(struct trace_iterator *iter)
 {
         /* pid and depth on the last trace processed */
-        struct fgraph_data *data = alloc_percpu(struct fgraph_data);
+        struct fgraph_data *data;
         int cpu;
+        /* Cleared first so both error paths below leave it NULL. */
+        iter->private = NULL;
+        data = kzalloc(sizeof(*data), GFP_KERNEL);
         if (!data)
-                pr_warning("function graph tracer: not enough memory\n");
+                goto out_err;
-        else
-                for_each_possible_cpu(cpu) {
+        data->cpu_data = alloc_percpu(struct fgraph_cpu_data);
-                        pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid);
+        if (!data->cpu_data)
-                        int *depth = &(per_cpu_ptr(data, cpu)->depth);
+                goto out_err_free;
-                        *pid = -1;
-                        *depth = 0;
+        for_each_possible_cpu(cpu) {
-                }
+                pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
+                int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
+                int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
+                *pid = -1;
+                *depth = 0;
+                *ignore = 0;
+        }
         iter->private = data;
+        return;
+ out_err_free:
+        kfree(data);
+ out_err:
+        pr_warning("function graph tracer: not enough memory\n");
 }
+/*
+ * Free the state attached by graph_trace_open(). iter->private is NULL
+ * when that allocation failed, in which case there is nothing to free.
+ */
 static void graph_trace_close(struct trace_iterator *iter)
 {
-        free_percpu(iter->private);
+        struct fgraph_data *data = iter->private;
+        if (data) {
+                free_percpu(data->cpu_data);
+                kfree(data);
+        }
 }
 static struct tracer graph_trace __read_mostly = {
        .name           = "function_graph",
        .open           = graph_trace_open,
+        .pipe_open      = graph_trace_open,
        .close          = graph_trace_close,
+        .pipe_close     = graph_trace_close,
        .wait_pipe      = poll_wait_pipe,
        .init           = graph_trace_init,
        .reset          = graph_trace_reset,
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 69543a905cd5..7b97000745f5 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -20,10 +20,10 @@
 #define BTS_BUFFER_SIZE (1 << 13)
-static DEFINE_PER_CPU(struct bts_tracer *, tracer);
+static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
-static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
+static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
-#define this_tracer per_cpu(tracer, smp_processor_id())
+#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
 static int trace_hw_branches_enabled __read_mostly;
 static int trace_hw_branches_suspended __read_mostly;
@@ -32,12 +32,13 @@ static struct trace_array *hw_branch_trace __read_mostly;
+/*
+ * Request a BTS tracer for @cpu backed by that cpu's hwb_buffer and store
+ * it in the per-cpu hwb_tracer slot; a failed request is stored as NULL.
+ */
 static void bts_trace_init_cpu(int cpu)
 {
-        per_cpu(tracer, cpu) =
+        per_cpu(hwb_tracer, cpu) =
-                ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE,
+                ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
-                                   NULL, (size_t)-1, BTS_KERNEL);
+                                   BTS_BUFFER_SIZE, NULL, (size_t)-1,
+                                   BTS_KERNEL);
-        if (IS_ERR(per_cpu(tracer, cpu)))
+        if (IS_ERR(per_cpu(hwb_tracer, cpu)))
-                per_cpu(tracer, cpu) = NULL;
+                per_cpu(hwb_tracer, cpu) = NULL;
 }
 static int bts_trace_init(struct trace_array *tr)
@@ -51,7 +52,7 @@ static int bts_trace_init(struct trace_array *tr)
        for_each_online_cpu(cpu) {
                bts_trace_init_cpu(cpu);
-                if (likely(per_cpu(tracer, cpu)))
+                if (likely(per_cpu(hwb_tracer, cpu)))
                        trace_hw_branches_enabled = 1;
        }
        trace_hw_branches_suspended = 0;
@@ -67,9 +68,9 @@ static void bts_trace_reset(struct trace_array *tr)
        get_online_cpus();
        for_each_online_cpu(cpu) {
-                if (likely(per_cpu(tracer, cpu))) {
+                if (likely(per_cpu(hwb_tracer, cpu))) {
-                        ds_release_bts(per_cpu(tracer, cpu));
+                        ds_release_bts(per_cpu(hwb_tracer, cpu));
-                        per_cpu(tracer, cpu) = NULL;
+                        per_cpu(hwb_tracer, cpu) = NULL;
                }
        }
        trace_hw_branches_enabled = 0;
@@ -83,8 +84,8 @@ static void bts_trace_start(struct trace_array *tr)
        get_online_cpus();
        for_each_online_cpu(cpu)
-                if (likely(per_cpu(tracer, cpu)))
+                if (likely(per_cpu(hwb_tracer, cpu)))
-                        ds_resume_bts(per_cpu(tracer, cpu));
+                        ds_resume_bts(per_cpu(hwb_tracer, cpu));
        trace_hw_branches_suspended = 0;
        put_online_cpus();
 }
@@ -95,8 +96,8 @@ static void bts_trace_stop(struct trace_array *tr)
        get_online_cpus();
        for_each_online_cpu(cpu)
-                if (likely(per_cpu(tracer, cpu)))
+                if (likely(per_cpu(hwb_tracer, cpu)))
-                        ds_suspend_bts(per_cpu(tracer, cpu));
+                        ds_suspend_bts(per_cpu(hwb_tracer, cpu));
        trace_hw_branches_suspended = 1;
        put_online_cpus();
 }
@@ -114,16 +115,16 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
                        bts_trace_init_cpu(cpu);
                        if (trace_hw_branches_suspended &&
-                            likely(per_cpu(tracer, cpu)))
+                            likely(per_cpu(hwb_tracer, cpu)))
-                                ds_suspend_bts(per_cpu(tracer, cpu));
+                                ds_suspend_bts(per_cpu(hwb_tracer, cpu));
                }
                break;
        case CPU_DOWN_PREPARE:
                /* The notification is sent with interrupts enabled. */
-                if (likely(per_cpu(tracer, cpu))) {
+                if (likely(per_cpu(hwb_tracer, cpu))) {
-                        ds_release_bts(per_cpu(tracer, cpu));
+                        ds_release_bts(per_cpu(hwb_tracer, cpu));
-                        per_cpu(tracer, cpu) = NULL;
+                        per_cpu(hwb_tracer, cpu) = NULL;
                }
        }
@@ -258,8 +259,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
        get_online_cpus();
        for_each_online_cpu(cpu)
-                if (likely(per_cpu(tracer, cpu)))
+                if (likely(per_cpu(hwb_tracer, cpu)))
-                        ds_suspend_bts(per_cpu(tracer, cpu));
+                        ds_suspend_bts(per_cpu(hwb_tracer, cpu));
        /*
         * We need to collect the trace on the respective cpu since ftrace
         * implicitly adds the record for the current cpu.
@@ -268,8 +269,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
        on_each_cpu(trace_bts_cpu, iter->tr, 1);
        for_each_online_cpu(cpu)
-                if (likely(per_cpu(tracer, cpu)))
+                if (likely(per_cpu(hwb_tracer, cpu)))
-                        ds_resume_bts(per_cpu(tracer, cpu));
+                        ds_resume_bts(per_cpu(hwb_tracer, cpu));
        put_online_cpus();
 }
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 3aa7eaa2114c..2974bc7538c7 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -151,6 +151,8 @@ check_critical_timing(struct trace_array *tr,
                goto out_unlock;
        trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+        /* Skip 5 functions to get to the irq/preempt enable function */
+        __trace_stack(tr, flags, 5, pc);
        if (data->critical_sequence != max_sequence)
                goto out_unlock;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644
index 000000000000..1251e367bae9
--- /dev/null
+++ b/kernel/trace/trace_kprobe.c
@@ -0,0 +1,1488 @@
+/*
+ * Kprobes-based tracing events
+ *
+ * Created by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/kprobes.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/debugfs.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/ptrace.h>
+#include <linux/perf_event.h>
+#include "trace.h"
+#include "trace_output.h"
+#define MAX_TRACE_ARGS 128
+#define MAX_ARGSTR_LEN 63
+#define MAX_EVENT_NAME_LEN 64
+#define KPROBE_EVENT_SYSTEM "kprobes"
+/* Reserved field names */
+#define FIELD_STRING_IP "__probe_ip"
+#define FIELD_STRING_NARGS "__probe_nargs"
+#define FIELD_STRING_RETIP "__probe_ret_ip"
+#define FIELD_STRING_FUNC "__probe_func"
+const char *reserved_field_names[] = {  /* names a probe argument may not take; scanned by conflict_field_name() */
+        "common_type",
+        "common_flags",
+        "common_preempt_count",
+        "common_pid",
+        "common_tgid",
+        "common_lock_depth",
+        FIELD_STRING_IP,        /* "__probe_ip" */
+        FIELD_STRING_NARGS,     /* "__probe_nargs" */
+        FIELD_STRING_RETIP,     /* "__probe_ret_ip" */
+        FIELD_STRING_FUNC,      /* "__probe_func" */
+};
+struct fetch_func {     /* one value fetcher: a callback plus its private argument */
+        unsigned long (*func)(struct pt_regs *, void *);        /* invoked as func(regs, data) by call_fetch() */
+        void *data;     /* opaque to callers; its meaning depends on which fetch handler func is */
+};
+static __kprobes unsigned long
+call_fetch(struct fetch_func *f, struct pt_regs *regs)
+{       /* Invoke the fetcher's callback, handing it @regs and its private data. */
+        return (*f->func)(regs, f->data);
+}
+/* fetch handlers */
+static __kprobes unsigned long
+fetch_register(struct pt_regs *regs, void *offset)
+{       /* @offset is a register offset carried in a pointer-sized value. */
+        return regs_get_register(regs, (unsigned int)(unsigned long)offset);
+}
+static __kprobes unsigned long
+fetch_stack(struct pt_regs *regs, void *num)
+{       /* @num is the stack entry index carried in a pointer-sized value. */
+        unsigned int nth = (unsigned int)((unsigned long)num);
+        return regs_get_kernel_stack_nth(regs, nth);
+}
+static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
+{       /* Read one unsigned long at @addr; yields 0 when the probing read fails. */
+        unsigned long word;
+        long fault;
+        fault = probe_kernel_address(addr, word);
+        return fault ? 0 : word;
+}
+static __kprobes unsigned long
+fetch_retvalue(struct pt_regs *regs, void *dummy)
+{       /* @dummy is unused; the value is whatever regs_return_value() reports. */
+        return regs_return_value(regs);
+}
+static __kprobes unsigned long
+fetch_stack_address(struct pt_regs *regs, void *dummy)
+{       /* @dummy is unused; the result is kernel_stack_pointer(regs). */
+        return kernel_stack_pointer(regs);
+}
+/* Memory fetching by symbol */
+struct symbol_cache {   /* a symbol name, an offset, and the address resolved from them */
+        char *symbol;   /* kstrdup()ed copy of the name; released in free_symbol_cache() */
+        long offset;    /* added to the looked-up address by update_symbol_cache() */
+        unsigned long addr;     /* resolved address + offset, or 0 when the lookup returned 0 */
+};
+static unsigned long update_symbol_cache(struct symbol_cache *sc)
+{       /* Re-resolve sc->symbol; a failed lookup leaves sc->addr at 0. */
+        unsigned long base = (unsigned long)kallsyms_lookup_name(sc->symbol);
+        /* The offset is applied only to a successfully resolved address. */
+        sc->addr = base ? base + sc->offset : 0;
+        return sc->addr;
+}
+static void free_symbol_cache(struct symbol_cache *sc)
+{       /* Free a cache built by alloc_symbol_cache(): first its name string, then itself. */
+        kfree(sc->symbol);
+        kfree(sc);
+}
+static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
+{       /* Build a cache for @sym + @offset; NULL on an empty name or failed allocation. */
+        struct symbol_cache *cache = NULL;
+        if (sym && sym[0] != '\0')
+                cache = kzalloc(sizeof(*cache), GFP_KERNEL);
+        if (!cache)
+                return NULL;
+        cache->symbol = kstrdup(sym, GFP_KERNEL);
+        if (cache->symbol) {
+                cache->offset = offset;
+                update_symbol_cache(cache);     /* resolve the address right away */
+        } else {
+                kfree(cache);
+                cache = NULL;
+        }
+        return cache;
+}
+static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
+{       /* @data is a struct symbol_cache; an unresolved symbol (addr == 0) reads as 0. */
+        struct symbol_cache *cache = data;
+        unsigned long target = cache->addr;
+        if (!target)
+                return 0;
+        return fetch_memory(regs, (void *)target);
+}
+/* Special indirect memory access interface */
+struct indirect_fetch_data {    /* private data of fetch_indirect() */
+        struct fetch_func orig; /* inner fetcher whose result is used as the base address */
+        long offset;    /* added to that base before the memory read */
+};
+static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
+{       /* Get a base value from the inner fetcher, then read memory at base + offset. */
+        struct indirect_fetch_data *ifd = data;
+        unsigned long base = call_fetch(&ifd->orig, regs);
+        void *target;
+        /* A zero base is reported as 0 without touching memory. */
+        if (!base)
+                return 0;
+        target = (void *)(base + ifd->offset);
+        return fetch_memory(regs, target);
+}
+static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
+{       /* Free an indirect-fetch descriptor together with whatever its inner fetcher owns. */
+        if (data->orig.func == fetch_indirect)  /* nested indirection: recurse into it */
+                free_indirect_fetch_data(data->orig.data);
+        else if (data->orig.func == fetch_symbol)       /* inner fetcher holds a symbol_cache */
+                free_symbol_cache(data->orig.data);
+        kfree(data);
+}
+/**
+ * Kprobe event core functions
+ */
+struct probe_arg {      /* one fetch argument of a probe */
+        struct fetch_func       fetch;  /* how the value is obtained at probe time */
+        const char              *name;  /* kstrdup()ed in create_trace_probe(); kfree()d in free_probe_arg() */
+};
+/* Flags for trace_probe */
+#define TP_FLAG_TRACE   1
+#define TP_FLAG_PROFILE 2
+struct trace_probe {    /* one dynamic probe event; allocated by alloc_trace_probe() */
+        struct list_head        list;   /* link in probe_list */
+        struct kretprobe        rp;     /* Use rp.kp for kprobe use */
+        unsigned long           nhit;   /* incremented in kprobe_trace_func() */
+        unsigned int            flags;  /* For TP_FLAG_* */
+        const char              *symbol;        /* symbol name; NULL when probing by address */
+        struct ftrace_event_call        call;   /* call.name / call.system hold the event and group names */
+        struct trace_event              event;  /* print_kprobe_event() maps this back to the probe */
+        unsigned int            nr_args;        /* number of parsed entries in args[] */
+        struct probe_arg        args[]; /* sized at allocation via SIZEOF_TRACE_PROBE() */
+};
+#define SIZEOF_TRACE_PROBE(n)                   \
+        (offsetof(struct trace_probe, args) +   \
+        (sizeof(struct probe_arg) * (n)))       /* bytes for a trace_probe with n args[] slots */
+static __kprobes int probe_is_return(struct trace_probe *tp)
+{       /* A probe counts as a return probe exactly when rp.handler has been set. */
+        return tp->rp.handler ? 1 : 0;
+}
+static __kprobes const char *probe_symbol(struct trace_probe *tp)
+{       /* Probes without a symbol name get a fixed placeholder string. */
+        return !tp->symbol ? "unknown" : tp->symbol;
+}
+static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
+{       /* Render @ff as fetch-arg text into @buf (size @n); returns the length or -errno. */
+        int ret = -EINVAL;      /* stays -EINVAL when ff->func is not a known fetcher */
+        if (ff->func == fetch_register) {
+                const char *name;
+                name = regs_query_register_name((unsigned int)((long)ff->data));
+                ret = snprintf(buf, n, "%%%s", name);
+        } else if (ff->func == fetch_stack)
+                ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
+        else if (ff->func == fetch_memory)
+                ret = snprintf(buf, n, "@0x%p", ff->data);
+        else if (ff->func == fetch_symbol) {
+                struct symbol_cache *sc = ff->data;
+                if (sc->offset)
+                        ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
+                                        sc->offset);
+                else
+                        ret = snprintf(buf, n, "@%s", sc->symbol);
+        } else if (ff->func == fetch_retvalue)
+                ret = snprintf(buf, n, "$retval");
+        else if (ff->func == fetch_stack_address)
+                ret = snprintf(buf, n, "$stack");
+        else if (ff->func == fetch_indirect) {
+                struct indirect_fetch_data *id = ff->data;
+                size_t l = 0;   /* bytes written so far: "OFFS(" + inner text + ")" */
+                ret = snprintf(buf, n, "%+ld(", id->offset);
+                if (ret >= n)
+                        goto end;
+                l += ret;
+                ret = probe_arg_string(buf + l, n - l, &id->orig);
+                if (ret < 0)
+                        goto end;
+                l += ret;
+                ret = snprintf(buf + l, n - l, ")");
+                ret += l;
+        }
+end:
+        if (ret < 0)    /* report the real error; the unsigned compare below would turn it into -ENOSPC */
+                return ret;
+        return (size_t)ret >= n ? -ENOSPC : ret;
+}
+static int register_probe_event(struct trace_probe *tp);
+static void unregister_probe_event(struct trace_probe *tp);
+static DEFINE_MUTEX(probe_lock);
+static LIST_HEAD(probe_list);
+static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
+static int kretprobe_dispatcher(struct kretprobe_instance *ri,
+                                struct pt_regs *regs);
+/* Check the name is good for event/group */
+static int check_event_name(const char *name)
+{       /* 1 if @name starts with a letter or '_' and continues with letters, digits or '_'; else 0. */
+        const char *p = name;
+        if (*p != '_' && !isalpha(*p))
+                return 0;
+        for (p++; *p != '\0'; p++)
+                if (*p != '_' && !isalpha(*p) && !isdigit(*p))
+                        return 0;
+        return 1;
+}
+/*
+ * Allocate new trace_probe and initialize it (including kprobes).
+ */
+static struct trace_probe *alloc_trace_probe(const char *group,
+                                             const char *event,
+                                             void *addr,
+                                             const char *symbol,
+                                             unsigned long offs,
+                                             int nargs, int is_return)
+{       /* Returns a zeroed probe with room for nargs args, or ERR_PTR(-ENOMEM or -EINVAL). */
+        struct trace_probe *tp;
+        int ret = -ENOMEM;      /* error reported for every allocation failure below */
+        tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
+        if (!tp)
+                return ERR_PTR(ret);
+        if (symbol) {   /* probe by symbol (+offs): keep a private copy of the name */
+                tp->symbol = kstrdup(symbol, GFP_KERNEL);
+                if (!tp->symbol)
+                        goto error;
+                tp->rp.kp.symbol_name = tp->symbol;
+                tp->rp.kp.offset = offs;
+        } else          /* probe by raw address */
+                tp->rp.kp.addr = addr;
+        if (is_return)  /* rp.handler being set is what probe_is_return() tests */
+                tp->rp.handler = kretprobe_dispatcher;
+        else
+                tp->rp.kp.pre_handler = kprobe_dispatcher;
+        if (!event || !check_event_name(event)) {       /* event name is mandatory and must be well-formed */
+                ret = -EINVAL;
+                goto error;
+        }
+        tp->call.name = kstrdup(event, GFP_KERNEL);
+        if (!tp->call.name)
+                goto error;
+        if (!group || !check_event_name(group)) {       /* same rule for the group name */
+                ret = -EINVAL;
+                goto error;
+        }
+        tp->call.system = kstrdup(group, GFP_KERNEL);
+        if (!tp->call.system)
+                goto error;
+        INIT_LIST_HEAD(&tp->list);
+        return tp;
+error:          /* tp came from kzalloc(), so members not yet assigned are still NULL here */
+        kfree(tp->call.name);
+        kfree(tp->symbol);
+        kfree(tp);
+        return ERR_PTR(ret);
+}
+static void free_probe_arg(struct probe_arg *arg)
+{       /* Release data owned by the fetcher (indirect and symbol fetchers only) and the name. */
+        if (arg->fetch.func == fetch_indirect)
+                free_indirect_fetch_data(arg->fetch.data);
+        else if (arg->fetch.func == fetch_symbol)
+                free_symbol_cache(arg->fetch.data);
+        kfree(arg->name);
+}
+static void free_trace_probe(struct trace_probe *tp)
+{       /* Tear down every parsed argument, then the probe's strings and the probe itself. */
+        struct probe_arg *arg = tp->args, *end = tp->args + tp->nr_args;
+        for (; arg < end; arg++)
+                free_probe_arg(arg);
+        kfree(tp->symbol);
+        kfree(tp->call.name);
+        kfree(tp->call.system);
+        kfree(tp);
+}
+static struct trace_probe *
+find_probe_event(const char *event, const char *group)
+{       /* Linear search of probe_list for the probe named @group/@event; NULL if absent. */
+        struct trace_probe *pos;
+        list_for_each_entry(pos, &probe_list, list)
+                if (!strcmp(pos->call.name, event) &&
+                    !strcmp(pos->call.system, group))
+                        return pos;
+        return NULL;
+}
+/* Unregister a trace_probe and probe_event: call with locking probe_lock */
+static void unregister_trace_probe(struct trace_probe *tp)
+{       /* Detach @tp from kprobes, probe_list and the event layer; it does not free @tp. */
+        if (probe_is_return(tp))        /* pick the API matching how the probe was registered */
+                unregister_kretprobe(&tp->rp);
+        else
+                unregister_kprobe(&tp->rp.kp);
+        list_del(&tp->list);
+        unregister_probe_event(tp);
+}
+/* Register a trace_probe and probe_event */
+static int register_trace_probe(struct trace_probe *tp)
+{       /* Register @tp's event and k(ret)probe under probe_lock; returns 0 or -errno. @tp is never freed here. */
+        struct trace_probe *old_tp;
+        int ret;
+        mutex_lock(&probe_lock);
+        /* register as an event */
+        old_tp = find_probe_event(tp->call.name, tp->call.system);
+        if (old_tp) {
+                /* delete old event */
+                unregister_trace_probe(old_tp);
+                free_trace_probe(old_tp);
+        }
+        ret = register_probe_event(tp);
+        if (ret) {
+                pr_warning("Failed to register probe event(%d)\n", ret);
+                goto end;
+        }
+        tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;       /* set the disabled flag before registering */
+        if (probe_is_return(tp))
+                ret = register_kretprobe(&tp->rp);
+        else
+                ret = register_kprobe(&tp->rp.kp);
+        if (ret) {
+                pr_warning("Could not insert probe(%d)\n", ret);
+                if (ret == -EILSEQ) {   /* reported to the caller as -EINVAL */
+                        pr_warning("Probing address(0x%p) is not an "
+                                   "instruction boundary.\n",
+                                   tp->rp.kp.addr);
+                        ret = -EINVAL;
+                }
+                unregister_probe_event(tp);     /* roll back the event registered above */
+        } else
+                list_add_tail(&tp->list, &probe_list);
+end:
+        mutex_unlock(&probe_lock);
+        return ret;
+}
+/* Split symbol and offset. */
+static int split_symbol_offset(char *symbol, unsigned long *offset)
+{       /* Cut "SYM+OFFS" at the '+', storing OFFS in *offset (0 when there is no '+'). */
+        char *plus;
+        int err;
+        if (!offset)
+                return -EINVAL;
+        plus = strchr(symbol, '+');
+        if (!plus) {
+                *offset = 0;
+                return 0;
+        }
+        /* parse the text after the sign; the '+' itself is skipped */
+        err = strict_strtoul(plus + 1, 0, offset);
+        if (!err)
+                *plus = '\0';   /* the symbol is truncated only after a good parse */
+        return err;
+}
+#define PARAM_MAX_ARGS 16
+#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
+static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
+{       /* Parse the text after '$': "retval", "stack" or "stackN". 0 or -EINVAL. */
+        unsigned long nth;
+        if (strcmp(arg, "retval") == 0) {
+                /* accepted only when is_return is set */
+                if (!is_return)
+                        return -EINVAL;
+                ff->func = fetch_retvalue;
+                ff->data = NULL;
+                return 0;
+        }
+        if (strncmp(arg, "stack", 5) != 0)
+                return -EINVAL;
+        if (arg[5] == '\0') {
+                /* bare "stack": the stack address itself */
+                ff->func = fetch_stack_address;
+                ff->data = NULL;
+                return 0;
+        }
+        if (!isdigit(arg[5]))
+                return -EINVAL;
+        /* "stackN": N is decimal and may not exceed PARAM_MAX_STACK */
+        if (strict_strtoul(arg + 5, 10, &nth) || nth > PARAM_MAX_STACK)
+                return -EINVAL;
+        ff->func = fetch_stack;
+        ff->data = (void *)nth;
+        return 0;
+}
+/* Recursive argument parser */
+static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
+{       /* Parse one fetch-arg string (modified in place) into @ff; returns 0 or -errno. */
+        int ret = 0;
+        unsigned long param;    /* unsigned scratch: raw address or symbol offset */
+        long offset;            /* signed offset of an indirect +|-offs(ARG) */
+        char *tmp;
+        switch (arg[0]) {
+        case '$':
+                ret = parse_probe_vars(arg + 1, ff, is_return);
+                break;
+        case '%':       /* named register */
+                ret = regs_query_register_offset(arg + 1);
+                if (ret >= 0) { /* a non-negative result is the register offset */
+                        ff->func = fetch_register;
+                        ff->data = (void *)(unsigned long)ret;
+                        ret = 0;
+                }
+                break;
+        case '@':       /* memory or symbol */
+                if (isdigit(arg[1])) {
+                        ret = strict_strtoul(arg + 1, 0, &param);
+                        if (ret)
+                                break;
+                        ff->func = fetch_memory;
+                        ff->data = (void *)param;
+                } else {
+                        /* split_symbol_offset() takes unsigned long *, so use param, not &offset */
+                        ret = split_symbol_offset(arg + 1, &param);
+                        if (ret)
+                                break;
+                        ff->data = alloc_symbol_cache(arg + 1, (long)param);
+                        if (ff->data)
+                                ff->func = fetch_symbol;
+                        else
+                                ret = -EINVAL;
+                }
+                break;
+        case '+':       /* indirect memory */
+        case '-':
+                tmp = strchr(arg, '(');
+                if (!tmp) {
+                        ret = -EINVAL;
+                        break;
+                }
+                *tmp = '\0';    /* isolate the offset digits before the '(' */
+                ret = strict_strtol(arg + 1, 0, &offset);
+                if (ret)
+                        break;
+                if (arg[0] == '-')
+                        offset = -offset;
+                arg = tmp + 1;
+                tmp = strrchr(arg, ')');        /* last ')' closes this level of nesting */
+                if (tmp) {
+                        struct indirect_fetch_data *id;
+                        *tmp = '\0';
+                        id = kzalloc(sizeof(struct indirect_fetch_data),
+                                     GFP_KERNEL);
+                        if (!id)
+                                return -ENOMEM;
+                        id->offset = offset;
+                        ret = __parse_probe_arg(arg, &id->orig, is_return);
+                        if (ret)
+                                kfree(id);
+                        else {
+                                ff->func = fetch_indirect;
+                                ff->data = (void *)id;
+                        }
+                } else
+                        ret = -EINVAL;
+                break;
+        default:        /* TODO: support custom handler */
+                ret = -EINVAL;
+        }
+        return ret;
+}
+/* String length checking wrapper */
+static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
+{       /* Refuse fetch-args longer than MAX_ARGSTR_LEN, otherwise run the real parser. */
+        size_t len = strlen(arg);
+        if (len <= MAX_ARGSTR_LEN)
+                return __parse_probe_arg(arg, ff, is_return);
+        pr_info("Argument is too long.: %s\n",  arg);
+        return -ENOSPC;
+}
+/* Return 1 if name is reserved or already used by another argument */
+static int
+conflict_field_name(const char *name, struct probe_arg *args, int narg)
+{       /* Check @name against the reserved list, then against the first @narg entries of @args. */
+        int idx;
+        for (idx = 0; idx < ARRAY_SIZE(reserved_field_names); idx++)
+                if (!strcmp(name, reserved_field_names[idx]))
+                        return 1;
+        for (idx = narg - 1; idx >= 0; idx--)
+                if (!strcmp(name, args[idx].name))
+                        return 1;
+        return 0;
+}
+static int create_trace_probe(int argc, char **argv)
+{       /* Execute one already-split probe command (add or delete); returns 0 or -errno. */
+        /*
+         * Argument syntax:
+         *  - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
+         *  - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
+         *  - Delete a probe: -:[GRP/]EVENT
+         * Fetch args:
+         *  $retval     : fetch return value
+         *  $stack      : fetch stack address
+         *  $stackN     : fetch Nth of stack (N:0-)
+         *  @ADDR       : fetch memory at ADDR (ADDR should be in kernel)
+         *  @SYM[+offs] : fetch memory at SYM + offs (SYM is a data symbol)
+         *  %REG        : fetch register REG
+         * Indirect memory fetch:
+         *  +|-offs(ARG) : fetch memory at ARG +|- offs address.
+         * Alias name of args:
+         *  NAME=FETCHARG : set NAME as alias of FETCHARG.
+         */
+        struct trace_probe *tp;
+        int i, ret = 0, is_return = 0, is_delete = 0;
+        char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
+        unsigned long offset = 0;
+        void *addr = NULL;
+        char buf[MAX_EVENT_NAME_LEN];
+        /* argc must be >= 1; 'p' needs no flag, it is the default */
+        if (argv[0][0] == 'r')
+                is_return = 1;
+        else if (argv[0][0] == '-')
+                is_delete = 1;
+        else if (argv[0][0] != 'p') {
+                pr_info("Probe definition must be started with 'p', 'r' or"
+                        " '-'.\n");
+                return -EINVAL;
+        }
+        if (argv[0][1] == ':') {
+                event = &argv[0][2];
+                if (strchr(event, '/')) {
+                        group = event;
+                        event = strchr(group, '/') + 1;
+                        event[-1] = '\0';       /* cut "GRP/EVENT" into two strings */
+                        if (strlen(group) == 0) {
+                                pr_info("Group name is not specified\n");
+                                return -EINVAL;
+                        }
+                }
+                if (strlen(event) == 0) {
+                        pr_info("Event name is not specified\n");
+                        return -EINVAL;
+                }
+        }
+        if (!group)
+                group = KPROBE_EVENT_SYSTEM;
+        if (is_delete) {
+                if (!event) {
+                        pr_info("Delete command needs an event name.\n");
+                        return -EINVAL;
+                }
+                /* probe_list lookup and removal must happen under probe_lock */
+                mutex_lock(&probe_lock);
+                tp = find_probe_event(event, group);
+                if (!tp) {
+                        mutex_unlock(&probe_lock);
+                        pr_info("Event %s/%s doesn't exist.\n", group, event);
+                        return -ENOENT;
+                }
+                unregister_trace_probe(tp);
+                free_trace_probe(tp);
+                mutex_unlock(&probe_lock);
+                return 0;
+        }
+        if (argc < 2) {
+                pr_info("Probe point is not specified.\n");
+                return -EINVAL;
+        }
+        if (isdigit(argv[1][0])) {
+                if (is_return) {
+                        pr_info("Return probe point must be a symbol.\n");
+                        return -EINVAL;
+                }
+                /* an address specified */
+                ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
+                if (ret) {
+                        pr_info("Failed to parse address.\n");
+                        return ret;
+                }
+        } else {
+                /* a symbol specified */
+                symbol = argv[1];
+                /* TODO: support .init module functions */
+                ret = split_symbol_offset(symbol, &offset);
+                if (ret) {
+                        pr_info("Failed to parse symbol.\n");
+                        return ret;
+                }
+                if (offset && is_return) {
+                        pr_info("Return probe must be used without offset.\n");
+                        return -EINVAL;
+                }
+        }
+        argc -= 2; argv += 2;
+        /* setup a probe */
+        if (!event) {
+                /* Make a new event name */
+                if (symbol)
+                        snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
+                                 is_return ? 'r' : 'p', symbol, offset);
+                else
+                        snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
+                                 is_return ? 'r' : 'p', addr);
+                event = buf;
+        }
+        tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
+                               is_return);
+        if (IS_ERR(tp)) {
+                pr_info("Failed to allocate trace_probe.(%d)\n",
+                        (int)PTR_ERR(tp));
+                return PTR_ERR(tp);
+        }
+        /* parse arguments; ret is already 0 here, every earlier failure returned */
+        for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+                /* Parse argument name */
+                arg = strchr(argv[i], '=');
+                if (arg)
+                        *arg++ = '\0';
+                else
+                        arg = argv[i];
+                if (conflict_field_name(argv[i], tp->args, i)) {
+                        pr_info("Argument%d name '%s' conflicts with "
+                                "another field.\n", i, argv[i]);
+                        ret = -EINVAL;
+                        goto error;
+                }
+                tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
+                if (!tp->args[i].name) {
+                        pr_info("Failed to allocate argument%d name '%s'.\n",
+                                i, argv[i]);
+                        ret = -ENOMEM;
+                        goto error;
+                }
+                /* Parse fetch argument */
+                ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
+                if (ret) {
+                        pr_info("Parse error at argument%d. (%d)\n", i, ret);
+                        kfree(tp->args[i].name);        /* nr_args not bumped yet, so free it here */
+                        goto error;
+                }
+                tp->nr_args++;
+        }
+        ret = register_trace_probe(tp);
+        if (ret)
+                goto error;
+        return 0;
+error:
+        free_trace_probe(tp);
+        return ret;
+}
+static void cleanup_all_probes(void)
+{       /* Unregister and free every probe on probe_list while holding probe_lock. */
+        struct trace_probe *pos, *next;
+        mutex_lock(&probe_lock);
+        /* TODO: Use batch unregistration */
+        list_for_each_entry_safe(pos, next, &probe_list, list) {
+                /* unregister_trace_probe() unlinks pos, hence the _safe walk */
+                unregister_trace_probe(pos);
+                free_trace_probe(pos);
+        }
+        mutex_unlock(&probe_lock);
+}
+/* Probes listing interfaces */
+static void *probes_seq_start(struct seq_file *m, loff_t *pos)
+{       /* seq_file .start: take probe_lock (dropped in probes_seq_stop) and position at *pos. */
+        mutex_lock(&probe_lock);
+        return seq_list_start(&probe_list, *pos);
+}
+static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{       /* seq_file .next: step to the following probe_list entry via seq_list_next(). */
+        return seq_list_next(v, &probe_list, pos);
+}
+static void probes_seq_stop(struct seq_file *m, void *v)
+{       /* seq_file .stop: release the probe_lock taken in probes_seq_start(). */
+        mutex_unlock(&probe_lock);
+}
+static int probes_seq_show(struct seq_file *m, void *v)
+{       /* Print one probe as "TYPE:GROUP/EVENT LOCATION [NAME=FETCHARG ...]\n". */
+        struct trace_probe *tp = v;
+        int i, ret;
+        char buf[MAX_ARGSTR_LEN + 1];
+        seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
+        seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
+        if (!tp->symbol)        /* no symbol: print the raw address */
+                seq_printf(m, " 0x%p", tp->rp.kp.addr);
+        else if (tp->rp.kp.offset)
+                seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
+        else
+                seq_printf(m, " %s", probe_symbol(tp));
+        for (i = 0; i < tp->nr_args; i++) {
+                ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);       /* rebuild the fetch-arg text */
+                if (ret < 0) {
+                        pr_warning("Argument%d decoding error(%d).\n", i, ret);
+                        return ret;
+                }
+                seq_printf(m, " %s=%s", tp->args[i].name, buf);
+        }
+        seq_printf(m, "\n");
+        return 0;
+}
+static const struct seq_operations probes_seq_op = {    /* iterator used by probes_open() */
+        .start  = probes_seq_start,
+        .next   = probes_seq_next,
+        .stop   = probes_seq_stop,
+        .show   = probes_seq_show
+};
+static int probes_open(struct inode *inode, struct file *file)
+{       /* Opening for write with O_TRUNC removes all probes before the seq_file is set up. */
+        int wr = (file->f_mode & FMODE_WRITE) != 0;
+        if (wr && (file->f_flags & O_TRUNC) != 0)
+                cleanup_all_probes();
+        return seq_open(file, &probes_seq_op);
+}
+static int command_trace_probe(const char *buf)
+{       /* Split one command line into words and pass them to create_trace_probe(). */
+        int nwords = 0, err = 0;
+        char **words = argv_split(GFP_KERNEL, buf, &nwords);
+        if (!words)
+                return -ENOMEM;
+        /* a line without any word is accepted and does nothing */
+        if (nwords != 0)
+                err = create_trace_probe(nwords, words);
+        argv_free(words);
+        return err;
+}
+#define WRITE_BUFSIZE 128
+static ssize_t probes_write(struct file *file, const char __user *buffer,
+                            size_t count, loff_t *ppos)
+{       /* Run the user buffer as newline-separated probe commands; returns bytes consumed or -errno. */
+        char *kbuf, *tmp;
+        int ret;
+        size_t done;    /* bytes of @buffer handled so far */
+        size_t size;    /* bytes copied, then bytes consumed, in this round */
+        kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+        if (!kbuf)
+                return -ENOMEM;
+        ret = done = 0;
+        while (done < count) {
+                size = count - done;
+                if (size >= WRITE_BUFSIZE)
+                        size = WRITE_BUFSIZE - 1;       /* keep room for the terminating NUL */
+                if (copy_from_user(kbuf, buffer + done, size)) {
+                        ret = -EFAULT;
+                        goto out;
+                }
+                kbuf[size] = '\0';
+                tmp = strchr(kbuf, '\n');
+                if (tmp) {
+                        *tmp = '\0';
+                        size = tmp - kbuf + 1;  /* consume this line and its newline only */
+                } else if (done + size < count) {       /* full chunk, no newline, more data pending */
+                        pr_warning("Line length is too long: "
+                                   "Should be less than %d.\n", WRITE_BUFSIZE);
+                        ret = -EINVAL;
+                        goto out;
+                }
+                done += size;
+                /* Remove comments */
+                tmp = strchr(kbuf, '#');
+                if (tmp)
+                        *tmp = '\0';
+                ret = command_trace_probe(kbuf);
+                if (ret)
+                        goto out;
+        }
+        ret = done;
+out:
+        kfree(kbuf);
+        return ret;
+}
+static const struct file_operations kprobe_events_ops = {       /* probe definition file: list via seq_file, edit via write */
+        .owner          = THIS_MODULE,
+        .open           = probes_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = seq_release,
+        .write          = probes_write,
+};
+/* Probes profiling interfaces */
+static int probes_profile_seq_show(struct seq_file *m, void *v)
+{       /* Print one profile row: event name, tp->nhit, and the kprobe's nmissed counter. */
+        struct trace_probe *tp = v;
+        seq_printf(m, "  %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
+                   tp->rp.kp.nmissed);
+        return 0;
+}
+static const struct seq_operations profile_seq_op = {   /* same iteration as probes_seq_op, different .show */
+        .start  = probes_seq_start,
+        .next   = probes_seq_next,
+        .stop   = probes_seq_stop,
+        .show   = probes_profile_seq_show
+};
+static int profile_open(struct inode *inode, struct file *file)
+{       /* Open handler of kprobe_profile_ops: attach the profile seq_file iterator. */
+        return seq_open(file, &profile_seq_op);
+}
+static const struct file_operations kprobe_profile_ops = {      /* profile file: no .write handler */
+        .owner          = THIS_MODULE,
+        .open           = profile_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = seq_release,
+};
+/* Kprobe handler */
+static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+{       /* Record one kprobe hit: reserve a ring-buffer event, fill it in, then commit it. */
+        struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+        struct kprobe_trace_entry *entry;
+        struct ring_buffer_event *event;
+        struct ring_buffer *buffer;
+        int size, i, pc;
+        unsigned long irq_flags;
+        struct ftrace_event_call *call = &tp->call;
+        tp->nhit++;     /* hit counter printed by probes_profile_seq_show() */
+        local_save_flags(irq_flags);
+        pc = preempt_count();
+        size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);  /* entry size depends on the argument count */
+        event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
+                                                  irq_flags, pc);
+        if (!event)     /* nothing was reserved: this hit is not recorded */
+                return;
+        entry = ring_buffer_event_data(event);
+        entry->nargs = tp->nr_args;
+        entry->ip = (unsigned long)kp->addr;
+        for (i = 0; i < tp->nr_args; i++)       /* evaluate every fetch argument against regs */
+                entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+        if (!filter_current_check_discard(buffer, call, entry, event))  /* commit only when this returns 0 */
+                trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
+}
+/* Kretprobe handler */
+static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
+                                          struct pt_regs *regs)
+{       /* Record one kretprobe hit: reserve a ring-buffer event, fill it in, then commit it. */
+        struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+        struct kretprobe_trace_entry *entry;
+        struct ring_buffer_event *event;
+        struct ring_buffer *buffer;
+        int size, i, pc;
+        unsigned long irq_flags;
+        struct ftrace_event_call *call = &tp->call;
+        local_save_flags(irq_flags);
+        pc = preempt_count();
+        size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);       /* entry size depends on the argument count */
+        event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
+                                                  irq_flags, pc);
+        if (!event)     /* nothing was reserved: this hit is not recorded */
+                return;
+        entry = ring_buffer_event_data(event);
+        entry->nargs = tp->nr_args;
+        entry->func = (unsigned long)tp->rp.kp.addr;    /* address the probe is attached to */
+        entry->ret_ip = (unsigned long)ri->ret_addr;    /* return address taken from the kretprobe instance */
+        for (i = 0; i < tp->nr_args; i++)       /* evaluate every fetch argument against regs */
+                entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+        if (!filter_current_check_discard(buffer, call, entry, event))  /* commit only when this returns 0 */
+                trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
+}
+/* Event entry printers */
+/*
+ * Render one kprobe entry into iter->seq as
+ *   "<event name>: (<probe ip symbol>) <arg name>=<hex value> ...\n"
+ * The ip is printed with TRACE_ITER_SYM_OFFSET forced on.
+ * Returns TRACE_TYPE_PARTIAL_LINE as soon as one piece cannot be written,
+ * TRACE_TYPE_HANDLED otherwise.
+ */
+enum print_line_t
+print_kprobe_event(struct trace_iterator *iter, int flags)
+{
+        struct trace_seq *seq = &iter->seq;
+        struct kprobe_trace_entry *rec;
+        struct trace_event *ev;
+        struct trace_probe *probe;
+        int idx = 0;
+        rec = (struct kprobe_trace_entry *)iter->ent;
+        ev = ftrace_find_event(rec->ent.type);
+        probe = container_of(ev, struct trace_probe, event);
+        if (!trace_seq_printf(seq, "%s: (", probe->call.name))
+                return TRACE_TYPE_PARTIAL_LINE;
+        if (!seq_print_ip_sym(seq, rec->ip, flags | TRACE_ITER_SYM_OFFSET))
+                return TRACE_TYPE_PARTIAL_LINE;
+        if (!trace_seq_puts(seq, ")"))
+                return TRACE_TYPE_PARTIAL_LINE;
+        /* One " name=value" pair per recorded argument */
+        while (idx < rec->nargs) {
+                if (!trace_seq_printf(seq, " %s=%lx",
+                                      probe->args[idx].name, rec->args[idx]))
+                        return TRACE_TYPE_PARTIAL_LINE;
+                idx++;
+        }
+        if (!trace_seq_puts(seq, "\n"))
+                return TRACE_TYPE_PARTIAL_LINE;
+        return TRACE_TYPE_HANDLED;
+}
+/*
+ * Render one kretprobe entry into iter->seq as
+ *   "<event name>: (<return ip> <- <function>) <arg name>=<hex value> ...\n"
+ * The return ip is printed with TRACE_ITER_SYM_OFFSET forced on, the
+ * function with that flag masked off.
+ * Returns TRACE_TYPE_PARTIAL_LINE as soon as one piece cannot be written,
+ * TRACE_TYPE_HANDLED otherwise.
+ */
+enum print_line_t
+print_kretprobe_event(struct trace_iterator *iter, int flags)
+{
+        struct trace_seq *seq = &iter->seq;
+        struct kretprobe_trace_entry *rec;
+        struct trace_event *ev;
+        struct trace_probe *probe;
+        int idx = 0;
+        rec = (struct kretprobe_trace_entry *)iter->ent;
+        ev = ftrace_find_event(rec->ent.type);
+        probe = container_of(ev, struct trace_probe, event);
+        if (!trace_seq_printf(seq, "%s: (", probe->call.name))
+                return TRACE_TYPE_PARTIAL_LINE;
+        if (!seq_print_ip_sym(seq, rec->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
+                return TRACE_TYPE_PARTIAL_LINE;
+        if (!trace_seq_puts(seq, " <- "))
+                return TRACE_TYPE_PARTIAL_LINE;
+        if (!seq_print_ip_sym(seq, rec->func, flags & ~TRACE_ITER_SYM_OFFSET))
+                return TRACE_TYPE_PARTIAL_LINE;
+        if (!trace_seq_puts(seq, ")"))
+                return TRACE_TYPE_PARTIAL_LINE;
+        /* One " name=value" pair per recorded argument */
+        while (idx < rec->nargs) {
+                if (!trace_seq_printf(seq, " %s=%lx",
+                                      probe->args[idx].name, rec->args[idx]))
+                        return TRACE_TYPE_PARTIAL_LINE;
+                idx++;
+        }
+        if (!trace_seq_puts(seq, "\n"))
+                return TRACE_TYPE_PARTIAL_LINE;
+        return TRACE_TYPE_HANDLED;
+}
+/*
+ * Mark the probe behind @call as traced (TP_FLAG_TRACE) and enable the
+ * underlying kprobe or kretprobe. Returns the result of the enable call.
+ */
+static int probe_event_enable(struct ftrace_event_call *call)
+{
+        struct trace_probe *probe = (struct trace_probe *)call->data;
+        int err;
+        probe->flags |= TP_FLAG_TRACE;
+        if (!probe_is_return(probe))
+                err = enable_kprobe(&probe->rp.kp);
+        else
+                err = enable_kretprobe(&probe->rp);
+        return err;
+}
+/*
+ * Clear TP_FLAG_TRACE on the probe behind @call; the underlying probe is
+ * only disabled once neither the trace nor the profile flag remains set.
+ */
+static void probe_event_disable(struct ftrace_event_call *call)
+{
+        struct trace_probe *probe = (struct trace_probe *)call->data;
+        probe->flags &= ~TP_FLAG_TRACE;
+        /* Some user is still attached: keep the probe armed */
+        if (probe->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))
+                return;
+        if (probe_is_return(probe))
+                disable_kretprobe(&probe->rp);
+        else
+                disable_kprobe(&probe->rp.kp);
+}
+/* raw_init callback: start with an empty field list; always returns 0 */
+static int probe_event_raw_init(struct ftrace_event_call *event_call)
+{
+        INIT_LIST_HEAD(&event_call->fields);
+        return 0;
+}
+/*
+ * DEFINE_FIELD - register one member of the local 'field' record as an
+ * event field through trace_define_field().
+ *
+ * Expects 'ret', 'event_call' and 'field' to exist in the calling scope
+ * and makes the calling function return 'ret' when registration fails.
+ */
+#undef DEFINE_FIELD
+#define DEFINE_FIELD(type, item, name, is_signed)                       \
+        do {                                                            \
+                ret = trace_define_field(event_call, #type, name,       \
+                                         offsetof(typeof(field), item), \
+                                         sizeof(field.item), is_signed, \
+                                         FILTER_OTHER);                 \
+                if (ret)                                                \
+                        return ret;                                     \
+        } while (0)
+/*
+ * Declare the fields of a kprobe event: the fixed ip and nargs members
+ * followed by one unsigned long field per probe argument. Returns 0, or
+ * the first error reported while defining a field.
+ */
+static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+        struct trace_probe *tp = (struct trace_probe *)event_call->data;
+        struct kprobe_trace_entry field;
+        int ret;
+        int n = 0;
+        /* Fixed members of the record */
+        DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
+        DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
+        /* Set argument names as fields */
+        while (n < tp->nr_args) {
+                DEFINE_FIELD(unsigned long, args[n], tp->args[n].name, 0);
+                n++;
+        }
+        return 0;
+}
+/*
+ * Declare the fields of a kretprobe event: the fixed func, ret_ip and
+ * nargs members followed by one unsigned long field per probe argument.
+ * Returns 0, or the first error reported while defining a field.
+ */
+static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+        struct trace_probe *tp = (struct trace_probe *)event_call->data;
+        struct kretprobe_trace_entry field;
+        int ret;
+        int n = 0;
+        /* Fixed members of the record */
+        DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
+        DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
+        DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
+        /* Set argument names as fields */
+        while (n < tp->nr_args) {
+                DEFINE_FIELD(unsigned long, args[n], tp->args[n].name, 0);
+                n++;
+        }
+        return 0;
+}
+/*
+ * Build the print_fmt string of a probe event into @buf, which holds @len
+ * bytes. The string has the shape
+ *   "<location format> <arg>=%lx ...", REC-><location>, REC-><arg> ...
+ * where the location part differs between entry and return probes.
+ * Called with @len == 0 it only measures. Returns the accumulated length
+ * reported by snprintf().
+ */
+static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
+{
+        const char *fmt, *arg;
+        int pos = 0;
+        int n;
+        if (probe_is_return(tp)) {
+                fmt = "(%lx <- %lx)";
+                arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
+        } else {
+                fmt = "(%lx)";
+                arg = "REC->" FIELD_STRING_IP;
+        }
+        /* When len=0, we just calculate the needed length */
+#define LEN_OR_ZERO (len ? len - pos : 0)
+        /* Quoted format part: location followed by " name=%lx" per argument */
+        pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
+        for (n = 0; n < tp->nr_args; n++)
+                pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx",
+                                tp->args[n].name);
+        /* Closing quote, then the REC-> expressions in the same order */
+        pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
+        for (n = 0; n < tp->nr_args; n++)
+                pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
+                                tp->args[n].name);
+#undef LEN_OR_ZERO
+        /* return the length of print_fmt */
+        return pos;
+}
+/*
+ * Allocate and fill tp->call.print_fmt. Returns 0 on success or -ENOMEM
+ * when the buffer cannot be allocated.
+ */
+static int set_print_fmt(struct trace_probe *tp)
+{
+        char *fmt_buf;
+        int need;
+        /* Measuring pass: zero length only computes the string length */
+        need = __set_print_fmt(tp, NULL, 0) + 1;
+        fmt_buf = kmalloc(need, GFP_KERNEL);
+        if (!fmt_buf)
+                return -ENOMEM;
+        /* Writing pass into the freshly sized buffer */
+        __set_print_fmt(tp, fmt_buf, need);
+        tp->call.print_fmt = fmt_buf;
+        return 0;
+}
+#ifdef CONFIG_PERF_EVENTS
+/*
+ * Kprobe profile handler: emit one kprobe hit through the perf trace
+ * buffer.
+ *
+ * Gives up (with a one-time warning) when the padded record would exceed
+ * PERF_MAX_TRACE_SIZE, and silently when no perf buffer can be prepared.
+ */
+static __kprobes void kprobe_perf_func(struct kprobe *kp,
+                                         struct pt_regs *regs)
+{
+        struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+        struct ftrace_event_call *call = &tp->call;
+        struct kprobe_trace_entry *entry;
+        int size, __size, i;
+        unsigned long irq_flags;
+        int rctx;
+        __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
+        /* Pad so that the record plus a u32 is a multiple of sizeof(u64) */
+        size = ALIGN(__size + sizeof(u32), sizeof(u64));
+        size -= sizeof(u32);
+        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
+                     "profile buffer not large enough"))
+                return;
+        entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
+        if (!entry)
+                return;
+        entry->nargs = tp->nr_args;
+        entry->ip = (unsigned long)kp->addr;
+        /* Evaluate every configured fetch against the probed registers */
+        for (i = 0; i < tp->nr_args; i++)
+                entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+        perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
+}
+/*
+ * Kretprobe profile handler: emit one function-return hit through the
+ * perf trace buffer.
+ *
+ * Gives up (with a one-time warning) when the padded record would exceed
+ * PERF_MAX_TRACE_SIZE, and silently when no perf buffer can be prepared.
+ */
+static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
+                                            struct pt_regs *regs)
+{
+        struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+        struct ftrace_event_call *call = &tp->call;
+        struct kretprobe_trace_entry *entry;
+        int size, __size, i;
+        unsigned long irq_flags;
+        int rctx;
+        __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
+        /* Pad so that the record plus a u32 is a multiple of sizeof(u64) */
+        size = ALIGN(__size + sizeof(u32), sizeof(u64));
+        size -= sizeof(u32);
+        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
+                     "profile buffer not large enough"))
+                return;
+        entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
+        if (!entry)
+                return;
+        entry->nargs = tp->nr_args;
+        entry->func = (unsigned long)tp->rp.kp.addr;
+        entry->ret_ip = (unsigned long)ri->ret_addr;
+        /* Evaluate every configured fetch against the registers at return */
+        for (i = 0; i < tp->nr_args; i++)
+                entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+        perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
+                               irq_flags, regs);
+}
+/*
+ * Mark the probe behind @call as profiled (TP_FLAG_PROFILE) and enable the
+ * underlying kprobe or kretprobe. Returns the result of the enable call.
+ */
+static int probe_perf_enable(struct ftrace_event_call *call)
+{
+        struct trace_probe *probe = (struct trace_probe *)call->data;
+        int err;
+        probe->flags |= TP_FLAG_PROFILE;
+        if (!probe_is_return(probe))
+                err = enable_kprobe(&probe->rp.kp);
+        else
+                err = enable_kretprobe(&probe->rp);
+        return err;
+}
+/*
+ * Clear TP_FLAG_PROFILE on the probe behind @call; the underlying probe is
+ * only disabled when the trace flag is not set either.
+ */
+static void probe_perf_disable(struct ftrace_event_call *call)
+{
+        struct trace_probe *probe = (struct trace_probe *)call->data;
+        probe->flags &= ~TP_FLAG_PROFILE;
+        /* Still used for tracing: keep the probe armed */
+        if (probe->flags & TP_FLAG_TRACE)
+                return;
+        if (probe_is_return(probe))
+                disable_kretprobe(&probe->rp);
+        else
+                disable_kprobe(&probe->rp.kp);
+}
+#endif  /* CONFIG_PERF_EVENTS */
+/*
+ * Kprobe hit dispatcher: hands the hit to the trace handler and/or the
+ * perf handler according to the TP_FLAG_* bits set on the probe.
+ * Always returns 0.
+ */
+static __kprobes
+int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
+{
+        struct trace_probe *probe;
+        probe = container_of(kp, struct trace_probe, rp.kp);
+        if (probe->flags & TP_FLAG_TRACE)
+                kprobe_trace_func(kp, regs);
+#ifdef CONFIG_PERF_EVENTS
+        if (probe->flags & TP_FLAG_PROFILE)
+                kprobe_perf_func(kp, regs);
+#endif
+        /* We don't tweak the kernel, so just return 0 */
+        return 0;
+}
+/*
+ * Kretprobe hit dispatcher: hands the hit to the trace handler and/or the
+ * perf handler according to the TP_FLAG_* bits set on the probe.
+ * Always returns 0.
+ */
+static __kprobes
+int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+        struct trace_probe *probe;
+        probe = container_of(ri->rp, struct trace_probe, rp);
+        if (probe->flags & TP_FLAG_TRACE)
+                kretprobe_trace_func(ri, regs);
+#ifdef CONFIG_PERF_EVENTS
+        if (probe->flags & TP_FLAG_PROFILE)
+                kretprobe_perf_func(ri, regs);
+#endif
+        /* We don't tweak the kernel, so just return 0 */
+        return 0;
+}
+/*
+ * Fill in tp->call / tp->event and register the probe as a trace event.
+ *
+ * Returns 0 on success, -ENOMEM when the print format cannot be built,
+ * -ENODEV when no event id could be obtained, or the error returned by
+ * trace_add_event_call(). On the failure paths after the format has been
+ * allocated, print_fmt is freed again.
+ */
+static int register_probe_event(struct trace_probe *tp)
+{
+        struct ftrace_event_call *call = &tp->call;
+        int err;
+        /* Pick the printer and field definitions matching the probe type */
+        if (!probe_is_return(tp)) {
+                tp->event.trace = print_kprobe_event;
+                call->define_fields = kprobe_event_define_fields;
+        } else {
+                tp->event.trace = print_kretprobe_event;
+                call->define_fields = kretprobe_event_define_fields;
+        }
+        call->raw_init = probe_event_raw_init;
+        if (set_print_fmt(tp) < 0)
+                return -ENOMEM;
+        call->event = &tp->event;
+        call->id = register_ftrace_event(&tp->event);
+        if (!call->id) {
+                kfree(call->print_fmt);
+                return -ENODEV;
+        }
+        /* Enable/disable hooks and back pointer used by those hooks */
+        call->data = tp;
+        call->enabled = 0;
+        call->regfunc = probe_event_enable;
+        call->unregfunc = probe_event_disable;
+#ifdef CONFIG_PERF_EVENTS
+        call->perf_event_enable = probe_perf_enable;
+        call->perf_event_disable = probe_perf_disable;
+#endif
+        err = trace_add_event_call(call);
+        if (!err)
+                return 0;
+        /* Undo the registration steps done so far */
+        pr_info("Failed to register kprobe event: %s\n", call->name);
+        kfree(call->print_fmt);
+        unregister_ftrace_event(&tp->event);
+        return err;
+}
+/* Remove the probe's event call and free its print format string */
+static void unregister_probe_event(struct trace_probe *tp)
+{
+        /* tp->event is unregistered in trace_remove_event_call() */
+        trace_remove_event_call(&tp->call);
+        kfree(tp->call.print_fmt);
+}
+/*
+ * Make a debugfs interface for controlling probe points.
+ *
+ * Creates "kprobe_events" (0644) and "kprobe_profile" (0444) under the
+ * tracing directory. A file that cannot be created only triggers a
+ * warning; the function returns 0 in every case.
+ */
+static __init int init_kprobe_trace(void)
+{
+        struct dentry *tracing_dir = tracing_init_dentry();
+        if (!tracing_dir)
+                return 0;
+        /* Event list interface */
+        if (!debugfs_create_file("kprobe_events", 0644, tracing_dir,
+                                 NULL, &kprobe_events_ops))
+                pr_warning("Could not create debugfs "
+                           "'kprobe_events' entry\n");
+        /* Profile interface */
+        if (!debugfs_create_file("kprobe_profile", 0444, tracing_dir,
+                                 NULL, &kprobe_profile_ops))
+                pr_warning("Could not create debugfs "
+                           "'kprobe_profile' entry\n");
+        return 0;
+}
+fs_initcall(init_kprobe_trace);
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+/* Function probed by the startup self test; returns the sum of its arguments */
+static int kprobe_trace_selftest_target(int a1, int a2, int a3,
+                                        int a4, int a5, int a6)
+{
+        int sum = a1 + a2;
+        sum += a3;
+        sum += a4;
+        sum += a5;
+        sum += a6;
+        return sum;
+}
+/*
+ * Startup self test for kprobe tracing.
+ *
+ * Defines an entry probe ("testprobe") and a return probe ("testprobe2")
+ * on kprobe_trace_selftest_target(), enables both, calls the target once
+ * and then deletes the probes again. Every failing step is counted in
+ * 'warn'; the summary is printed as "OK" or "NG". Always returns 0.
+ */
+static __init int kprobe_trace_self_tests_init(void)
+{
+        int ret, warn = 0;
+        int (*target)(int, int, int, int, int, int);
+        struct trace_probe *tp;
+        target = kprobe_trace_selftest_target;
+        pr_info("Testing kprobe tracing: ");
+        /* Step 1: entry probe fetching stack-based arguments */
+        ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
+                                  "$stack $stack0 +0($stack)");
+        if (WARN_ON_ONCE(ret)) {
+                pr_warning("error on probing function entry.\n");
+                warn++;
+        } else {
+                /* Enable trace point */
+                tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM);
+                if (WARN_ON_ONCE(tp == NULL)) {
+                        pr_warning("error on getting new probe.\n");
+                        warn++;
+                } else
+                        probe_event_enable(&tp->call);
+        }
+        /* Step 2: return probe fetching the return value */
+        ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
+                                  "$retval");
+        if (WARN_ON_ONCE(ret)) {
+                pr_warning("error on probing function return.\n");
+                warn++;
+        } else {
+                /* Enable trace point */
+                tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM);
+                if (WARN_ON_ONCE(tp == NULL)) {
+                        pr_warning("error on getting new probe.\n");
+                        warn++;
+                } else
+                        probe_event_enable(&tp->call);
+        }
+        /* Skip the remaining steps if setting up either probe failed */
+        if (warn)
+                goto end;
+        /* Step 3: call the probed function; its result is overwritten below */
+        ret = target(1, 2, 3, 4, 5, 6);
+        /* Step 4: delete both probes by name */
+        ret = command_trace_probe("-:testprobe");
+        if (WARN_ON_ONCE(ret)) {
+                pr_warning("error on deleting a probe.\n");
+                warn++;
+        }
+        ret = command_trace_probe("-:testprobe2");
+        if (WARN_ON_ONCE(ret)) {
+                pr_warning("error on deleting a probe.\n");
+                warn++;
+        }
+end:
+        /* Runs on both the success and the failure path */
+        cleanup_all_probes();
+        if (warn)
+                pr_cont("NG: Some tests are failed. Please check them.\n");
+        else
+                pr_cont("OK\n");
+        return 0;
+}
+late_initcall(kprobe_trace_self_tests_init);
+#endif
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
new file mode 100644
index 000000000000..d59cd6879477
--- /dev/null
+++ b/kernel/trace/trace_ksym.c
@@ -0,0 +1,520 @@
+/*
+ * trace_ksym.c - Kernel Symbol Tracer
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include "trace_output.h"
+#include "trace.h"
+#include <linux/hw_breakpoint.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/atomic.h>
+/*
+ * For now, let us restrict the number of symbols traced simultaneously to
+ * the number of available hardware breakpoint registers.
+ */
+#define KSYM_TRACER_MAX HBP_NUM
+/* Length of the access-type part of a filter request */
+#define KSYM_TRACER_OP_LEN 3 /* rw- */
+/* One traced kernel symbol, kept on the ksym_filter_head list */
+struct trace_ksym {
+        /* value returned by register_wide_hw_breakpoint() for this entry */
+        struct perf_event       **ksym_hbp;
+        /* breakpoint attributes; bp_addr and bp_type describe the request */
+        struct perf_event_attr  attr;
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+        /* incremented by ksym_collect_stats() on a hit at attr.bp_addr */
+        atomic64_t              counter;
+#endif
+        /* link in ksym_filter_head */
+        struct hlist_node       ksym_hlist;
+};
+/* Trace array handed to ksym_trace_init(); its buffer receives the events */
+static struct trace_array *ksym_trace_array;
+/* Number of entries currently on ksym_filter_head */
+static unsigned int ksym_filter_entry_count;
+/* Set by ksym_trace_init(), cleared by ksym_trace_reset() */
+static unsigned int ksym_tracing_enabled;
+/* List of struct trace_ksym, linked through ksym_hlist */
+static HLIST_HEAD(ksym_filter_head);
+/* Held by the filter read, write and reset paths while using the list */
+static DEFINE_MUTEX(ksym_tracer_mutex);
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+#define MAX_UL_INT 0xffffffff
+/*
+ * Bump the hit counter of the first filter entry whose breakpoint address
+ * equals @hbp_hit_addr. The list is walked under rcu_read_lock().
+ */
+void ksym_collect_stats(unsigned long hbp_hit_addr)
+{
+        struct trace_ksym *ksym;
+        struct hlist_node *pos;
+        rcu_read_lock();
+        hlist_for_each_entry_rcu(ksym, pos, &ksym_filter_head, ksym_hlist) {
+                if (ksym->attr.bp_addr != hbp_hit_addr)
+                        continue;
+                atomic64_inc(&ksym->counter);
+                break;
+        }
+        rcu_read_unlock();
+}
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+/*
+ * Breakpoint hit callback passed to register_wide_hw_breakpoint().
+ *
+ * While ksym_tracing_enabled is set, records a TRACE_KSYM entry holding
+ * the instruction pointer, the breakpoint type and address and the name of
+ * the current task; with CONFIG_PROFILE_KSYM_TRACER it also updates the
+ * per-symbol hit counter. @nmi and @data are not used.
+ */
+void ksym_hbp_handler(struct perf_event *hbp, int nmi,
+                      struct perf_sample_data *data,
+                      struct pt_regs *regs)
+{
+        struct ring_buffer_event *event;
+        struct ksym_trace_entry *entry;
+        struct ring_buffer *buffer;
+        int pc;
+        if (!ksym_tracing_enabled)
+                return;
+        buffer = ksym_trace_array->buffer;
+        pc = preempt_count();
+        event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
+                                                        sizeof(*entry), 0, pc);
+        /* No room in the ring buffer: drop this hit */
+        if (!event)
+                return;
+        entry           = ring_buffer_event_data(event);
+        entry->ip       = instruction_pointer(regs);
+        entry->type     = hw_breakpoint_type(hbp);
+        entry->addr     = hw_breakpoint_addr(hbp);
+        strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+        ksym_collect_stats(hw_breakpoint_addr(hbp));
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+        trace_buffer_unlock_commit(buffer, event, 0, pc);
+}
+/*
+ * Translate a three character access string into a breakpoint type.
+ *
+ * Valid access types are represented as
+ *
+ * rw- : Set Read/Write Access Breakpoint
+ * -w- : Set Write Access Breakpoint
+ * --- : Clear Breakpoints
+ * --x : Set Execution Break points (Not available yet)
+ *
+ * @str must hold at least three characters; the caller checks the length.
+ *
+ * Returns HW_BREAKPOINT_R, HW_BREAKPOINT_W or both OR-ed together for a
+ * "set" request, 0 for the clear request "---", and -EINVAL for anything
+ * else (including every combination containing 'x').
+ */
+static int ksym_trace_get_access_type(char *str)
+{
+        int access = 0;
+        /*
+         * "---" is the documented clear request. It used to fall through to
+         * the default case below and was rejected, which made it impossible
+         * to remove a single breakpoint through the filter file.
+         */
+        if (str[0] == '-' && str[1] == '-' && str[2] == '-')
+                return 0;
+        if (str[0] == 'r')
+                access |= HW_BREAKPOINT_R;
+        if (str[1] == 'w')
+                access |= HW_BREAKPOINT_W;
+        if (str[2] == 'x')
+                access |= HW_BREAKPOINT_X;
+        switch (access) {
+        case HW_BREAKPOINT_R:
+        case HW_BREAKPOINT_W:
+        case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+                return access;
+        default:
+                return -EINVAL;
+        }
+}
+/*
+ * There can be several possible malformed requests and we attempt to capture
+ * all of them. We enumerate some of the rules
+ * 1. We will not allow kernel symbols with ':' since it is used as a delimiter.
+ *    i.e. multiple ':' symbols disallowed. Possible uses are of the form
+ *    <module>:<ksym_name>:<op>.
+ * 2. No delimiter symbol ':' in the input string
+ * 3. Spurious operator symbols or symbols not in their respective positions
+ * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
+ * 5. Kernel symbol not a part of /proc/kallsyms
+ * 6. Duplicate requests
+ *
+ * Splits @input_string (modified in place) at the first ':', stores the
+ * symbol name in *@ksymname and its address in *@addr.
+ *
+ * Returns the value of ksym_trace_get_access_type() for the part after the
+ * ':', or -EINVAL when the delimiter is missing, the operator part is not
+ * KSYM_TRACER_OP_LEN characters long or the symbol cannot be resolved.
+ */
+static int parse_ksym_trace_str(char *input_string, char **ksymname,
+                                                        unsigned long *addr)
+{
+        /* After strsep() input_string points behind the ':' or is NULL */
+        *ksymname = strsep(&input_string, ":");
+        *addr = kallsyms_lookup_name(*ksymname);
+        /* Check for malformed request: (2), (1) and (5) */
+        if ((!input_string) ||
+            (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
+            (*addr == 0))
+                return -EINVAL;
+        return ksym_trace_get_access_type(input_string);
+}
+/*
+ * Create a filter entry for @addr with access type @op, register a wide
+ * hardware breakpoint for it and add it to ksym_filter_head.
+ * @ksymname is not used.
+ *
+ * Returns 0 on success, -ENOSPC when KSYM_TRACER_MAX entries exist already,
+ * -ENOMEM on allocation failure, or the error encoded in the pointer
+ * returned by register_wide_hw_breakpoint().
+ */
+int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
+{
+        struct trace_ksym *ksym;
+        int err;
+        if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
+                printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
+                " new requests for tracing can be accepted now.\n",
+                        KSYM_TRACER_MAX);
+                return -ENOSPC;
+        }
+        ksym = kzalloc(sizeof(*ksym), GFP_KERNEL);
+        if (!ksym)
+                return -ENOMEM;
+        hw_breakpoint_init(&ksym->attr);
+        ksym->attr.bp_addr = addr;
+        ksym->attr.bp_type = op;
+        ksym->attr.bp_len = HW_BREAKPOINT_LEN_4;
+        ksym->ksym_hbp = register_wide_hw_breakpoint(&ksym->attr,
+                                        ksym_hbp_handler);
+        if (IS_ERR(ksym->ksym_hbp)) {
+                /* Registration failed: report, release the entry, bail out */
+                err = PTR_ERR(ksym->ksym_hbp);
+                printk(KERN_INFO "ksym_tracer request failed. Try again"
+                                        " later!!\n");
+                kfree(ksym);
+                return err;
+        }
+        hlist_add_head_rcu(&ksym->ksym_hlist, &ksym_filter_head);
+        ksym_filter_entry_count++;
+        return 0;
+}
+/*
+ * read() handler of the filter file: lists every filter entry as
+ * "<symbol>:<access>\n" with <access> being "r--", "-w-" or "rw-", and
+ * copies the requested window of that text to user space.
+ *
+ * Returns the result of simple_read_from_buffer(), or -ENOMEM when the
+ * temporary trace_seq cannot be allocated.
+ */
+static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
+                                                size_t count, loff_t *ppos)
+{
+        struct trace_ksym *ksym;
+        struct hlist_node *pos;
+        struct trace_seq *seq;
+        ssize_t copied;
+        int ok;
+        seq = kmalloc(sizeof(*seq), GFP_KERNEL);
+        if (!seq)
+                return -ENOMEM;
+        trace_seq_init(seq);
+        mutex_lock(&ksym_tracer_mutex);
+        hlist_for_each_entry(ksym, pos, &ksym_filter_head, ksym_hlist) {
+                ok = trace_seq_printf(seq, "%pS:",
+                                (void *)(unsigned long)ksym->attr.bp_addr);
+                switch (ksym->attr.bp_type) {
+                case HW_BREAKPOINT_R:
+                        ok = trace_seq_puts(seq, "r--\n");
+                        break;
+                case HW_BREAKPOINT_W:
+                        ok = trace_seq_puts(seq, "-w-\n");
+                        break;
+                case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+                        ok = trace_seq_puts(seq, "rw-\n");
+                        break;
+                default:
+                        /* Other types: keep the result of the printf above */
+                        break;
+                }
+                WARN_ON_ONCE(!ok);
+        }
+        copied = simple_read_from_buffer(ubuf, count, ppos,
+                                         seq->buffer, seq->len);
+        mutex_unlock(&ksym_tracer_mutex);
+        kfree(seq);
+        return copied;
+}
+/*
+ * Remove every filter entry: unregister its breakpoint, unlink it from
+ * ksym_filter_head and free it. Runs with ksym_tracer_mutex held.
+ */
+static void __ksym_trace_reset(void)
+{
+        struct trace_ksym *entry;
+        struct hlist_node *node, *node1;
+        mutex_lock(&ksym_tracer_mutex);
+        /* _safe variant: the current entry is freed inside the loop body */
+        hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
+                                                                ksym_hlist) {
+                unregister_wide_hw_breakpoint(entry->ksym_hbp);
+                ksym_filter_entry_count--;
+                hlist_del_rcu(&(entry->ksym_hlist));
+                /* Wait for RCU readers of the list before freeing the entry */
+                synchronize_rcu();
+                kfree(entry);
+        }
+        mutex_unlock(&ksym_tracer_mutex);
+}
+/*
+ * write() handler of the filter file.
+ *
+ * Accepts either a "clear everything" request (empty string, "0" or
+ * "*:---") or a single "<ksym_name>:<op>" request. For a symbol that is
+ * already listed with a different access type the existing breakpoint is
+ * unregistered and then either re-registered with the new type or dropped.
+ * A request identical to an existing entry is rejected with -EINVAL. A
+ * symbol that is not listed yet gets a new entry when @op is non-zero.
+ *
+ * Returns @count on success or a negative error code.
+ */
+static ssize_t ksym_trace_filter_write(struct file *file,
+                                        const char __user *buffer,
+                                                size_t count, loff_t *ppos)
+{
+        struct trace_ksym *entry;
+        struct hlist_node *node;
+        char *buf, *input_string, *ksymname = NULL;
+        unsigned long ksym_addr = 0;
+        int ret, op, changed = 0;
+        /* One extra byte for the terminating NUL written below */
+        buf = kzalloc(count + 1, GFP_KERNEL);
+        if (!buf)
+                return -ENOMEM;
+        ret = -EFAULT;
+        if (copy_from_user(buf, buffer, count))
+                goto out;
+        buf[count] = '\0';
+        input_string = strstrip(buf);
+        /*
+         * Clear all breakpoints if:
+         * 1: echo > ksym_trace_filter
+         * 2: echo 0 > ksym_trace_filter
+         * 3: echo "*:---" > ksym_trace_filter
+         */
+        if (!input_string[0] || !strcmp(input_string, "0") ||
+            !strcmp(input_string, "*:---")) {
+                __ksym_trace_reset();
+                ret = 0;
+                goto out;
+        }
+        /* A non-negative result is the requested access type */
+        ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
+        if (ret < 0)
+                goto out;
+        mutex_lock(&ksym_tracer_mutex);
+        /* Result for a duplicate request or a clear of an unlisted symbol */
+        ret = -EINVAL;
+        hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
+                if (entry->attr.bp_addr == ksym_addr) {
+                        /* Check for malformed request: (6) */
+                        if (entry->attr.bp_type != op)
+                                changed = 1;
+                        else
+                                goto out_unlock;
+                        break;
+                }
+        }
+        if (changed) {
+                /* 'entry' still points at the matching list element here */
+                unregister_wide_hw_breakpoint(entry->ksym_hbp);
+                entry->attr.bp_type = op;
+                ret = 0;
+                if (op > 0) {
+                        entry->ksym_hbp =
+                                register_wide_hw_breakpoint(&entry->attr,
+                                        ksym_hbp_handler);
+                        if (IS_ERR(entry->ksym_hbp))
+                                ret = PTR_ERR(entry->ksym_hbp);
+                        else
+                                goto out_unlock;
+                }
+                /* Error or "symbol:---" case: drop it */
+                ksym_filter_entry_count--;
+                hlist_del_rcu(&(entry->ksym_hlist));
+                synchronize_rcu();
+                kfree(entry);
+                goto out_unlock;
+        } else {
+                /* Check for malformed request: (4) */
+                if (op)
+                        ret = process_new_ksym_entry(ksymname, op, ksym_addr);
+        }
+out_unlock:
+        mutex_unlock(&ksym_tracer_mutex);
+out:
+        kfree(buf);
+        /* ret == 0 means the whole write was consumed */
+        return !ret ? count : ret;
+}
+/* File operations of the filter file: list entries on read, edit on write */
+static const struct file_operations ksym_tracing_fops = {
+        .write          = ksym_trace_filter_write,
+        .read           = ksym_trace_filter_read,
+        .open           = tracing_open_generic,
+};
+/*
+ * Tracer reset callback: stop recording hits, then remove every filter
+ * entry. @tr is not used.
+ */
+static void ksym_trace_reset(struct trace_array *tr)
+{
+        ksym_tracing_enabled = 0;
+        __ksym_trace_reset();
+}
+/*
+ * Tracer init callback: reset the buffer of every online CPU, enable
+ * recording and remember @tr as the array to log into. Always returns 0.
+ */
+static int ksym_trace_init(struct trace_array *tr)
+{
+        int cpu;
+        for_each_online_cpu(cpu) {
+                tracing_reset(tr, cpu);
+        }
+        ksym_tracing_enabled = 1;
+        ksym_trace_array = tr;
+        return 0;
+}
+/* Print the two-line column header of the ksym tracer output */
+static void ksym_trace_print_header(struct seq_file *m)
+{
+        static const char titles[] =
+                 "#       TASK-PID   CPU#      Symbol                    "
+                 "Type    Function\n";
+        static const char ruler[] =
+                 "#          |        |          |                       "
+                 " |         |\n";
+        seq_puts(m, titles);
+        seq_puts(m, ruler);
+}
+/*
+ * Print one TRACE_KSYM entry as
+ *   "<comm>-<pid> [<cpu>] <symbol> <R|W|RW> <function>\n"
+ * Returns TRACE_TYPE_UNHANDLED for entries of another type,
+ * TRACE_TYPE_PARTIAL_LINE when a piece cannot be written or the access
+ * type is unknown, and TRACE_TYPE_HANDLED otherwise.
+ */
+static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
+{
+        struct trace_entry *entry = iter->ent;
+        struct trace_seq *seq = &iter->seq;
+        struct ksym_trace_entry *field;
+        char sym[KSYM_SYMBOL_LEN];
+        int ok;
+        if (entry->type != TRACE_KSYM)
+                return TRACE_TYPE_UNHANDLED;
+        trace_assign_type(field, entry);
+        if (!trace_seq_printf(seq, "%11s-%-5d [%03d] %pS", field->cmd,
+                              entry->pid, iter->cpu, (char *)field->addr))
+                return TRACE_TYPE_PARTIAL_LINE;
+        /* Access type column */
+        switch (field->type) {
+        case HW_BREAKPOINT_R:
+                ok = trace_seq_printf(seq, " R  ");
+                break;
+        case HW_BREAKPOINT_W:
+                ok = trace_seq_printf(seq, " W  ");
+                break;
+        case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
+                ok = trace_seq_printf(seq, " RW ");
+                break;
+        default:
+                return TRACE_TYPE_PARTIAL_LINE;
+        }
+        if (!ok)
+                return TRACE_TYPE_PARTIAL_LINE;
+        /* Symbol for the instruction pointer recorded at the hit */
+        sprint_symbol(sym, field->ip);
+        if (!trace_seq_printf(seq, "%s\n", sym))
+                return TRACE_TYPE_PARTIAL_LINE;
+        return TRACE_TYPE_HANDLED;
+}
+/* Tracer descriptor registered by init_ksym_trace() */
+struct tracer ksym_tracer __read_mostly =
+{
+        .name           = "ksym_tracer",
+        .print_header   = ksym_trace_print_header,
+        .print_line     = ksym_trace_output,
+#ifdef CONFIG_FTRACE_SELFTEST
+        .selftest       = trace_selftest_startup_ksym,
+#endif
+        .init           = ksym_trace_init,
+        .reset          = ksym_trace_reset
+};
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+/*
+ * seq_file show callback of the profile file: prints a table header and
+ * then one line per filter entry with its access type, symbol name
+ * ("<NA>" when the lookup fails) and hit counter. Always returns 0.
+ */
+static int ksym_profile_show(struct seq_file *m, void *v)
+{
+        struct trace_ksym *ksym;
+        struct hlist_node *pos;
+        char sym_name[KSYM_NAME_LEN];
+        const char *type_col;
+        int access_type = 0;
+        seq_puts(m, "  Access Type ");
+        seq_puts(m, "  Symbol                                       Counter\n");
+        seq_puts(m, "  ----------- ");
+        seq_puts(m, "  ------                                       -------\n");
+        rcu_read_lock();
+        hlist_for_each_entry_rcu(ksym, pos, &ksym_filter_head, ksym_hlist) {
+                access_type = ksym->attr.bp_type;
+                /* Pick the fixed-width text for the access type column */
+                switch (access_type) {
+                case HW_BREAKPOINT_R:
+                        type_col = "  R           ";
+                        break;
+                case HW_BREAKPOINT_W:
+                        type_col = "  W           ";
+                        break;
+                case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
+                        type_col = "  RW          ";
+                        break;
+                default:
+                        type_col = "  NA          ";
+                        break;
+                }
+                seq_puts(m, type_col);
+                if (lookup_symbol_name(ksym->attr.bp_addr, sym_name) < 0)
+                        seq_printf(m, "  %-36s", "<NA>");
+                else
+                        seq_printf(m, "  %-36s", sym_name);
+                seq_printf(m, " %15llu\n",
+                           (unsigned long long)atomic64_read(&ksym->counter));
+        }
+        rcu_read_unlock();
+        return 0;
+}
+/* open() handler of the profile file: single_open() with ksym_profile_show */
+static int ksym_profile_open(struct inode *node, struct file *file)
+{
+        return single_open(file, ksym_profile_show, NULL);
+}
+/* seq_file (single_open) based file operations of the profile file */
+static const struct file_operations ksym_profile_fops = {
+        .open           = ksym_profile_open,
+        .release        = single_release,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+};
+#endif /* CONFIG_PROFILE_KSYM_TRACER */
+/*
+ * Create the "ksym_trace_filter" file (and "ksym_profile" when
+ * CONFIG_PROFILE_KSYM_TRACER is set) in the tracing directory, then
+ * register the tracer. Returns the result of register_tracer().
+ */
+__init static int init_ksym_trace(void)
+{
+        struct dentry *tracing_dir = tracing_init_dentry();
+        trace_create_file("ksym_trace_filter", 0644, tracing_dir,
+                          NULL, &ksym_tracing_fops);
+#ifdef CONFIG_PROFILE_KSYM_TRACER
+        trace_create_file("ksym_profile", 0444, tracing_dir,
+                          NULL, &ksym_profile_fops);
+#endif
+        return register_tracer(&ksym_tracer);
+}
+device_initcall(init_ksym_trace);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0acd834659ed..017fa376505d 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/mmiotrace.h>
 #include <linux/pci.h>
+#include <linux/slab.h>
 #include <linux/time.h>
 #include <asm/atomic.h>
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b6c12c6a1bcd..8e46b3323cdc 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -23,13 +23,21 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 static int next_event_type = __TRACE_LAST_TYPE + 1;
-void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+int trace_print_seq(struct seq_file *m, struct trace_seq *s)
 {
        int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+        int ret;
+        ret = seq_write(m, s->buffer, len);
-        seq_write(m, s->buffer, len);
+        /*
+         * Only reset this buffer if we successfully wrote to the
+         * seq_file buffer.
+         */
+        if (!ret)
+                trace_seq_init(s);
-        trace_seq_init(s);
+        return ret;
 }
 enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -85,7 +93,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
        va_list ap;
        int ret;
-        if (!len)
+        if (s->full || !len)
                return 0;
        va_start(ap, fmt);
@@ -93,8 +101,10 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
        va_end(ap);
        /* If we can't write it all, don't bother writing anything */
-        if (ret >= len)
+        if (ret >= len) {
+                s->full = 1;
                return 0;
+        }
        s->len += ret;
@@ -119,14 +129,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
        int len = (PAGE_SIZE - 1) - s->len;
        int ret;
-        if (!len)
+        if (s->full || !len)
                return 0;
        ret = vsnprintf(s->buffer + s->len, len, fmt, args);
        /* If we can't write it all, don't bother writing anything */
-        if (ret >= len)
+        if (ret >= len) {
+                s->full = 1;
                return 0;
+        }
        s->len += ret;
@@ -139,14 +151,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
        int len = (PAGE_SIZE - 1) - s->len;
        int ret;
-        if (!len)
+        if (s->full || !len)
                return 0;
        ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
        /* If we can't write it all, don't bother writing anything */
-        if (ret >= len)
+        if (ret >= len) {
+                s->full = 1;
                return 0;
+        }
        s->len += ret;
@@ -167,8 +181,13 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
 {
        int len = strlen(str);
-        if (len > ((PAGE_SIZE - 1) - s->len))
+        if (s->full)
+                return 0;
+        if (len > ((PAGE_SIZE - 1) - s->len)) {
+                s->full = 1;
                return 0;
+        }
        memcpy(s->buffer + s->len, str, len);
        s->len += len;
@@ -178,9 +197,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
 int trace_seq_putc(struct trace_seq *s, unsigned char c)
 {
-        if (s->len >= (PAGE_SIZE - 1))
+        if (s->full)
                return 0;
+        if (s->len >= (PAGE_SIZE - 1)) {
+                s->full = 1;
+                return 0;
+        }
        s->buffer[s->len++] = c;
        return 1;
@@ -188,9 +212,14 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
 int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
 {
-        if (len > ((PAGE_SIZE - 1) - s->len))
+        if (s->full)
                return 0;
+        if (len > ((PAGE_SIZE - 1) - s->len)) {
+                s->full = 1;
+                return 0;
+        }
        memcpy(s->buffer + s->len, mem, len);
        s->len += len;
@@ -203,6 +232,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
        const unsigned char *data = mem;
        int i, j;
+        if (s->full)
+                return 0;
 #ifdef __BIG_ENDIAN
        for (i = 0, j = 0; i < len; i++) {
 #else
@@ -220,8 +252,13 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
 {
         void *ret;
-        if (len > ((PAGE_SIZE - 1) - s->len))
+        /* A sequence already marked full accepts no further reservations. */
+        if (s->full)
+                return NULL;
+        /* Not enough room left: mark the sequence full and fail. */
+        if (len > ((PAGE_SIZE - 1) - s->len)) {
+                s->full = 1;
                 return NULL;
+        }
         ret = s->buffer + s->len;
         s->len += len;
@@ -233,8 +270,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
 {
        unsigned char *p;
-        if (s->len >= (PAGE_SIZE - 1))
+        if (s->full)
+                return 0;
+        if (s->len >= (PAGE_SIZE - 1)) {
+                s->full = 1;
                return 0;
+        }
        p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
        if (!IS_ERR(p)) {
                p = mangle_path(s->buffer + s->len, p, "\n");
@@ -247,6 +290,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
                return 1;
        }
+        s->full = 1;
        return 0;
 }
@@ -373,6 +417,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
        unsigned long vmstart = 0;
        int ret = 1;
+        if (s->full)
+                return 0;
        if (mm) {
                const struct vm_area_struct *vma;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 26185d727676..0271742abb8d 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -28,8 +28,8 @@ static int			wakeup_current_cpu;
 static unsigned                 wakeup_prio = -1;
 static int                      wakeup_rt;
-static raw_spinlock_t wakeup_lock =
+static arch_spinlock_t wakeup_lock =
-        (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+        (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
 static void __wakeup_reset(struct trace_array *tr);
@@ -143,7 +143,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
                goto out;
        local_irq_save(flags);
-        __raw_spin_lock(&wakeup_lock);
+        arch_spin_lock(&wakeup_lock);
        /* We could race with grabbing wakeup_lock */
        if (unlikely(!tracer_enabled || next != wakeup_task))
@@ -169,7 +169,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 out_unlock:
        __wakeup_reset(wakeup_trace);
-        __raw_spin_unlock(&wakeup_lock);
+        arch_spin_unlock(&wakeup_lock);
        local_irq_restore(flags);
 out:
        atomic_dec(&wakeup_trace->data[cpu]->disabled);
@@ -193,9 +193,9 @@ static void wakeup_reset(struct trace_array *tr)
        tracing_reset_online_cpus(tr);
        local_irq_save(flags);
-        __raw_spin_lock(&wakeup_lock);
+        arch_spin_lock(&wakeup_lock);
        __wakeup_reset(tr);
-        __raw_spin_unlock(&wakeup_lock);
+        arch_spin_unlock(&wakeup_lock);
        local_irq_restore(flags);
 }
@@ -225,7 +225,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
                goto out;
        /* interrupts should be off from try_to_wake_up */
-        __raw_spin_lock(&wakeup_lock);
+        arch_spin_lock(&wakeup_lock);
        /* check for races. */
        if (!tracer_enabled || p->prio >= wakeup_prio)
@@ -255,7 +255,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
        trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
 out_locked:
-        __raw_spin_unlock(&wakeup_lock);
+        arch_spin_unlock(&wakeup_lock);
 out:
        atomic_dec(&wakeup_trace->data[cpu]->disabled);
 }
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index d2cdbabb4ead..81003b4d617f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -3,6 +3,7 @@
 #include <linux/stringify.h>
 #include <linux/kthread.h>
 #include <linux/delay.h>
+#include <linux/slab.h>
 static inline int trace_valid_entry(struct trace_entry *entry)
 {
@@ -17,6 +18,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
        case TRACE_GRAPH_ENT:
        case TRACE_GRAPH_RET:
        case TRACE_HW_BRANCHES:
+        case TRACE_KSYM:
                return 1;
        }
        return 0;
@@ -66,7 +68,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
        /* Don't allow flipping of max traces now */
        local_irq_save(flags);
-        __raw_spin_lock(&ftrace_max_lock);
+        arch_spin_lock(&ftrace_max_lock);
        cnt = ring_buffer_entries(tr->buffer);
@@ -84,7 +86,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
                        break;
        }
        tracing_on();
-        __raw_spin_unlock(&ftrace_max_lock);
+        arch_spin_unlock(&ftrace_max_lock);
        local_irq_restore(flags);
        if (count)
@@ -808,3 +810,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
        return ret;
 }
 #endif /* CONFIG_HW_BRANCH_TRACER */
+#ifdef CONFIG_KSYM_TRACER
+static int ksym_selftest_dummy;
+/*
+ * Startup selftest for the ksym tracer.
+ *
+ * Initializes @trace on @tr, registers a read-write request on
+ * ksym_selftest_dummy, touches that variable, and then checks that
+ * trace_test_buffer() counted exactly two entries.
+ *
+ * Returns 0 on success; otherwise the non-zero result of tracer_init(),
+ * a negative result of process_new_ksym_entry(), the result of
+ * trace_test_buffer(), or -1 when the entry count is not two.
+ */
+int
+trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
+{
+        unsigned long nr_entries;
+        int err;
+        /* Bring the tracer up first. */
+        err = tracer_init(trace, tr);
+        if (err) {
+                warn_failed_init_tracer(trace, err);
+                return err;
+        }
+        ksym_selftest_dummy = 0;
+        /* Request tracing of both reads and writes of the dummy variable. */
+        err = process_new_ksym_entry("ksym_selftest_dummy",
+                                     HW_BREAKPOINT_R | HW_BREAKPOINT_W,
+                                     (unsigned long)(&ksym_selftest_dummy));
+        if (err < 0) {
+                /*
+                 * NOTE(review): this path returns without calling
+                 * trace->reset(tr) -- confirm that is intended.
+                 */
+                printk(KERN_CONT "ksym_trace read-write startup test failed\n");
+                return err;
+        }
+        /*
+         * Read the dummy variable and then write it, so that the tracer
+         * has something to record.
+         */
+        if (ksym_selftest_dummy == 0)
+                ksym_selftest_dummy++;
+        /* Freeze tracing while the buffer is inspected. */
+        tracing_stop();
+        err = trace_test_buffer(tr, &nr_entries);
+        trace->reset(tr);
+        tracing_start();
+        /*
+         * One read plus one write were performed above, so exactly two
+         * entries are expected in the trace buffer.
+         */
+        if (err == 0 && nr_entries != 2) {
+                printk(KERN_CONT "Ksym tracer startup test failed");
+                err = -1;
+        }
+        return err;
+}
+#endif /* CONFIG_KSYM_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8504ac71e4e8..f4bc9b27de5f 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -27,8 +27,8 @@ static struct stack_trace max_stack_trace = {
 };
 static unsigned long max_stack_size;
-static raw_spinlock_t max_stack_lock =
+static arch_spinlock_t max_stack_lock =
-        (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+        (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
 static int stack_trace_disabled __read_mostly;
 static DEFINE_PER_CPU(int, trace_active);
@@ -54,7 +54,7 @@ static inline void check_stack(void)
                return;
        local_irq_save(flags);
-        __raw_spin_lock(&max_stack_lock);
+        arch_spin_lock(&max_stack_lock);
        /* a race could have already updated it */
        if (this_size <= max_stack_size)
@@ -103,7 +103,7 @@ static inline void check_stack(void)
        }
 out:
-        __raw_spin_unlock(&max_stack_lock);
+        arch_spin_unlock(&max_stack_lock);
        local_irq_restore(flags);
 }
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
        unsigned long val, flags;
        char buf[64];
        int ret;
+        int cpu;
        if (count >= sizeof(buf))
                return -EINVAL;
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
                return ret;
        local_irq_save(flags);
-        __raw_spin_lock(&max_stack_lock);
+        /*
+         * In case we trace inside arch_spin_lock() or after (NMI),
+         * we will cause circular lock, so we also need to increase
+         * the percpu trace_active here.
+         */
+        cpu = smp_processor_id();
+        per_cpu(trace_active, cpu)++;
+        arch_spin_lock(&max_stack_lock);
        *ptr = val;
-        __raw_spin_unlock(&max_stack_lock);
+        arch_spin_unlock(&max_stack_lock);
+        per_cpu(trace_active, cpu)--;
        local_irq_restore(flags);
        return count;
@@ -206,8 +218,14 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 static void *t_start(struct seq_file *m, loff_t *pos)
 {
+        int cpu;
        local_irq_disable();
-        __raw_spin_lock(&max_stack_lock);
+        cpu = smp_processor_id();
+        per_cpu(trace_active, cpu)++;
+        arch_spin_lock(&max_stack_lock);
        if (*pos == 0)
                return SEQ_START_TOKEN;
@@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 static void t_stop(struct seq_file *m, void *p)
 {
-        __raw_spin_unlock(&max_stack_lock);
+        int cpu;
+        arch_spin_unlock(&max_stack_lock);
+        cpu = smp_processor_id();
+        per_cpu(trace_active, cpu)--;
        local_irq_enable();
 }
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index a4bb239eb987..96cffb269e73 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -10,6 +10,7 @@
 #include <linux/list.h>
+#include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/debugfs.h>
 #include "trace_stat.h"
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 527e17eae575..4d6d711717f2 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,6 @@
 #include <trace/syscall.h>
 #include <trace/events/syscalls.h>
+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/ftrace.h>
 #include <linux/perf_event.h>
@@ -14,6 +15,43 @@ static int sys_refcount_exit;
 static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
 static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
+extern unsigned long __start_syscalls_metadata[];
+extern unsigned long __stop_syscalls_metadata[];
+static struct syscall_metadata **syscalls_metadata;
+/*
+ * Find the syscall metadata record whose name matches the kernel symbol
+ * found at address @syscall.
+ *
+ * Walks the records between __start_syscalls_metadata and
+ * __stop_syscalls_metadata and returns the first one whose name, past its
+ * three-character prefix, equals the looked-up symbol name past its own
+ * three-character prefix. Returns NULL when the address does not resolve
+ * to a usable symbol name or when no record matches.
+ */
+static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
+{
+        struct syscall_metadata *start;
+        struct syscall_metadata *stop;
+        char str[KSYM_SYMBOL_LEN];
+        start = (struct syscall_metadata *)__start_syscalls_metadata;
+        stop = (struct syscall_metadata *)__stop_syscalls_metadata;
+        /*
+         * The comparison below reads str + 3. If the lookup failed or the
+         * symbol is shorter than three characters, that would point past
+         * the terminator into uninitialized stack bytes, so give up early.
+         */
+        if (!kallsyms_lookup(syscall, NULL, NULL, NULL, str) ||
+            strlen(str) < 3)
+                return NULL;
+        for ( ; start < stop; start++) {
+                /*
+                 * Only compare after the "sys" prefix. Archs that use
+                 * syscall wrappers may have syscalls symbols aliases prefixed
+                 * with "SyS" instead of "sys", leading to an unwanted
+                 * mismatch.
+                 */
+                if (start->name && !strcmp(start->name + 3, str + 3))
+                        return start;
+        }
+        return NULL;
+}
+/*
+ * Translate syscall number @nr into its metadata record.
+ *
+ * Returns NULL when the syscalls_metadata table has not been allocated
+ * or when @nr lies outside [0, NR_syscalls); the table slot itself may
+ * also be NULL.
+ */
+static struct syscall_metadata *syscall_nr_to_meta(int nr)
+{
+        if (!syscalls_metadata)
+                return NULL;
+        if (nr < 0 || nr >= NR_syscalls)
+                return NULL;
+        return syscalls_metadata[nr];
+}
 enum print_line_t
 print_syscall_enter(struct trace_iterator *iter, int flags)
 {
@@ -30,7 +68,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
        if (!entry)
                goto end;
-        if (entry->enter_id != ent->type) {
+        if (entry->enter_event->id != ent->type) {
                WARN_ON_ONCE(1);
                goto end;
        }
@@ -85,7 +123,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
                return TRACE_TYPE_HANDLED;
        }
-        if (entry->exit_id != ent->type) {
+        if (entry->exit_event->id != ent->type) {
                WARN_ON_ONCE(1);
                return TRACE_TYPE_UNHANDLED;
        }
@@ -103,92 +141,79 @@ extern char *__bad_type_size(void);
 #define SYSCALL_FIELD(type, name)                                       \
        sizeof(type) != sizeof(trace.name) ?                            \
                __bad_type_size() :                                     \
-                #type, #name, offsetof(typeof(trace), name), sizeof(trace.name)
+                #type, #name, offsetof(typeof(trace), name),            \
+                sizeof(trace.name), is_signed_type(type)
-int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
+static
+int  __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 {
        int i;
-        int nr;
+        int pos = 0;
-        int ret;
-        struct syscall_metadata *entry;
-        struct syscall_trace_enter trace;
-        int offset = offsetof(struct syscall_trace_enter, args);
-        nr = syscall_name_to_nr(call->data);
+        /* When len=0, we just calculate the needed length */
-        entry = syscall_nr_to_meta(nr);
+#define LEN_OR_ZERO (len ? len - pos : 0)
-        if (!entry)
-                return 0;
-        ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
-                               SYSCALL_FIELD(int, nr));
-        if (!ret)
-                return 0;
+        pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
        for (i = 0; i < entry->nb_args; i++) {
-                ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i],
+                pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
-                                        entry->args[i]);
+                                entry->args[i], sizeof(unsigned long),
-                if (!ret)
+                                i == entry->nb_args - 1 ? "" : ", ");
-                        return 0;
-                ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset,
-                                       sizeof(unsigned long));
-                if (!ret)
-                        return 0;
-                offset += sizeof(unsigned long);
        }
+        pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
-        trace_seq_puts(s, "\nprint fmt: \"");
        for (i = 0; i < entry->nb_args; i++) {
-                ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i],
+                pos += snprintf(buf + pos, LEN_OR_ZERO,
-                                        sizeof(unsigned long),
+                                ", ((unsigned long)(REC->%s))", entry->args[i]);
-                                        i == entry->nb_args - 1 ? "" : ", ");
-                if (!ret)
-                        return 0;
        }
-        trace_seq_putc(s, '"');
-        for (i = 0; i < entry->nb_args; i++) {
+#undef LEN_OR_ZERO
-                ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
-                                       entry->args[i]);
-                if (!ret)
-                        return 0;
-        }
-        return trace_seq_putc(s, '\n');
+        /* return the length of print_fmt */
+        return pos;
 }
-int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
+static int set_syscall_print_fmt(struct ftrace_event_call *call)
 {
-        int ret;
+        char *print_fmt;
-        struct syscall_trace_exit trace;
+        int len;
+        struct syscall_metadata *entry = call->data;
-        ret = trace_seq_printf(s,
+        if (entry->enter_event != call) {
-                               "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+                call->print_fmt = "\"0x%lx\", REC->ret";
-                               "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
-                               SYSCALL_FIELD(int, nr),
-                               SYSCALL_FIELD(long, ret));
-        if (!ret)
                return 0;
+        }
+        /* First: called with 0 length to calculate the needed length */
+        len = __set_enter_print_fmt(entry, NULL, 0);
+        print_fmt = kmalloc(len + 1, GFP_KERNEL);
+        if (!print_fmt)
+                return -ENOMEM;
+        /* Second: actually write the @print_fmt */
+        __set_enter_print_fmt(entry, print_fmt, len + 1);
+        call->print_fmt = print_fmt;
+        return 0;
+}
+static void free_syscall_print_fmt(struct ftrace_event_call *call)
+{
+        struct syscall_metadata *entry = call->data;
-        return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n");
+        if (entry->enter_event == call)
+                kfree(call->print_fmt);
 }
 int syscall_enter_define_fields(struct ftrace_event_call *call)
 {
        struct syscall_trace_enter trace;
-        struct syscall_metadata *meta;
+        struct syscall_metadata *meta = call->data;
        int ret;
-        int nr;
        int i;
        int offset = offsetof(typeof(trace), args);
-        nr = syscall_name_to_nr(call->data);
+        ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
-        meta = syscall_nr_to_meta(nr);
-        if (!meta)
-                return 0;
-        ret = trace_define_common_fields(call);
        if (ret)
                return ret;
@@ -208,11 +233,11 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
        struct syscall_trace_exit trace;
        int ret;
-        ret = trace_define_common_fields(call);
+        ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
        if (ret)
                return ret;
-        ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 0,
+        ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
                                 FILTER_OTHER);
        return ret;
@@ -239,8 +264,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
        size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
-        event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id,
+        event = trace_current_buffer_lock_reserve(&buffer,
-                                                  size, 0, 0);
+                        sys_data->enter_event->id, size, 0, 0);
        if (!event)
                return;
@@ -271,8 +296,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
        if (!sys_data)
                return;
-        event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id,
+        event = trace_current_buffer_lock_reserve(&buffer,
-                                sizeof(*entry), 0, 0);
+                        sys_data->exit_event->id, sizeof(*entry), 0, 0);
        if (!event)
                return;
@@ -285,23 +310,18 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
                trace_current_buffer_unlock_commit(buffer, event, 0, 0);
 }
-int reg_event_syscall_enter(void *ptr)
+int reg_event_syscall_enter(struct ftrace_event_call *call)
 {
        int ret = 0;
        int num;
-        char *name;
-        name = (char *)ptr;
+        num = ((struct syscall_metadata *)call->data)->syscall_nr;
-        num = syscall_name_to_nr(name);
        if (num < 0 || num >= NR_syscalls)
                return -ENOSYS;
        mutex_lock(&syscall_trace_lock);
        if (!sys_refcount_enter)
                ret = register_trace_sys_enter(ftrace_syscall_enter);
-        if (ret) {
+        if (!ret) {
-                pr_info("event trace: Could not activate"
-                                "syscall entry trace point");
-        } else {
                set_bit(num, enabled_enter_syscalls);
                sys_refcount_enter++;
        }
@@ -309,13 +329,11 @@ int reg_event_syscall_enter(void *ptr)
        return ret;
 }
-void unreg_event_syscall_enter(void *ptr)
+void unreg_event_syscall_enter(struct ftrace_event_call *call)
 {
        int num;
-        char *name;
-        name = (char *)ptr;
+        num = ((struct syscall_metadata *)call->data)->syscall_nr;
-        num = syscall_name_to_nr(name);
        if (num < 0 || num >= NR_syscalls)
                return;
        mutex_lock(&syscall_trace_lock);
@@ -326,23 +344,18 @@ void unreg_event_syscall_enter(void *ptr)
        mutex_unlock(&syscall_trace_lock);
 }
-int reg_event_syscall_exit(void *ptr)
+int reg_event_syscall_exit(struct ftrace_event_call *call)
 {
        int ret = 0;
        int num;
-        char *name;
-        name = (char *)ptr;
+        num = ((struct syscall_metadata *)call->data)->syscall_nr;
-        num = syscall_name_to_nr(name);
        if (num < 0 || num >= NR_syscalls)
                return -ENOSYS;
        mutex_lock(&syscall_trace_lock);
        if (!sys_refcount_exit)
                ret = register_trace_sys_exit(ftrace_syscall_exit);
-        if (ret) {
+        if (!ret) {
-                pr_info("event trace: Could not activate"
-                                "syscall exit trace point");
-        } else {
                set_bit(num, enabled_exit_syscalls);
                sys_refcount_exit++;
        }
@@ -350,13 +363,11 @@ int reg_event_syscall_exit(void *ptr)
        return ret;
 }
-void unreg_event_syscall_exit(void *ptr)
+void unreg_event_syscall_exit(struct ftrace_event_call *call)
 {
        int num;
-        char *name;
-        name = (char *)ptr;
+        num = ((struct syscall_metadata *)call->data)->syscall_nr;
-        num = syscall_name_to_nr(name);
        if (num < 0 || num >= NR_syscalls)
                return;
        mutex_lock(&syscall_trace_lock);
@@ -367,33 +378,73 @@ void unreg_event_syscall_exit(void *ptr)
        mutex_unlock(&syscall_trace_lock);
 }
-struct trace_event event_syscall_enter = {
+int init_syscall_trace(struct ftrace_event_call *call)
-        .trace                  = print_syscall_enter,
+{
-};
+        int id;
+        if (set_syscall_print_fmt(call) < 0)
+                return -ENOMEM;
-struct trace_event event_syscall_exit = {
+        id = trace_event_raw_init(call);
-        .trace                  = print_syscall_exit,
-};
-#ifdef CONFIG_EVENT_PROFILE
+        if (id < 0) {
+                free_syscall_print_fmt(call);
+                return id;
+        }
-static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
+        return id;
-static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
+}
-static int sys_prof_refcount_enter;
-static int sys_prof_refcount_exit;
+/*
+ * Return the entry of sys_call_table at index @nr as an unsigned long.
+ * @nr is not range-checked here.
+ */
+unsigned long __init arch_syscall_addr(int nr)
+{
+        unsigned long addr = (unsigned long)sys_call_table[nr];
+        return addr;
+}
-static void prof_syscall_enter(struct pt_regs *regs, long id)
+/*
+ * Build the syscall-number -> metadata lookup table.
+ *
+ * Allocates a zeroed array of NR_syscalls pointers, then for every
+ * syscall number resolves its handler address with arch_syscall_addr()
+ * and looks up the matching metadata with find_syscall_meta(). Numbers
+ * with no metadata keep a NULL slot.
+ *
+ * Returns 0 on success or -ENOMEM (after a warning) if the table cannot
+ * be allocated.
+ */
+int __init init_ftrace_syscalls(void)
+{
+        struct syscall_metadata *meta;
+        unsigned long addr;
+        int i;
+        /* kcalloc() zero-fills and checks the count * size multiplication. */
+        syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
+                                    GFP_KERNEL);
+        if (WARN_ON(!syscalls_metadata))
+                return -ENOMEM;
+        for (i = 0; i < NR_syscalls; i++) {
+                addr = arch_syscall_addr(i);
+                meta = find_syscall_meta(addr);
+                if (!meta)
+                        continue;
+                /* Record the number in the metadata and index it by number. */
+                meta->syscall_nr = i;
+                syscalls_metadata[i] = meta;
+        }
+        return 0;
+}
+core_initcall(init_ftrace_syscalls);
+#ifdef CONFIG_PERF_EVENTS
+static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
+static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
+static int sys_perf_refcount_enter;
+static int sys_perf_refcount_exit;
+static void perf_syscall_enter(struct pt_regs *regs, long id)
 {
        struct syscall_metadata *sys_data;
        struct syscall_trace_enter *rec;
        unsigned long flags;
-        char *raw_data;
        int syscall_nr;
+        int rctx;
        int size;
-        int cpu;
        syscall_nr = syscall_get_nr(current, regs);
-        if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
+        if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
                return;
        sys_data = syscall_nr_to_meta(syscall_nr);
@@ -405,91 +456,67 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
        size = ALIGN(size + sizeof(u32), sizeof(u64));
        size -= sizeof(u32);
-        if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
-                      "profile buffer not large enough"))
+                      "perf buffer not large enough"))
                return;
-        /* Protect the per cpu buffer, begin the rcu read side */
+        rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
-        local_irq_save(flags);
+                                sys_data->enter_event->id, &rctx, &flags);
+        if (!rec)
-        cpu = smp_processor_id();
+                return;
-        if (in_nmi())
-                raw_data = rcu_dereference(trace_profile_buf_nmi);
-        else
-                raw_data = rcu_dereference(trace_profile_buf);
-        if (!raw_data)
-                goto end;
-        raw_data = per_cpu_ptr(raw_data, cpu);
-        /* zero the dead bytes from align to not leak stack to user */
-        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-        rec = (struct syscall_trace_enter *) raw_data;
-        tracing_generic_entry_update(&rec->ent, 0, 0);
-        rec->ent.type = sys_data->enter_id;
        rec->nr = syscall_nr;
        syscall_get_arguments(current, regs, 0, sys_data->nb_args,
                               (unsigned long *)&rec->args);
-        perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
+        perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
-end:
-        local_irq_restore(flags);
 }
-int reg_prof_syscall_enter(char *name)
+/*
+ * Enable the perf syscall-entry probe for the syscall described by @call.
+ *
+ * Under syscall_trace_lock, registers perf_syscall_enter() on the
+ * sys_enter tracepoint if no perf user exists yet, then marks the
+ * syscall number in enabled_perf_enter_syscalls and bumps the refcount.
+ * Returns 0 on success or the register_trace_sys_enter() error.
+ */
+int perf_sysenter_enable(struct ftrace_event_call *call)
 {
         int ret = 0;
         int num;
-        num = syscall_name_to_nr(name);
+        num = ((struct syscall_metadata *)call->data)->syscall_nr;
-        if (num < 0 || num >= NR_syscalls)
-                return -ENOSYS;
         mutex_lock(&syscall_trace_lock);
-        if (!sys_prof_refcount_enter)
+        if (!sys_perf_refcount_enter)
-                ret = register_trace_sys_enter(prof_syscall_enter);
+                ret = register_trace_sys_enter(perf_syscall_enter);
         if (ret) {
-                pr_info("event trace: Could not activate"
-                                "syscall entry trace point");
+                pr_info("event trace: Could not activate "
+                                "syscall entry trace point");
         } else {
-                set_bit(num, enabled_prof_enter_syscalls);
+                set_bit(num, enabled_perf_enter_syscalls);
-                sys_prof_refcount_enter++;
+                sys_perf_refcount_enter++;
         }
         mutex_unlock(&syscall_trace_lock);
         return ret;
 }
-void unreg_prof_syscall_enter(char *name)
+void perf_sysenter_disable(struct ftrace_event_call *call)
 {
        int num;
-        num = syscall_name_to_nr(name);
+        num = ((struct syscall_metadata *)call->data)->syscall_nr;
-        if (num < 0 || num >= NR_syscalls)
-                return;
        mutex_lock(&syscall_trace_lock);
-        sys_prof_refcount_enter--;
+        sys_perf_refcount_enter--;
-        clear_bit(num, enabled_prof_enter_syscalls);
+        clear_bit(num, enabled_perf_enter_syscalls);
-        if (!sys_prof_refcount_enter)
+        if (!sys_perf_refcount_enter)
-                unregister_trace_sys_enter(prof_syscall_enter);
+                unregister_trace_sys_enter(perf_syscall_enter);
        mutex_unlock(&syscall_trace_lock);
 }
-static void prof_syscall_exit(struct pt_regs *regs, long ret)
+static void perf_syscall_exit(struct pt_regs *regs, long ret)
 {
        struct syscall_metadata *sys_data;
        struct syscall_trace_exit *rec;
        unsigned long flags;
        int syscall_nr;
-        char *raw_data;
+        int rctx;
        int size;
-        int cpu;
        syscall_nr = syscall_get_nr(current, regs);
-        if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
+        if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
                return;
        sys_data = syscall_nr_to_meta(syscall_nr);
@@ -504,79 +531,55 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
         * Impossible, but be paranoid with the future
         * How to put this check outside runtime?
         */
-        if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
-                "exit event has grown above profile buffer size"))
+                "exit event has grown above perf buffer size"))
                return;
-        /* Protect the per cpu buffer, begin the rcu read side */
+        rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
-        local_irq_save(flags);
+                                sys_data->exit_event->id, &rctx, &flags);
-        cpu = smp_processor_id();
+        if (!rec)
+                return;
-        if (in_nmi())
-                raw_data = rcu_dereference(trace_profile_buf_nmi);
-        else
-                raw_data = rcu_dereference(trace_profile_buf);
-        if (!raw_data)
-                goto end;
-        raw_data = per_cpu_ptr(raw_data, cpu);
-        /* zero the dead bytes from align to not leak stack to user */
-        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-        rec = (struct syscall_trace_exit *)raw_data;
-        tracing_generic_entry_update(&rec->ent, 0, 0);
-        rec->ent.type = sys_data->exit_id;
        rec->nr = syscall_nr;
        rec->ret = syscall_get_return_value(current, regs);
-        perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
+        perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
-end:
-        local_irq_restore(flags);
 }
-int reg_prof_syscall_exit(char *name)
+/*
+ * Start perf reporting of the sys_exit event described by @call.
+ *
+ * The syscall number is taken from the syscall_metadata stored in
+ * call->data.  Under syscall_trace_lock, the first user registers
+ * perf_syscall_exit on the sys_exit tracepoint; on success the syscall's
+ * bit in enabled_perf_exit_syscalls is set and the shared exit refcount is
+ * incremented.
+ *
+ * Returns 0 on success, or the error returned by register_trace_sys_exit()
+ * (in which case neither the bitmap nor the refcount is touched).
+ */
+int perf_sysexit_enable(struct ftrace_event_call *call)
 {
        int ret = 0;
        int num;
-        num = syscall_name_to_nr(name);
+        num = ((struct syscall_metadata *)call->data)->syscall_nr;
-        if (num < 0 || num >= NR_syscalls)
-                return -ENOSYS;
        mutex_lock(&syscall_trace_lock);
-        if (!sys_prof_refcount_exit)
+        if (!sys_perf_refcount_exit)
-                ret = register_trace_sys_exit(prof_syscall_exit);
+                ret = register_trace_sys_exit(perf_syscall_exit);
        if (ret) {
                pr_info("event trace: Could not activate"
-                                "syscall entry trace point");
+                                " syscall exit trace point");
        } else {
-                set_bit(num, enabled_prof_exit_syscalls);
+                set_bit(num, enabled_perf_exit_syscalls);
-                sys_prof_refcount_exit++;
+                sys_perf_refcount_exit++;
        }
        mutex_unlock(&syscall_trace_lock);
        return ret;
 }
-void unreg_prof_syscall_exit(char *name)
+/*
+ * Stop perf reporting of the sys_exit event described by @call.
+ *
+ * The syscall number is taken from the syscall_metadata stored in
+ * call->data.  Under syscall_trace_lock the shared exit refcount is dropped
+ * and the syscall's bit in enabled_perf_exit_syscalls is cleared; once the
+ * refcount reaches zero, perf_syscall_exit is unregistered from the
+ * sys_exit tracepoint.
+ *
+ * NOTE(review): unlike the code being replaced, the syscall number is no
+ * longer range-checked against NR_syscalls here -- this assumes call->data
+ * always carries valid metadata; confirm against the registration path.
+ */
+void perf_sysexit_disable(struct ftrace_event_call *call)
 {
        int num;
-        num = syscall_name_to_nr(name);
+        num = ((struct syscall_metadata *)call->data)->syscall_nr;
-        if (num < 0 || num >= NR_syscalls)
-                return;
        mutex_lock(&syscall_trace_lock);
-        sys_prof_refcount_exit--;
+        sys_perf_refcount_exit--;
-        clear_bit(num, enabled_prof_exit_syscalls);
+        clear_bit(num, enabled_perf_exit_syscalls);
-        if (!sys_prof_refcount_exit)
+        if (!sys_perf_refcount_exit)
-                unregister_trace_sys_exit(prof_syscall_exit);
+                unregister_trace_sys_exit(perf_syscall_exit);
        mutex_unlock(&syscall_trace_lock);
 }
-#endif
+#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index f6693969287d..a7974a552ca9 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -93,6 +93,7 @@ static const struct stacktrace_ops backtrace_ops = {
        .warning_symbol         = backtrace_warning_symbol,
        .stack                  = backtrace_stack,
        .address                = backtrace_address,
+        .walk_stack             = print_context_stack,
 };
 static int
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 40cafb07dffd..cc2d2faa7d9e 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
 #include <trace/events/workqueue.h>
 #include <linux/list.h>
 #include <linux/percpu.h>
+#include <linux/slab.h>
 #include <linux/kref.h>
 #include "trace_stat.h"
 #include "trace.h"
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 00d59d048edf..0a67e041edf8 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -21,6 +21,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/acct.h>
 #include <linux/jiffies.h>
+#include <linux/mm.h>
 /*
 * fill in basic accounting fields
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
new file mode 100644
index 000000000000..eb27fd3430a2
--- /dev/null
+++ b/kernel/user-return-notifier.c
@@ -0,0 +1,44 @@
+#include <linux/user-return-notifier.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
+/*
+ * Request a notification when the current cpu returns to userspace.  Must be
+ * called in atomic context.  The notifier will also be called in atomic
+ * context.
+ *
+ * @urn: notifier to register; it is linked through urn->link at the head of
+ *       this cpu's return_notifier_list.
+ *
+ * TIF_USER_RETURN_NOTIFY is set on the current task before the notifier is
+ * added to the list.
+ */
+void user_return_notifier_register(struct user_return_notifier *urn)
+{
+        set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+        hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list));
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_register);
+/*
+ * Removes a registered user return notifier.  Must be called from atomic
+ * context, and from the same cpu registration occurred in.
+ *
+ * @urn: notifier to remove; it is unlinked through urn->link.
+ *
+ * TIF_USER_RETURN_NOTIFY is cleared on the current task only when this cpu's
+ * return_notifier_list is empty after the removal.
+ */
+void user_return_notifier_unregister(struct user_return_notifier *urn)
+{
+        hlist_del(&urn->link);
+        if (hlist_empty(&__get_cpu_var(return_notifier_list)))
+                clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
+/*
+ * Calls registered user return notifiers.
+ *
+ * Walks this cpu's return_notifier_list and invokes ->on_user_return() on
+ * every entry, passing the notifier itself.  The walk is bracketed by
+ * get_cpu_var()/put_cpu_var() and uses the _safe list iterator (presumably
+ * so that a callback may unregister its own notifier -- confirm with the
+ * users of this API).
+ */
+void fire_user_return_notifiers(void)
+{
+        struct user_return_notifier *urn;
+        struct hlist_node *tmp1, *tmp2;
+        struct hlist_head *head;
+        head = &get_cpu_var(return_notifier_list);
+        hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
+                urn->on_user_return(urn);
+        put_cpu_var(return_notifier_list);
+}
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..766467b3bcb7 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -56,9 +56,6 @@ struct user_struct root_user = {
        .sigpending     = ATOMIC_INIT(0),
        .locked_shm     = 0,
        .user_ns        = &init_user_ns,
-#ifdef CONFIG_USER_SCHED
-        .tg             = &init_task_group,
-#endif
 };
 /*
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up)
        put_user_ns(up->user_ns);
 }
-#ifdef CONFIG_USER_SCHED
-static void sched_destroy_user(struct user_struct *up)
-{
-        sched_destroy_group(up->tg);
-}
-static int sched_create_user(struct user_struct *up)
-{
-        int rc = 0;
-        up->tg = sched_create_group(&root_task_group);
-        if (IS_ERR(up->tg))
-                rc = -ENOMEM;
-        set_tg_uid(up);
-        return rc;
-}
-#else   /* CONFIG_USER_SCHED */
-static void sched_destroy_user(struct user_struct *up) { }
-static int sched_create_user(struct user_struct *up) { return 0; }
-#endif  /* CONFIG_USER_SCHED */
-#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
-static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
-{
-        struct user_struct *user;
-        struct hlist_node *h;
-        hlist_for_each_entry(user, h, hashent, uidhash_node) {
-                if (user->uid == uid) {
-                        /* possibly resurrect an "almost deleted" object */
-                        if (atomic_inc_return(&user->__count) == 1)
-                                cancel_delayed_work(&user->work);
-                        return user;
-                }
-        }
-        return NULL;
-}
-static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
-static DEFINE_MUTEX(uids_mutex);
-static inline void uids_mutex_lock(void)
-{
-        mutex_lock(&uids_mutex);
-}
-static inline void uids_mutex_unlock(void)
-{
-        mutex_unlock(&uids_mutex);
-}
-/* uid directory attributes */
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static ssize_t cpu_shares_show(struct kobject *kobj,
-                               struct kobj_attribute *attr,
-                               char *buf)
-{
-        struct user_struct *up = container_of(kobj, struct user_struct, kobj);
-        return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
-}
-static ssize_t cpu_shares_store(struct kobject *kobj,
-                                struct kobj_attribute *attr,
-                                const char *buf, size_t size)
-{
-        struct user_struct *up = container_of(kobj, struct user_struct, kobj);
-        unsigned long shares;
-        int rc;
-        sscanf(buf, "%lu", &shares);
-        rc = sched_group_set_shares(up->tg, shares);
-        return (rc ? rc : size);
-}
-static struct kobj_attribute cpu_share_attr =
-        __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
-                                   struct kobj_attribute *attr,
-                                   char *buf)
-{
-        struct user_struct *up = container_of(kobj, struct user_struct, kobj);
-        return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
-}
-static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
-                                    struct kobj_attribute *attr,
-                                    const char *buf, size_t size)
-{
-        struct user_struct *up = container_of(kobj, struct user_struct, kobj);
-        unsigned long rt_runtime;
-        int rc;
-        sscanf(buf, "%ld", &rt_runtime);
-        rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
-        return (rc ? rc : size);
-}
-static struct kobj_attribute cpu_rt_runtime_attr =
-        __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
-static ssize_t cpu_rt_period_show(struct kobject *kobj,
-                                   struct kobj_attribute *attr,
-                                   char *buf)
-{
-        struct user_struct *up = container_of(kobj, struct user_struct, kobj);
-        return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
-}
-static ssize_t cpu_rt_period_store(struct kobject *kobj,
-                                    struct kobj_attribute *attr,
-                                    const char *buf, size_t size)
-{
-        struct user_struct *up = container_of(kobj, struct user_struct, kobj);
-        unsigned long rt_period;
-        int rc;
-        sscanf(buf, "%lu", &rt_period);
-        rc = sched_group_set_rt_period(up->tg, rt_period);
-        return (rc ? rc : size);
-}
-static struct kobj_attribute cpu_rt_period_attr =
-        __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
-#endif
-/* default attributes per uid directory */
-static struct attribute *uids_attributes[] = {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-        &cpu_share_attr.attr,
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-        &cpu_rt_runtime_attr.attr,
-        &cpu_rt_period_attr.attr,
-#endif
-        NULL
-};
-/* the lifetime of user_struct is not managed by the core (now) */
-static void uids_release(struct kobject *kobj)
-{
-        return;
-}
-static struct kobj_type uids_ktype = {
-        .sysfs_ops = &kobj_sysfs_ops,
-        .default_attrs = uids_attributes,
-        .release = uids_release,
-};
-/*
- * Create /sys/kernel/uids/<uid>/cpu_share file for this user
- * We do not create this file for users in a user namespace (until
- * sysfs tagging is implemented).
- *
- * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
- */
-static int uids_user_create(struct user_struct *up)
-{
-        struct kobject *kobj = &up->kobj;
-        int error;
-        memset(kobj, 0, sizeof(struct kobject));
-        if (up->user_ns != &init_user_ns)
-                return 0;
-        kobj->kset = uids_kset;
-        error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
-        if (error) {
-                kobject_put(kobj);
-                goto done;
-        }
-        kobject_uevent(kobj, KOBJ_ADD);
-done:
-        return error;
-}
-/* create these entries in sysfs:
- *      "/sys/kernel/uids" directory
- *      "/sys/kernel/uids/0" directory (for root user)
- *      "/sys/kernel/uids/0/cpu_share" file (for root user)
- */
-int __init uids_sysfs_init(void)
-{
-        uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
-        if (!uids_kset)
-                return -ENOMEM;
-        return uids_user_create(&root_user);
-}
-/* delayed work function to remove sysfs directory for a user and free up
- * corresponding structures.
- */
-static void cleanup_user_struct(struct work_struct *w)
-{
-        struct user_struct *up = container_of(w, struct user_struct, work.work);
-        unsigned long flags;
-        int remove_user = 0;
-        /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
-         * atomic.
-         */
-        uids_mutex_lock();
-        spin_lock_irqsave(&uidhash_lock, flags);
-        if (atomic_read(&up->__count) == 0) {
-                uid_hash_remove(up);
-                remove_user = 1;
-        }
-        spin_unlock_irqrestore(&uidhash_lock, flags);
-        if (!remove_user)
-                goto done;
-        if (up->user_ns == &init_user_ns) {
-                kobject_uevent(&up->kobj, KOBJ_REMOVE);
-                kobject_del(&up->kobj);
-                kobject_put(&up->kobj);
-        }
-        sched_destroy_user(up);
-        key_put(up->uid_keyring);
-        key_put(up->session_keyring);
-        kmem_cache_free(uid_cachep, up);
-done:
-        uids_mutex_unlock();
-}
-/* IRQs are disabled and uidhash_lock is held upon function entry.
- * IRQ state (as stored in flags) is restored and uidhash_lock released
- * upon function exit.
- */
-static void free_user(struct user_struct *up, unsigned long flags)
-{
-        INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
-        schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
-        spin_unlock_irqrestore(&uidhash_lock, flags);
-}
-#else   /* CONFIG_USER_SCHED && CONFIG_SYSFS */
 static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
 {
        struct user_struct *user;
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
        return NULL;
 }
-int uids_sysfs_init(void) { return 0; }
-static inline int uids_user_create(struct user_struct *up) { return 0; }
-static inline void uids_mutex_lock(void) { }
-static inline void uids_mutex_unlock(void) { }
 /* IRQs are disabled and uidhash_lock is held upon function entry.
 * IRQ state (as stored in flags) is restored and uidhash_lock released
 * upon function exit.
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
 {
        uid_hash_remove(up);
        spin_unlock_irqrestore(&uidhash_lock, flags);
-        sched_destroy_user(up);
        key_put(up->uid_keyring);
        key_put(up->session_keyring);
        kmem_cache_free(uid_cachep, up);
 }
-#endif
-#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
-/*
- * We need to check if a setuid can take place. This function should be called
- * before successfully completing the setuid.
- */
-int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
-{
-        return sched_rt_can_attach(up->tg, tsk);
-}
-#else
-int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
-{
-        return 1;
-}
-#endif
 /*
 * Locate the user_struct for the passed UID.  If found, take a ref on it.  The
 * caller must undo that ref with free_uid().
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
        /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
         * atomic.
         */
-        uids_mutex_lock();
        spin_lock_irq(&uidhash_lock);
        up = uid_hash_find(uid, hashent);
        spin_unlock_irq(&uidhash_lock);
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
                new->uid = uid;
                atomic_set(&new->__count, 1);
-                if (sched_create_user(new) < 0)
-                        goto out_free_user;
                new->user_ns = get_user_ns(ns);
-                if (uids_user_create(new))
-                        goto out_destoy_sched;
                /*
                 * Before adding this, check whether we raced
                 * on adding the same user already..
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
                spin_unlock_irq(&uidhash_lock);
        }
-        uids_mutex_unlock();
        return up;
-out_destoy_sched:
-        sched_destroy_user(new);
        put_user_ns(new->user_ns);
-out_free_user:
        kmem_cache_free(uid_cachep, new);
 out_unlock:
-        uids_mutex_unlock();
        return NULL;
 }
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 69eae358a726..a2cd77e70d4d 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -57,78 +57,47 @@ static int proc_do_uts_string(ctl_table *table, int write,
 #define proc_do_uts_string NULL
 #endif
-#ifdef CONFIG_SYSCTL_SYSCALL
-/* The generic string strategy routine: */
-static int sysctl_uts_string(ctl_table *table,
-                  void __user *oldval, size_t __user *oldlenp,
-                  void __user *newval, size_t newlen)
-{
-        struct ctl_table uts_table;
-        int r, write;
-        write = newval && newlen;
-        memcpy(&uts_table, table, sizeof(uts_table));
-        uts_table.data = get_uts(table, write);
-        r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen);
-        put_uts(table, write, uts_table.data);
-        return r;
-}
-#else
-#define sysctl_uts_string NULL
-#endif
 static struct ctl_table uts_kern_table[] = {
+        /*
+         * Entries of the "kernel" sysctl directory, each backed by one
+         * field of init_uts_ns.name and served by proc_do_uts_string.
+         * ostype, osrelease and version use mode 0444; hostname and
+         * domainname use mode 0644.
+         */
        {
-                .ctl_name       = KERN_OSTYPE,
                .procname       = "ostype",
                .data           = init_uts_ns.name.sysname,
                .maxlen         = sizeof(init_uts_ns.name.sysname),
                .mode           = 0444,
                .proc_handler   = proc_do_uts_string,
-                .strategy       = sysctl_uts_string,
        },
        {
-                .ctl_name       = KERN_OSRELEASE,
                .procname       = "osrelease",
                .data           = init_uts_ns.name.release,
                .maxlen         = sizeof(init_uts_ns.name.release),
                .mode           = 0444,
                .proc_handler   = proc_do_uts_string,
-                .strategy       = sysctl_uts_string,
        },
        {
-                .ctl_name       = KERN_VERSION,
                .procname       = "version",
                .data           = init_uts_ns.name.version,
                .maxlen         = sizeof(init_uts_ns.name.version),
                .mode           = 0444,
                .proc_handler   = proc_do_uts_string,
-                .strategy       = sysctl_uts_string,
        },
        {
-                .ctl_name       = KERN_NODENAME,
                .procname       = "hostname",
                .data           = init_uts_ns.name.nodename,
                .maxlen         = sizeof(init_uts_ns.name.nodename),
                .mode           = 0644,
                .proc_handler   = proc_do_uts_string,
-                .strategy       = sysctl_uts_string,
        },
        {
-                .ctl_name       = KERN_DOMAINNAME,
                .procname       = "domainname",
                .data           = init_uts_ns.name.domainname,
                .maxlen         = sizeof(init_uts_ns.name.domainname),
                .mode           = 0644,
                .proc_handler   = proc_do_uts_string,
-                .strategy       = sysctl_uts_string,
        },
        {}
 };
 static struct ctl_table uts_root_table[] = {
        {
-                .ctl_name       = CTL_KERN,
                .procname       = "kernel",
                .mode           = 0555,
                .child          = uts_kern_table,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 67e526b6ae81..5bfb213984b2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,116 @@ struct workqueue_struct {
 #endif
 };
+#ifdef CONFIG_DEBUG_OBJECTS_WORK
+static struct debug_obj_descr work_debug_descr;
+/*
+ * fixup_init is called when:
+ * - an active object is initialized
+ *
+ * @addr:  the work_struct being initialized
+ * @state: tracker state the object was found in
+ *
+ * For ODEBUG_STATE_ACTIVE the work is cancelled synchronously and then
+ * registered again with debug_object_init(); 1 is returned.  Any other
+ * state is left alone and 0 is returned.  (Presumably a non-zero return
+ * tells debugobjects the problem was repaired -- confirm against the
+ * debugobjects core.)
+ */
+static int work_fixup_init(void *addr, enum debug_obj_state state)
+{
+        struct work_struct *work = addr;
+        switch (state) {
+        case ODEBUG_STATE_ACTIVE:
+                cancel_work_sync(work);
+                debug_object_init(work, &work_debug_descr);
+                return 1;
+        default:
+                return 0;
+        }
+}
+/*
+ * fixup_activate is called when:
+ * - an active object is activated
+ * - an unknown object is activated (might be a statically initialized object)
+ *
+ * @addr:  the work_struct being activated
+ * @state: tracker state the object was found in
+ *
+ * Always returns 0.  An untracked work_struct carrying WORK_STRUCT_STATIC is
+ * silently registered and activated; an untracked one without that bit, or
+ * one that is already active, only triggers a warning.
+ */
+static int work_fixup_activate(void *addr, enum debug_obj_state state)
+{
+        struct work_struct *work = addr;
+        switch (state) {
+        case ODEBUG_STATE_NOTAVAILABLE:
+                /*
+                 * This is not really a fixup. The work struct was
+                 * statically initialized. We just make sure that it
+                 * is tracked in the object tracker.
+                 */
+                if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) {
+                        debug_object_init(work, &work_debug_descr);
+                        debug_object_activate(work, &work_debug_descr);
+                        return 0;
+                }
+                WARN_ON_ONCE(1);
+                return 0;
+        case ODEBUG_STATE_ACTIVE:
+                WARN_ON(1);     /* fall through: nothing to repair, return 0 */
+        default:
+                return 0;
+        }
+}
+/*
+ * fixup_free is called when:
+ * - an active object is freed
+ *
+ * @addr:  the work_struct being freed
+ * @state: tracker state the object was found in
+ *
+ * For ODEBUG_STATE_ACTIVE the work is cancelled synchronously and then
+ * dropped from the tracker with debug_object_free(); 1 is returned.  Any
+ * other state is left alone and 0 is returned.
+ */
+static int work_fixup_free(void *addr, enum debug_obj_state state)
+{
+        struct work_struct *work = addr;
+        switch (state) {
+        case ODEBUG_STATE_ACTIVE:
+                cancel_work_sync(work);
+                debug_object_free(work, &work_debug_descr);
+                return 1;
+        default:
+                return 0;
+        }
+}
+/*
+ * debugobjects descriptor for work_struct: names the object type and wires
+ * in the work_fixup_init/work_fixup_activate/work_fixup_free callbacks.
+ */
+static struct debug_obj_descr work_debug_descr = {
+        .name           = "work_struct",
+        .fixup_init     = work_fixup_init,
+        .fixup_activate = work_fixup_activate,
+        .fixup_free     = work_fixup_free,
+};
+/* Report to debugobjects that @work is being activated (i.e. queued). */
+static inline void debug_work_activate(struct work_struct *work)
+{
+        debug_object_activate(work, &work_debug_descr);
+}
+/* Report to debugobjects that @work is no longer active (no longer queued). */
+static inline void debug_work_deactivate(struct work_struct *work)
+{
+        debug_object_deactivate(work, &work_debug_descr);
+}
+/*
+ * Register @work with the debugobjects tracker.
+ *
+ * @work:    work_struct being initialized
+ * @onstack: non-zero selects debug_object_init_on_stack(), zero selects
+ *           debug_object_init()
+ */
+void __init_work(struct work_struct *work, int onstack)
+{
+        if (onstack)
+                debug_object_init_on_stack(work, &work_debug_descr);
+        else
+                debug_object_init(work, &work_debug_descr);
+}
+EXPORT_SYMBOL_GPL(__init_work);
+/*
+ * Drop @work from the debugobjects tracker via debug_object_free().  The
+ * callers in this file invoke it on an on-stack barrier work after waiting
+ * for its completion.
+ */
+void destroy_work_on_stack(struct work_struct *work)
+{
+        debug_object_free(work, &work_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_work_on_stack);
+#else
+/* No-op stand-ins used when CONFIG_DEBUG_OBJECTS_WORK is not set. */
+static inline void debug_work_activate(struct work_struct *work) { }
+static inline void debug_work_deactivate(struct work_struct *work) { }
+#endif
 /* Serializes the accesses to the list of workqueues. */
 static DEFINE_SPINLOCK(workqueue_lock);
 static LIST_HEAD(workqueues);
@@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
 {
        unsigned long flags;
+        debug_work_activate(work);
        spin_lock_irqsave(&cwq->lock, flags);
        insert_work(cwq, work, &cwq->worklist);
        spin_unlock_irqrestore(&cwq->lock, flags);
@@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
                struct lockdep_map lockdep_map = work->lockdep_map;
 #endif
                trace_workqueue_execution(cwq->thread, work);
+                debug_work_deactivate(work);
                cwq->current_work = work;
                list_del_init(cwq->worklist.next);
                spin_unlock_irq(&cwq->lock);
@@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work)
 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
                        struct wq_barrier *barr, struct list_head *head)
 {
-        INIT_WORK(&barr->work, wq_barrier_func);
+        /*
+         * Set up @barr as a barrier work running wq_barrier_func, mark
+         * it pending, initialize its completion and insert it into @cwq
+         * at @head.  The work is initialized with INIT_WORK_ON_STACK;
+         * the waiters in this file call destroy_work_on_stack() on it
+         * after the completion fires.
+         *
+         * debugobject calls are safe here even with cwq->lock locked
+         * as we know for sure that this will not trigger any of the
+         * checks and call back into the fixup functions where we
+         * might deadlock.
+         */
+        INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
        __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
        init_completion(&barr->done);
+        debug_work_activate(&barr->work);
        insert_work(cwq, &barr->work, head);
 }
@@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
        }
        spin_unlock_irq(&cwq->lock);
-        if (active)
+        if (active) {
                wait_for_completion(&barr.done);
+                destroy_work_on_stack(&barr.work);
+        }
        return active;
 }
@@ -451,6 +572,7 @@ out:
                return 0;
        wait_for_completion(&barr.done);
+        destroy_work_on_stack(&barr.work);
        return 1;
 }
 EXPORT_SYMBOL_GPL(flush_work);
@@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work)
                 */
                smp_rmb();
                if (cwq == get_wq_data(work)) {
+                        debug_work_deactivate(work);
                        list_del_init(&work->entry);
                        ret = 1;
                }
@@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
        }
        spin_unlock_irq(&cwq->lock);
-        if (unlikely(running))
+        if (unlikely(running)) {
                wait_for_completion(&barr.done);
+                destroy_work_on_stack(&barr.work);
+        }
 }
 static void wait_on_work(struct work_struct *work)
@@ -649,7 +774,7 @@ void flush_delayed_work(struct delayed_work *dwork)
 {
        if (del_timer_sync(&dwork->timer)) {
                struct cpu_workqueue_struct *cwq;
-                cwq = wq_per_cpu(keventd_wq, get_cpu());
+                cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu());
                __queue_work(cwq, &dwork->work);
                put_cpu();
        }